citrus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,904 @@
1
+ # Citrus is a compact and powerful parsing library for Ruby that combines the
2
+ # elegance and expressiveness of the language with the simplicity and power of
3
+ # parsing expression grammars.
4
+ #
5
+ # http://github.com/mjijackson/citrus
6
+ module Citrus
7
+ VERSION = [1, 0, 0]
8
+
9
+ Infinity = 1.0 / 0
10
+
11
+ autoload 'PEG', 'citrus/peg'
12
+
13
+ # Returns the current version of Citrus as a string.
14
+ def self.version
15
+ VERSION.join('.')
16
+ end
17
+
18
+ # Loads the grammar from the given +file+ into the global scope using #eval.
19
+ def self.load(file)
20
+ file << '.citrus' unless File.file?(file)
21
+ raise "Cannot find file #{file}" unless File.file?(file)
22
+ raise "Cannot read file #{file}" unless File.readable?(file)
23
+ self.eval(File.read(file))
24
+ end
25
+
26
+ # Evaluates the given Citrus parsing expression grammar +code+ in the global
27
+ # scope. Returns an array of any grammar modules that were created.
28
+ def self.eval(code)
29
+ file = PEG.parse(code)
30
+ file.value
31
+ end
32
+
33
+ # This error is raised whenever a parse fails.
34
+ class ParseError < Exception
35
+ def initialize(input)
36
+ @input = input
37
+ c = consumed
38
+ s = [0, c.length - 40].max
39
+ msg = "Failed to parse input at offset %d" % max_offset
40
+ msg += ", just after %s" % c[s, c.length].inspect + "\n"
41
+ super(msg)
42
+ end
43
+
44
+ # The Input object that was used for the parse.
45
+ attr_reader :input
46
+
47
+ # Returns the maximum offset that was reached before the error occurred.
48
+ def max_offset
49
+ input.max_offset
50
+ end
51
+
52
+ # Returns the portion of the input string that was successfully consumed
53
+ # before the parse failed.
54
+ def consumed
55
+ input[0, max_offset]
56
+ end
57
+ end
58
+
59
+ # Inclusion of this module into another extends the receiver with the grammar
60
+ # helper methods in GrammarMethods. Although this module does not actually
61
+ # provide any methods, constants, or variables to modules that include it, the
62
+ # mere act of inclusion provides a useful lookup mechanism to determine if a
63
+ # module is in fact a grammar.
64
+ module Grammar
65
+ # Creates a new anonymous module that includes Grammar. If a +block+ is
66
+ # provided, it will be called with the new module as its first argument if
67
+ # its +arity+ is 1 or +instance_eval+'d in the context of the new module
68
+ # otherwise. See http://blog.grayproductions.net/articles/dsl_block_styles
69
+ # for the rationale behind this decision.
70
+ #
71
+ # Grammars created with this method may be assigned a name by being assigned
72
+ # to some constant, e.g.:
73
+ #
74
+ # Calc = Grammar.new {}
75
+ #
76
+ def self.new(&block)
77
+ mod = Module.new { include Grammar }
78
+ block.arity == 1 ? block[mod] : mod.instance_eval(&block) if block
79
+ mod
80
+ end
81
+
82
+ # Extends all modules that +include Grammar+ with GrammarMethods and
83
+ # exposes Module#include.
84
+ def self.included(mod)
85
+ mod.extend(GrammarMethods)
86
+ class << mod; public :include end
87
+ end
88
+ end
89
+
90
+ # Contains methods that are available to Grammar modules at the class level.
91
+ module GrammarMethods
92
+ # Returns the name of this grammar as a string.
93
+ def name
94
+ super.to_s
95
+ end
96
+
97
+ # Returns an array of all grammars that have been included in this grammar
98
+ # in the reverse order they were included.
99
+ def included_grammars
100
+ included_modules.select {|mod| mod.include?(Grammar) }
101
+ end
102
+
103
+ # Returns an array of all names of rules in this grammar as symbols ordered
104
+ # in the same way they were defined (i.e. rules that were defined later
105
+ # appear later in the array).
106
+ def rule_names
107
+ @rule_names ||= []
108
+ end
109
+
110
+ # Returns a hash of all Rule objects in this grammar, keyed by rule name.
111
+ def rules
112
+ @rules ||= {}
113
+ end
114
+
115
+ # Returns +true+ if this grammar has a rule with the given +name+.
116
+ def has_rule?(name)
117
+ rules.key?(name.to_sym)
118
+ end
119
+
120
+ # Loops through the rule tree for the given +rule+ looking for any Super
121
+ # rules. When it finds one, it sets that rule's rule name to the given
122
+ # +name+.
123
+ def setup_super(rule, name) # :nodoc:
124
+ if Nonterminal === rule
125
+ rule.rules.each {|r| setup_super(r, name) }
126
+ elsif Super === rule
127
+ rule.rule_name = name
128
+ end
129
+ end
130
+ private :setup_super
131
+
132
+ # Searches the inheritance hierarchy of this grammar for a rule named +name+
133
+ # and returns it on success. Returns +nil+ on failure.
134
+ def super_rule(name)
135
+ sym = name.to_sym
136
+ included_grammars.each do |g|
137
+ r = g.rule(sym)
138
+ return r if r
139
+ end
140
+ nil
141
+ end
142
+
143
+ # Gets/sets the rule with the given +name+. If +obj+ is given the rule
144
+ # will be set to the value of +obj+ passed through Rule#create. If a block
145
+ # is given, its return value will be used for the value of +obj+.
146
+ #
147
+ # It is important to note that this method will also check any included
148
+ # grammars for a rule with the given +name+ if one cannot be found in this
149
+ # grammar.
150
+ def rule(name, obj=nil)
151
+ sym = name.to_sym
152
+
153
+ obj = Proc.new.call if block_given?
154
+
155
+ if obj
156
+ rule_names << sym unless has_rule?(sym)
157
+
158
+ rule = Rule.create(obj)
159
+ rule.name = name
160
+ setup_super(rule, name)
161
+ rule.grammar = self
162
+
163
+ rules[sym] = rule
164
+ end
165
+
166
+ rules[sym] || super_rule(sym)
167
+ rescue => e
168
+ raise "Cannot create rule \"#{name}\": " + e.message
169
+ end
170
+
171
+ # Gets/sets the +name+ of the root rule of this grammar.
172
+ def root(name=nil)
173
+ @root = name.to_sym if name
174
+ # The first rule in a grammar is the default root.
175
+ @root || rule_names.first
176
+ end
177
+
178
+ # Creates a new Super for the rule currently being defined in the grammar. A
179
+ # block may be provided to specify semantic behavior (via #ext).
180
+ def sup(&block)
181
+ ext(Super.new, block)
182
+ end
183
+
184
+ # Creates a new AndPredicate using the given +rule+. A block may be provided
185
+ # to specify semantic behavior (via #ext).
186
+ def andp(rule, &block)
187
+ ext(AndPredicate.new(rule), block)
188
+ end
189
+
190
+ # Creates a new NotPredicate using the given +rule+. A block may be provided
191
+ # to specify semantic behavior (via #ext).
192
+ def notp(rule, &block)
193
+ ext(NotPredicate.new(rule), block)
194
+ end
195
+
196
+ # Creates a new Label using the given +rule+ and +label+. A block may be
197
+ # provided to specify semantic behavior (via #ext).
198
+ def label(rule, label, &block)
199
+ ext(Label.new(label, rule), block)
200
+ end
201
+
202
+ # Creates a new Repeat using the given +rule+. +min+ and +max+ specify the
203
+ # minimum and maximum number of times the rule must match. A block may be
204
+ # provided to specify semantic behavior (via #ext).
205
+ def rep(rule, min=1, max=Infinity, &block)
206
+ ext(Repeat.new(min, max, rule), block)
207
+ end
208
+
209
+ # An alias for #rep.
210
+ def one_or_more(rule, &block)
211
+ rep(rule, &block)
212
+ end
213
+
214
+ # An alias for #rep with a minimum of 0.
215
+ def zero_or_more(rule, &block)
216
+ rep(rule, 0, &block)
217
+ end
218
+
219
+ # An alias for #rep with a minimum of 0 and a maximum of 1.
220
+ def zero_or_one(rule, &block)
221
+ rep(rule, 0, 1, &block)
222
+ end
223
+
224
+ # Creates a new Sequence using all arguments. A block may be provided to
225
+ # specify semantic behavior (via #ext).
226
+ def all(*args, &block)
227
+ ext(Sequence.new(args), block)
228
+ end
229
+
230
+ # Creates a new Choice using all arguments. A block may be provided to
231
+ # specify semantic behavior (via #ext).
232
+ def any(*args, &block)
233
+ ext(Choice.new(args), block)
234
+ end
235
+
236
+ # Specifies a Module that will be used to extend all matches created with
237
+ # the given +rule+. A block may also be given that will be used to create
238
+ # an anonymous module. See Rule#ext=.
239
+ def ext(rule, mod=nil)
240
+ rule = Rule.create(rule)
241
+ mod = Proc.new if block_given?
242
+ rule.ext = mod if mod
243
+ rule
244
+ end
245
+
246
+ # Parses the given +string+ from the given +offset+ using the rules in this
247
+ # grammar. A ParseError is raised if there is no match made or if
248
+ # +consume_all+ is +true+ and the entire input string cannot be consumed.
249
+ def parse(string, offset=0, enable_memo=false, consume_all=true)
250
+ raise "No root rule specified" unless root
251
+
252
+ root_rule = rule(root)
253
+ raise "No rule named \"#{root}\"" unless root_rule
254
+
255
+ input = Input.new(string, enable_memo)
256
+ match = input.match(root_rule, offset)
257
+
258
+ if !match || (consume_all && match.length != string.length)
259
+ raise ParseError.new(input)
260
+ end
261
+
262
+ match
263
+ end
264
+ end
265
+
266
+ # This class represents the core of the parsing algorithm. It wraps the input
267
+ # string and serves matches to all nonterminals.
268
+ class Input
269
+ # Takes the input +string+ that is to be parsed. If +enable_memo+ is +true+
270
+ # a cache is created that holds references to already generated matches.
271
+ def initialize(string, enable_memo=false)
272
+ @string = string
273
+ @max_offset = 0
274
+ if enable_memo
275
+ @cache = {}
276
+ @cache_hits = 0
277
+ end
278
+ end
279
+
280
+ # The input string.
281
+ attr_reader :string
282
+
283
+ # The maximum offset that has been achieved.
284
+ attr_reader :max_offset
285
+
286
+ # A two-level hash of rule id's and offsets to their respective matches.
287
+ # Only present if memoing is enabled.
288
+ attr_reader :cache
289
+
290
+ # The number of times the cache was hit. Only present if memoing is enabled.
291
+ attr_reader :cache_hits
292
+
293
+ # Sends all arguments to this input's +string+.
294
+ def [](*args)
295
+ @string.__send__(:[], *args)
296
+ end
297
+
298
+ # Returns the length of this input.
299
+ def length
300
+ @string.length
301
+ end
302
+
303
+ # Returns the match for a given +rule+ at +offset+. If memoing is enabled
304
+ # and a match does not already exist for the given rule/offset pair then
305
+ # the rule is executed and the result is cached before returning. See
306
+ # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information
307
+ # on memoing match results (also known as packrat parsing).
308
+ def match(rule, offset=0)
309
+ @max_offset = offset if offset > @max_offset
310
+
311
+ if @cache
312
+ c = @cache[rule.id] ||= {}
313
+
314
+ if c.key?(offset)
315
+ @cache_hits += 1
316
+ c[offset]
317
+ else
318
+ c[offset] = rule.match(self, offset)
319
+ end
320
+ else
321
+ rule.match(self, offset)
322
+ end
323
+ end
324
+ end
325
+
326
+ # A Rule is an object that is used by a grammar to create matches on the
327
+ # Input during parsing.
328
+ module Rule
329
+ # Returns a new Rule object depending on the type of object given.
330
+ def self.create(obj)
331
+ case obj
332
+ when Rule then obj
333
+ when Symbol then Alias.new(obj)
334
+ when String then FixedWidth.new(obj)
335
+ when Regexp then Expression.new(obj)
336
+ when Array then Sequence.new(obj)
337
+ when Range then Choice.new(obj.to_a)
338
+ when Numeric then FixedWidth.new(obj.to_s)
339
+ else
340
+ raise ArgumentError, "Invalid rule object: #{obj.inspect}"
341
+ end
342
+ end
343
+
344
+ @uniq_id = 0
345
+
346
+ # Generates a new rule id.
347
+ def self.new_id
348
+ @uniq_id += 1
349
+ end
350
+
351
+ # The grammar this rule belongs to.
352
+ attr_accessor :grammar
353
+
354
+ # An integer id that is unique to this rule.
355
+ def id
356
+ @id ||= Rule.new_id
357
+ end
358
+
359
+ # Sets the name of this rule.
360
+ def name=(name)
361
+ @name = name.to_sym
362
+ end
363
+
364
+ # The name of this rule.
365
+ attr_reader :name
366
+
367
+ # Specifies a module that will be used to extend all Match objects that
368
+ # result from this rule. If +mod+ is a Proc, it is used to create an
369
+ # anonymous module.
370
+ def ext=(mod)
371
+ mod = Module.new(&mod) if Proc === mod
372
+ @ext = mod
373
+ end
374
+
375
+ # The module this rule uses to extend new matches.
376
+ attr_reader :ext
377
+
378
+ # Returns +true+ if this rule is a Terminal.
379
+ def terminal?
380
+ is_a?(Terminal)
381
+ end
382
+
383
+ # Returns +true+ if this rule needs to be surrounded by parentheses when
384
+ # using #embed.
385
+ def paren?
386
+ false
387
+ end
388
+
389
+ # Returns a string version of this rule that is suitable to be used in the
390
+ # string representation of another rule.
391
+ def embed
392
+ name ? name.to_s : (paren? ? '(%s)' % to_s : to_s)
393
+ end
394
+
395
+ def inspect # :nodoc:
396
+ to_s
397
+ end
398
+
399
+ private
400
+
401
+ def extend_match(match)
402
+ match.extend(ext) if ext
403
+ end
404
+
405
+ def create_match(data, offset)
406
+ match = Match.new(data, offset)
407
+ extend_match(match)
408
+ match.name = name
409
+ match
410
+ end
411
+ end
412
+
413
+ # A Proxy is a Rule that is a placeholder for another rule. It stores the
414
+ # name of some other rule in the grammar internally and resolves it to the
415
+ # actual Rule object at runtime. This lazy evaluation permits us to create
416
+ # Proxy objects for rules that we may not know the definition of yet.
417
+ module Proxy
418
+ include Rule
419
+
420
+ def initialize(name='<proxy>')
421
+ self.rule_name = name
422
+ end
423
+
424
+ # Sets the name of the rule this rule is proxy for.
425
+ def rule_name=(name)
426
+ @rule_name = name.to_sym
427
+ end
428
+
429
+ # The name of this proxy's rule.
430
+ attr_reader :rule_name
431
+
432
+ # Returns the underlying Rule for this proxy.
433
+ def rule
434
+ @rule ||= resolve!
435
+ end
436
+
437
+ # Returns the Match for this proxy's #rule on +input+ at the given +offset+,
438
+ # +nil+ if no match can be made.
439
+ def match(input, offset=0)
440
+ m = input.match(rule, offset)
441
+ if m
442
+ extend_match(m)
443
+ # If this Proxy has a name then it should rename all of its matches.
444
+ m.name = name if name
445
+ m
446
+ end
447
+ end
448
+ end
449
+
450
+ # An Alias is a Proxy for a rule in the same grammar. It is used in rule
451
+ # definitions when a rule calls some other rule by name. The PEG notation is
452
+ # simply the name of another rule without any other punctuation, e.g.:
453
+ #
454
+ # name
455
+ #
456
+ class Alias
457
+ include Proxy
458
+
459
+ # Returns the PEG notation of this rule as a string.
460
+ def to_s
461
+ rule_name.to_s
462
+ end
463
+
464
+ private
465
+
466
+ # Searches this proxy's grammar and any included grammars for a rule with
467
+ # this proxy's #rule_name. Raises an error if one cannot be found.
468
+ def resolve!
469
+ rule = grammar.rule(rule_name)
470
+ raise RuntimeError, 'No rule named "%s" in grammar %s' %
471
+ [rule_name, grammar.name] unless rule
472
+ rule
473
+ end
474
+ end
475
+
476
+ # A Super is a Proxy for a rule of the same name that was defined previously
477
+ # in the grammar's inheritance chain. Thus, Super's work like Ruby's +super+,
478
+ # only for rules in a grammar instead of methods in a module. The PEG notation
479
+ # is the word +super+ without any other punctuation, e.g.:
480
+ #
481
+ # super
482
+ #
483
+ class Super
484
+ include Proxy
485
+
486
+ # Returns the PEG notation of this rule as a string.
487
+ def to_s
488
+ 'super'
489
+ end
490
+
491
+ private
492
+
493
+ # Searches this proxy's included grammars for a rule with this proxy's
494
+ # #rule_name. Raises an error if one cannot be found.
495
+ def resolve!
496
+ rule = grammar.super_rule(rule_name)
497
+ raise RuntimeError, 'No rule named "%s" in hierarchy of grammar %s' %
498
+ [rule_name, grammar.name] unless rule
499
+ rule
500
+ end
501
+ end
502
+
503
+ # A Terminal is a Rule that matches directly on the input stream and may not
504
+ # contain any other rule.
505
+ module Terminal
506
+ include Rule
507
+
508
+ def initialize(rule)
509
+ @rule = rule
510
+ end
511
+
512
+ # The actual String or Regexp object this rule uses to match.
513
+ attr_reader :rule
514
+
515
+ # Returns the PEG notation of this rule as a string.
516
+ def to_s
517
+ rule.inspect
518
+ end
519
+ end
520
+
521
+ # A FixedWidth is a Terminal that matches based on its length. The PEG
522
+ # notation is any sequence of characters enclosed in either single or double
523
+ # quotes, e.g.:
524
+ #
525
+ # 'expr'
526
+ # "expr"
527
+ #
528
+ class FixedWidth
529
+ include Terminal
530
+
531
+ def initialize(rule='')
532
+ raise ArgumentError, "FixedWidth must be a String" unless String === rule
533
+ super
534
+ end
535
+
536
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
537
+ # no match can be made.
538
+ def match(input, offset=0)
539
+ create_match(rule.dup, offset) if rule == input[offset, rule.length]
540
+ end
541
+ end
542
+
543
+ # An Expression is a Terminal that has the same semantics as a regular
544
+ # expression in Ruby. The expression must match at the beginning of the input
545
+ # (index 0). The PEG notation is identical to Ruby's regular expression
546
+ # notation, e.g.:
547
+ #
548
+ # /expr/
549
+ #
550
+ # Character classes and the dot symbol may also be used in PEG notation for
551
+ # compatibility with other PEG implementations, e.g.:
552
+ #
553
+ # [a-zA-Z]
554
+ # .
555
+ #
556
+ class Expression
557
+ include Terminal
558
+
559
+ def initialize(rule=/^/)
560
+ raise ArgumentError, "Expression must be a Regexp" unless Regexp === rule
561
+ super
562
+ end
563
+
564
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
565
+ # no match can be made.
566
+ def match(input, offset=0)
567
+ result = input[offset, input.length - offset].match(rule)
568
+ create_match(result, offset) if result && result.begin(0) == 0
569
+ end
570
+ end
571
+
572
+ # A Nonterminal is a Rule that augments the matching behavior of one or more
573
+ # other rules. Nonterminals may not match directly on the input, but instead
574
+ # invoke the rule(s) they contain to determine if a match can be made from
575
+ # the collective result.
576
+ module Nonterminal
577
+ include Rule
578
+
579
+ def initialize(rules=[])
580
+ @rules = rules.map {|r| Rule.create(r) }
581
+ end
582
+
583
+ # An array of the actual Rule objects this rule uses to match.
584
+ attr_reader :rules
585
+
586
+ def grammar=(grammar)
587
+ @rules.each {|r| r.grammar = grammar }
588
+ super
589
+ end
590
+ end
591
+
592
+ # A Predicate is a Nonterminal that contains one other rule.
593
+ module Predicate
594
+ include Nonterminal
595
+
596
+ def initialize(rule='')
597
+ super([ rule ])
598
+ end
599
+
600
+ # Returns the Rule object this rule uses to match.
601
+ def rule
602
+ rules[0]
603
+ end
604
+ end
605
+
606
+ # An AndPredicate is a Predicate that contains a rule that must match. Upon
607
+ # success an empty match is returned and no input is consumed. The PEG
608
+ # notation is any expression preceeded by an ampersand, e.g.:
609
+ #
610
+ # &expr
611
+ #
612
+ class AndPredicate
613
+ include Predicate
614
+
615
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
616
+ # no match can be made.
617
+ def match(input, offset=0)
618
+ create_match('', offset) if input.match(rule, offset)
619
+ end
620
+
621
+ # Returns the PEG notation of this rule as a string.
622
+ def to_s
623
+ '&' + rule.embed
624
+ end
625
+ end
626
+
627
+ # A NotPredicate is a Predicate that contains a rule that must not match. Upon
628
+ # success an empty match is returned and no input is consumed. The PEG
629
+ # notation is any expression preceeded by an exclamation mark, e.g.:
630
+ #
631
+ # !expr
632
+ #
633
+ class NotPredicate
634
+ include Predicate
635
+
636
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
637
+ # no match can be made.
638
+ def match(input, offset=0)
639
+ create_match('', offset) unless input.match(rule, offset)
640
+ end
641
+
642
+ # Returns the PEG notation of this rule as a string.
643
+ def to_s
644
+ '!' + rule.embed
645
+ end
646
+ end
647
+
648
+ # A Label is a Predicate that applies a new name to any matches made by its
649
+ # rule. The PEG notation is any sequence of word characters (i.e.
650
+ # <tt>[a-zA-Z0-9_]</tt>) followed by a colon, followed by any other
651
+ # expression, e.g.:
652
+ #
653
+ # label:expr
654
+ #
655
+ class Label
656
+ include Predicate
657
+
658
+ def initialize(label='<label>', rule='')
659
+ @label = label.to_sym
660
+ super(rule)
661
+ end
662
+
663
+ # The symbol this rule uses to re-name all its matches.
664
+ attr_reader :label
665
+
666
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
667
+ # no match can be made. When a Label makes a match, it re-names the match to
668
+ # the value of its label.
669
+ def match(input, offset=0)
670
+ m = rule.match(input, offset)
671
+ if m
672
+ extend_match(m)
673
+ m.name = label
674
+ m
675
+ end
676
+ end
677
+
678
+ # Returns the PEG notation of this rule as a string.
679
+ def to_s
680
+ label.to_s + ':' + rule.embed
681
+ end
682
+ end
683
+
684
+ # A Repeat is a Predicate that specifies a minimum and maximum number of times
685
+ # its rule must match. The PEG notation is an integer, +N+, followed by an
686
+ # asterisk, followed by another integer, +M+, all of which follow any other
687
+ # expression, e.g.:
688
+ #
689
+ # expr N*M
690
+ #
691
+ # In this notation +N+ specifies the minimum number of times the preceeding
692
+ # expression must match and +M+ specifies the maximum. If +N+ is ommitted,
693
+ # it is assumed to be 0. Likewise, if +M+ is omitted, it is assumed to be
694
+ # infinity (no maximum). Thus, an expression followed by only an asterisk may
695
+ # match any number of times, including zero.
696
+ #
697
+ # The shorthand notation <tt>+</tt> and <tt>?</tt> may be used for the common
698
+ # cases of <tt>1*</tt> and <tt>*1</tt> respectively, e.g.:
699
+ #
700
+ # expr+
701
+ # expr?
702
+ #
703
+ class Repeat
704
+ include Predicate
705
+
706
+ def initialize(min=1, max=Infinity, rule='')
707
+ raise ArgumentError, "Min cannot be greater than max" if min > max
708
+ @range = Range.new(min, max)
709
+ super(rule)
710
+ end
711
+
712
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
713
+ # no match can be made.
714
+ def match(input, offset=0)
715
+ matches = []
716
+ os = offset
717
+ while matches.length < @range.end
718
+ m = input.match(rule, os)
719
+ break unless m
720
+ matches << m
721
+ os += m.length
722
+ end
723
+ create_match(matches, offset) if @range.include?(matches.length)
724
+ end
725
+
726
+ # Returns the operator this rule uses as a string. Will be one of
727
+ # <tt>+</tt>, <tt>?</tt>, or <tt>N*M</tt>.
728
+ def operator
729
+ unless @operator
730
+ m = [@range.begin, @range.end].map do |n|
731
+ n == 0 || n == Infinity ? '' : n.to_s
732
+ end
733
+ @operator = case m
734
+ when ['', '1'] then '?'
735
+ when ['1', ''] then '+'
736
+ else m.join('*')
737
+ end
738
+ end
739
+ @operator
740
+ end
741
+
742
+ # Returns the PEG notation of this rule as a string.
743
+ def to_s
744
+ rule.embed + operator
745
+ end
746
+ end
747
+
748
+ # A List is a Nonterminal that contains any number of other rules and tests
749
+ # them for matches in sequential order.
750
+ module List
751
+ include Nonterminal
752
+
753
+ def paren?
754
+ rules.length > 1
755
+ end
756
+ end
757
+
758
+ # A Choice is a List where only one rule must match. The PEG notation is two
759
+ # or more expressions separated by a vertical bar, e.g.:
760
+ #
761
+ # expr | expr
762
+ #
763
+ class Choice
764
+ include List
765
+
766
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
767
+ # no match can be made.
768
+ def match(input, offset=0)
769
+ rules.each do |rule|
770
+ m = input.match(rule, offset)
771
+ return create_match([m], offset) if m
772
+ end
773
+ nil
774
+ end
775
+
776
+ # Returns the PEG notation of this rule as a string.
777
+ def to_s
778
+ rules.map {|r| r.embed }.join(' | ')
779
+ end
780
+ end
781
+
782
+ # A Sequence is a List where all rules must match. The PEG notation is two or
783
+ # more expressions separated by a space, e.g.:
784
+ #
785
+ # expr expr
786
+ #
787
+ class Sequence
788
+ include List
789
+
790
+ # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
791
+ # no match can be made.
792
+ def match(input, offset=0)
793
+ matches = []
794
+ os = offset
795
+ rules.each do |rule|
796
+ m = input.match(rule, os)
797
+ break unless m
798
+ matches << m
799
+ os += m.length
800
+ end
801
+ create_match(matches, offset) if matches.length == rules.length
802
+ end
803
+
804
+ # Returns the PEG notation of this rule as a string.
805
+ def to_s
806
+ rules.map {|r| r.embed }.join(' ')
807
+ end
808
+ end
809
+
810
+ # The base class for all matches. Matches are organized into a tree where any
811
+ # match may contain any number of other matches. This class provides several
812
+ # convenient tree traversal methods that help when examining parse results.
813
+ class Match
814
+ def initialize(data, offset=0)
815
+ case data
816
+ when String
817
+ @text = data
818
+ when MatchData
819
+ @text = data[0]
820
+ @captures = data.captures
821
+ when Array
822
+ @matches = data
823
+ end
824
+
825
+ @offset = offset
826
+ end
827
+
828
+ # The name by which this match can be accessed from a parent match. This
829
+ # will be the name of the rule that generated the match in most cases.
830
+ # However, if the match is the result of a Label this will be the value of
831
+ # the label.
832
+ attr_accessor :name
833
+
834
+ # The offset in the input at which this match occurred.
835
+ attr_reader :offset
836
+
837
+ # An array of all sub-matches of this match.
838
+ def matches
839
+ @matches ||= []
840
+ end
841
+
842
+ # An array of substrings returned by MatchData#captures if this match was
843
+ # created by an Expression.
844
+ def captures
845
+ @captures ||= []
846
+ end
847
+
848
+ # Returns the raw text value of this match, which may simply be an
849
+ # aggregate of the text of all sub-matches if this match is not #terminal?.
850
+ def text
851
+ @text ||= matches.inject('') {|s, m| s << m.text }
852
+ end
853
+
854
+ alias to_s text
855
+
856
+ # Returns the length of this match's #text value as an Integer.
857
+ def length
858
+ text.length
859
+ end
860
+
861
+ # Passes all arguments to the #text of this match.
862
+ def [](*args)
863
+ text.__send__(:[], *args)
864
+ end
865
+
866
+ # Returns an array of all sub-matches with the given +name+. If +deep+ is
867
+ # +false+, returns only sub-matches that are immediate descendants of this
868
+ # match.
869
+ def find(name, deep=true)
870
+ sym = name.to_sym
871
+ ms = matches.select {|m| sym == m.name }
872
+ ms.concat(matches.map {|m| m.find(name, deep) }.flatten) if deep
873
+ ms
874
+ end
875
+
876
+ # A shortcut for retrieving the first immediate sub-match of this match. If
877
+ # +name+ is given, attempts to retrieve the first immediate sub-match named
878
+ # +name+.
879
+ def first(name=nil)
880
+ name.nil? ? matches.first : find(name, false).first
881
+ end
882
+
883
+ # Returns +true+ if this match has no descendants (was created from a
884
+ # Terminal).
885
+ def terminal?
886
+ matches.length == 0
887
+ end
888
+
889
+ # Checks equality by comparing this match's #text value to +obj+.
890
+ def ==(obj)
891
+ text == obj
892
+ end
893
+
894
+ alias eql? ==
895
+
896
+ # Uses #match to allow sub-matches of this match to be called by name as
897
+ # instance methods.
898
+ def method_missing(sym, *args)
899
+ m = first(sym)
900
+ return m if m
901
+ raise 'No match named "%s" in %s (%s)' % [sym, self, name]
902
+ end
903
+ end
904
+ end