citrus 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,904 @@
1
+ # Citrus is a compact and powerful parsing library for Ruby that combines the
2
+ # elegance and expressiveness of the language with the simplicity and power of
3
+ # parsing expression grammars.
4
+ #
5
+ # http://github.com/mjijackson/citrus
6
+ module Citrus
7
# The version of Citrus as [major, minor, patch].
VERSION = [1, 0, 0]

# Positive floating point infinity; used as the default "no maximum" bound
# for repetitions.
Infinity = Float::INFINITY

# The PEG grammar support is loaded lazily from citrus/peg on first use.
autoload :PEG, 'citrus/peg'
12
+
13
# Reports the current version of Citrus as a dotted string built from the
# components of VERSION.
def self.version
  VERSION.map { |part| part.to_s }.join('.')
end
17
+
18
# Loads the grammar from the given +file+ into the global scope using #eval.
# If +file+ does not name an existing file, the ".citrus" extension is
# appended and the lookup is retried with that path.
#
# Raises a RuntimeError if the resulting path cannot be found or cannot be
# read. Returns whatever #eval returns for the file's contents.
def self.load(file)
  # Build a new string instead of appending in place: +file+ belongs to the
  # caller and may be frozen.
  path = File.file?(file) ? file : "#{file}.citrus"
  raise "Cannot find file #{path}" unless File.file?(path)
  raise "Cannot read file #{path}" unless File.readable?(path)
  self.eval(File.read(path))
end
25
+
26
# Evaluates the given Citrus parsing expression grammar +code+ in the global
# scope. The code is parsed with PEG and the +value+ of the resulting parse
# is returned (an array of any grammar modules that were created).
def self.eval(code)
  PEG.parse(code).value
end
32
+
33
# This error is raised whenever a parse fails. It derives from StandardError
# so that an ordinary +rescue+ clause is able to catch it.
class ParseError < StandardError
  # Builds the error message from the given +input+, which must respond to
  # +max_offset+ and +[]+. The message reports the furthest offset reached
  # and quotes up to the last 40 characters that were consumed before it.
  def initialize(input)
    @input = input
    c = consumed
    # Only show the tail of the consumed text to keep the message short.
    s = [0, c.length - 40].max
    msg = "Failed to parse input at offset %d" % max_offset
    msg += ", just after %s" % c[s, c.length].inspect + "\n"
    super(msg)
  end

  # The Input object that was used for the parse.
  attr_reader :input

  # Returns the maximum offset that was reached before the error occurred.
  def max_offset
    input.max_offset
  end

  # Returns the portion of the input string that was successfully consumed
  # before the parse failed.
  def consumed
    input[0, max_offset]
  end
end
58
+
59
# Including this module into another module marks that module as a grammar
# and extends it with the helper methods in GrammarMethods. The module itself
# contributes no methods, constants, or variables to its includers; the act
# of inclusion is what lets other code ask whether a module is a grammar.
module Grammar
  # Creates a new anonymous module that includes Grammar. When a +block+ is
  # given it is called with the new module as its argument if its +arity+ is
  # 1; otherwise it is +instance_eval+'d in the context of the new module.
  # See http://blog.grayproductions.net/articles/dsl_block_styles for the
  # rationale behind this decision.
  #
  # Grammars created with this method may be named by assigning them to a
  # constant, e.g.:
  #
  #     Calc = Grammar.new {}
  #
  def self.new(&block)
    mod = Module.new { include Grammar }
    if block
      if block.arity == 1
        block.call(mod)
      else
        mod.instance_eval(&block)
      end
    end
    mod
  end

  # Hook that runs whenever a module includes Grammar: extends that module
  # with GrammarMethods and makes its +include+ method public.
  def self.included(mod)
    mod.extend(GrammarMethods)
    mod.singleton_class.send(:public, :include)
  end
end
89
+
90
# Contains methods that are available to Grammar modules at the class level.
module GrammarMethods
  # Returns the name of this grammar as a string.
  def name
    super.to_s
  end

  # Returns an array of all grammars that have been included in this grammar
  # in the reverse order they were included.
  def included_grammars
    included_modules.select {|mod| mod.include?(Grammar) }
  end

  # Returns an array of all names of rules in this grammar as symbols ordered
  # in the same way they were defined (i.e. rules that were defined later
  # appear later in the array).
  def rule_names
    @rule_names ||= []
  end

  # Returns a hash of all Rule objects in this grammar, keyed by rule name.
  def rules
    @rules ||= {}
  end

  # Returns +true+ if this grammar has a rule with the given +name+.
  def has_rule?(name)
    rules.key?(name.to_sym)
  end

  # Loops through the rule tree for the given +rule+ looking for any Super
  # rules. When it finds one, it sets that rule's rule name to the given
  # +name+.
  def setup_super(rule, name) # :nodoc:
    if Nonterminal === rule
      rule.rules.each {|r| setup_super(r, name) }
    elsif Super === rule
      rule.rule_name = name
    end
  end
  private :setup_super

  # Searches the inheritance hierarchy of this grammar for a rule named +name+
  # and returns it on success. Returns +nil+ on failure.
  def super_rule(name)
    sym = name.to_sym
    included_grammars.each do |g|
      r = g.rule(sym)
      return r if r
    end
    nil
  end

  # Gets/sets the rule with the given +name+. If +obj+ is given the rule
  # will be set to the value of +obj+ passed through Rule#create. If a block
  # is given, its return value will be used for the value of +obj+.
  #
  # It is important to note that this method will also check any included
  # grammars for a rule with the given +name+ if one cannot be found in this
  # grammar.
  #
  # Any StandardError raised along the way is re-raised as a RuntimeError
  # whose message is prefixed with the name of the rule.
  def rule(name, obj=nil, &block)
    sym = name.to_sym

    # The block is captured explicitly; calling Proc.new without a block to
    # grab the method's block is not supported by Ruby 3.0 and later.
    obj = block.call if block

    if obj
      rule_names << sym unless has_rule?(sym)

      rule = Rule.create(obj)
      rule.name = name
      setup_super(rule, name)
      rule.grammar = self

      rules[sym] = rule
    end

    rules[sym] || super_rule(sym)
  rescue => e
    raise "Cannot create rule \"#{name}\": " + e.message
  end

  # Gets/sets the +name+ of the root rule of this grammar.
  def root(name=nil)
    @root = name.to_sym if name
    # The first rule in a grammar is the default root.
    @root || rule_names.first
  end

  # Creates a new Super for the rule currently being defined in the grammar. A
  # block may be provided to specify semantic behavior (via #ext).
  def sup(&block)
    ext(Super.new, block)
  end

  # Creates a new AndPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def andp(rule, &block)
    ext(AndPredicate.new(rule), block)
  end

  # Creates a new NotPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def notp(rule, &block)
    ext(NotPredicate.new(rule), block)
  end

  # Creates a new Label using the given +rule+ and +label+. A block may be
  # provided to specify semantic behavior (via #ext).
  def label(rule, label, &block)
    ext(Label.new(label, rule), block)
  end

  # Creates a new Repeat using the given +rule+. +min+ and +max+ specify the
  # minimum and maximum number of times the rule must match. A block may be
  # provided to specify semantic behavior (via #ext).
  def rep(rule, min=1, max=Infinity, &block)
    ext(Repeat.new(min, max, rule), block)
  end

  # An alias for #rep.
  def one_or_more(rule, &block)
    rep(rule, &block)
  end

  # An alias for #rep with a minimum of 0.
  def zero_or_more(rule, &block)
    rep(rule, 0, &block)
  end

  # An alias for #rep with a minimum of 0 and a maximum of 1.
  def zero_or_one(rule, &block)
    rep(rule, 0, 1, &block)
  end

  # Creates a new Sequence using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def all(*args, &block)
    ext(Sequence.new(args), block)
  end

  # Creates a new Choice using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def any(*args, &block)
    ext(Choice.new(args), block)
  end

  # Specifies a Module that will be used to extend all matches created with
  # the given +rule+. A block may also be given that will be used to create
  # an anonymous module; when both +mod+ and a block are given the block
  # wins. See Rule#ext=. Returns the rule (after Rule#create).
  def ext(rule, mod=nil, &block)
    rule = Rule.create(rule)
    mod = block if block
    rule.ext = mod if mod
    rule
  end

  # Parses the given +string+ from the given +offset+ using the rules in this
  # grammar. A ParseError is raised if there is no match made or if
  # +consume_all+ is +true+ and the match does not reach the end of the
  # input string. A RuntimeError is raised if the grammar has no usable root
  # rule.
  def parse(string, offset=0, enable_memo=false, consume_all=true)
    raise "No root rule specified" unless root

    root_rule = rule(root)
    raise "No rule named \"#{root}\"" unless root_rule

    input = Input.new(string, enable_memo)
    match = input.match(root_rule, offset)

    # The match starts at +offset+, so the end of the match is
    # offset + match.length; that is what must line up with the end of the
    # string for the whole input to count as consumed.
    if !match || (consume_all && offset + match.length != string.length)
      raise ParseError.new(input)
    end

    match
  end
end
265
+
266
# This class represents the core of the parsing algorithm. It wraps the input
# string and serves matches to all nonterminals.
class Input
  # The input string.
  attr_reader :string

  # The maximum offset that has been achieved.
  attr_reader :max_offset

  # A two-level hash of rule id's and offsets to their respective matches.
  # Only present if memoing is enabled.
  attr_reader :cache

  # The number of times the cache was hit. Only present if memoing is enabled.
  attr_reader :cache_hits

  # Takes the input +string+ that is to be parsed. If +enable_memo+ is +true+
  # a cache is created that holds references to already generated matches.
  def initialize(string, enable_memo=false)
    @string = string
    @max_offset = 0
    return unless enable_memo
    @cache = {}
    @cache_hits = 0
  end

  # Sends all arguments to this input's +string+.
  def [](*args)
    @string[*args]
  end

  # Returns the length of this input.
  def length
    @string.length
  end

  # Returns the match for a given +rule+ at +offset+, recording +offset+ as
  # the new maximum offset if it is the furthest seen so far. If memoing is
  # enabled and a match does not already exist for the given rule/offset pair
  # then the rule is executed and the result is cached before returning. See
  # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information
  # on memoing match results (also known as packrat parsing).
  def match(rule, offset=0)
    @max_offset = offset if offset > @max_offset

    # Without a cache every request goes straight to the rule.
    return rule.match(self, offset) unless @cache

    by_offset = (@cache[rule.id] ||= {})
    unless by_offset.key?(offset)
      return by_offset[offset] = rule.match(self, offset)
    end

    @cache_hits += 1
    by_offset[offset]
  end
end
325
+
326
+ # A Rule is an object that is used by a grammar to create matches on the
327
+ # Input during parsing.
328
+ module Rule
329
+ # Returns a new Rule object depending on the type of object given.
330
+ def self.create(obj)
331
+ case obj
332
+ when Rule then obj
333
+ when Symbol then Alias.new(obj)
334
+ when String then FixedWidth.new(obj)
335
+ when Regexp then Expression.new(obj)
336
+ when Array then Sequence.new(obj)
337
+ when Range then Choice.new(obj.to_a)
338
+ when Numeric then FixedWidth.new(obj.to_s)
339
+ else
340
+ raise ArgumentError, "Invalid rule object: #{obj.inspect}"
341
+ end
342
+ end
343
+
344
+ @uniq_id = 0
345
+
346
+ # Generates a new rule id.
347
+ def self.new_id
348
+ @uniq_id += 1
349
+ end
350
+
351
+ # The grammar this rule belongs to.
352
+ attr_accessor :grammar
353
+
354
+ # An integer id that is unique to this rule.
355
+ def id
356
+ @id ||= Rule.new_id
357
+ end
358
+
359
+ # Sets the name of this rule.
360
+ def name=(name)
361
+ @name = name.to_sym
362
+ end
363
+
364
+ # The name of this rule.
365
+ attr_reader :name
366
+
367
+ # Specifies a module that will be used to extend all Match objects that
368
+ # result from this rule. If +mod+ is a Proc, it is used to create an
369
+ # anonymous module.
370
+ def ext=(mod)
371
+ mod = Module.new(&mod) if Proc === mod
372
+ @ext = mod
373
+ end
374
+
375
+ # The module this rule uses to extend new matches.
376
+ attr_reader :ext
377
+
378
+ # Returns +true+ if this rule is a Terminal.
379
+ def terminal?
380
+ is_a?(Terminal)
381
+ end
382
+
383
+ # Returns +true+ if this rule needs to be surrounded by parentheses when
384
+ # using #embed.
385
+ def paren?
386
+ false
387
+ end
388
+
389
+ # Returns a string version of this rule that is suitable to be used in the
390
+ # string representation of another rule.
391
+ def embed
392
+ name ? name.to_s : (paren? ? '(%s)' % to_s : to_s)
393
+ end
394
+
395
+ def inspect # :nodoc:
396
+ to_s
397
+ end
398
+
399
+ private
400
+
401
+ def extend_match(match)
402
+ match.extend(ext) if ext
403
+ end
404
+
405
+ def create_match(data, offset)
406
+ match = Match.new(data, offset)
407
+ extend_match(match)
408
+ match.name = name
409
+ match
410
+ end
411
+ end
412
+
413
# A Proxy is a Rule that is a placeholder for another rule. It stores the
# name of some other rule in the grammar internally and resolves it to the
# actual Rule object at runtime. This lazy evaluation permits us to create
# Proxy objects for rules that we may not know the definition of yet.
module Proxy
  include Rule

  # The name of this proxy's rule.
  attr_reader :rule_name

  # Creates a proxy for the rule called +name+.
  def initialize(name='<proxy>')
    self.rule_name = name
  end

  # Sets the name of the rule this rule is proxy for (stored as a symbol).
  def rule_name=(name)
    @rule_name = name.to_sym
  end

  # Returns the underlying Rule for this proxy. The rule is looked up with
  # +resolve!+ (supplied by the including class) on first use and then
  # remembered.
  def rule
    @rule ||= resolve!
  end

  # Returns the Match for this proxy's #rule on +input+ at the given +offset+,
  # +nil+ if no match can be made.
  def match(input, offset=0)
    found = input.match(rule, offset)
    return unless found

    extend_match(found)
    # If this Proxy has a name then it should rename all of its matches.
    found.name = name if name
    found
  end
end
449
+
450
# An Alias is a Proxy for a rule in the same grammar. It is used in rule
# definitions when a rule calls some other rule by name. The PEG notation is
# simply the name of another rule without any other punctuation, e.g.:
#
#     name
#
class Alias
  include Proxy

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule_name.to_s
  end

  private

  # Asks this proxy's grammar (which also consults any included grammars)
  # for a rule with this proxy's #rule_name. Raises a RuntimeError if one
  # cannot be found.
  def resolve!
    target = grammar.rule(rule_name)
    unless target
      raise RuntimeError, 'No rule named "%s" in grammar %s' %
        [rule_name, grammar.name]
    end
    target
  end
end
475
+
476
# A Super is a Proxy for a rule of the same name that was defined previously
# in the grammar's inheritance chain. Thus, Supers work like Ruby's +super+,
# only for rules in a grammar instead of methods in a module. The PEG notation
# is the word +super+ without any other punctuation, e.g.:
#
#     super
#
class Super
  include Proxy

  # Returns the PEG notation of this rule as a string.
  def to_s
    'super'
  end

  private

  # Searches the grammars included by this proxy's grammar for a rule with
  # this proxy's #rule_name. Raises a RuntimeError if one cannot be found.
  def resolve!
    target = grammar.super_rule(rule_name)
    unless target
      raise RuntimeError, 'No rule named "%s" in hierarchy of grammar %s' %
        [rule_name, grammar.name]
    end
    target
  end
end
502
+
503
# A Terminal is a Rule that matches directly on the input stream and may not
# contain any other rule.
module Terminal
  include Rule

  # The actual String or Regexp object this rule uses to match.
  attr_reader :rule

  # Stores the object that this terminal matches with.
  def initialize(rule)
    @rule = rule
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    @rule.inspect
  end
end
520
+
521
# A FixedWidth is a Terminal that matches based on its length. The PEG
# notation is any sequence of characters enclosed in either single or double
# quotes, e.g.:
#
#     'expr'
#     "expr"
#
class FixedWidth
  include Terminal

  # Takes the String to match. Raises an ArgumentError for anything else.
  def initialize(rule='')
    unless String === rule
      raise ArgumentError, "FixedWidth must be a String"
    end
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The match is made when the slice of input that
  # starts at +offset+ and is as long as the rule's string equals that string.
  def match(input, offset=0)
    return unless rule == input[offset, rule.length]
    # The match gets its own copy of the string.
    create_match(rule.dup, offset)
  end
end
542
+
543
# An Expression is a Terminal that has the same semantics as a regular
# expression in Ruby. The expression must match at the beginning of the input
# (index 0). The PEG notation is identical to Ruby's regular expression
# notation, e.g.:
#
#     /expr/
#
# Character classes and the dot symbol may also be used in PEG notation for
# compatibility with other PEG implementations, e.g.:
#
#     [a-zA-Z]
#     .
#
class Expression
  include Terminal

  # Takes the Regexp to match. Raises an ArgumentError for anything else.
  def initialize(rule=/^/)
    raise ArgumentError, "Expression must be a Regexp" unless Regexp === rule
    super
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The expression is applied to the part of the input
  # that starts at +offset+ and only counts when it matches at the very
  # start of that part.
  def match(input, offset=0)
    remaining = input[offset, input.length - offset]
    # Slicing yields nil when +offset+ lies beyond the end of the input;
    # that is simply "no match", not an error.
    return unless remaining

    result = remaining.match(rule)
    create_match(result, offset) if result && result.begin(0) == 0
  end
end
571
+
572
# A Nonterminal is a Rule that augments the matching behavior of one or more
# other rules. Nonterminals may not match directly on the input, but instead
# invoke the rule(s) they contain to determine if a match can be made from
# the collective result.
module Nonterminal
  include Rule

  # An array of the actual Rule objects this rule uses to match.
  attr_reader :rules

  # Takes an array of objects, each of which is converted with Rule.create.
  def initialize(rules=[])
    @rules = rules.collect { |obj| Rule.create(obj) }
  end

  # Assigns +grammar+ to every contained rule first and then to this rule.
  def grammar=(grammar)
    @rules.each { |child| child.grammar = grammar }
    super(grammar)
  end
end
591
+
592
# A Predicate is a Nonterminal that contains one other rule.
module Predicate
  include Nonterminal

  # Wraps the single +rule+ in an array for Nonterminal.
  def initialize(rule='')
    super([rule])
  end

  # Returns the Rule object this rule uses to match.
  def rule
    rules[0]
  end
end
605
+
606
# An AndPredicate is a Predicate that contains a rule that must match. Upon
# success an empty match is returned and no input is consumed. The PEG
# notation is any expression preceded by an ampersand, e.g.:
#
#     &expr
#
class AndPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    return unless input.match(rule, offset)
    create_match('', offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    "&#{rule.embed}"
  end
end
626
+
627
# A NotPredicate is a Predicate that contains a rule that must not match. Upon
# success an empty match is returned and no input is consumed. The PEG
# notation is any expression preceded by an exclamation mark, e.g.:
#
#     !expr
#
class NotPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    return if input.match(rule, offset)
    create_match('', offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    "!#{rule.embed}"
  end
end
647
+
648
# A Label is a Predicate that applies a new name to any matches made by its
# rule. The PEG notation is any sequence of word characters (i.e.
# <tt>[a-zA-Z0-9_]</tt>) followed by a colon, followed by any other
# expression, e.g.:
#
#     label:expr
#
class Label
  include Predicate

  # The symbol this rule uses to re-name all its matches.
  attr_reader :label

  # Takes the +label+ (stored as a symbol) and the +rule+ it applies to.
  def initialize(label='<label>', rule='')
    @label = label.to_sym
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. When a Label makes a match, it re-names the match to
  # the value of its label.
  #
  # The contained rule is invoked directly here rather than through
  # +input.match+.
  def match(input, offset=0)
    labeled = rule.match(input, offset)
    return unless labeled

    extend_match(labeled)
    labeled.name = label
    labeled
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    "#{label}:#{rule.embed}"
  end
end
683
+
684
# A Repeat is a Predicate that specifies a minimum and maximum number of times
# its rule must match. The PEG notation is an integer, +N+, followed by an
# asterisk, followed by another integer, +M+, all of which follow any other
# expression, e.g.:
#
#     expr N*M
#
# In this notation +N+ specifies the minimum number of times the preceding
# expression must match and +M+ specifies the maximum. If +N+ is omitted,
# it is assumed to be 0. Likewise, if +M+ is omitted, it is assumed to be
# infinity (no maximum). Thus, an expression followed by only an asterisk may
# match any number of times, including zero.
#
# The shorthand notation <tt>+</tt> and <tt>?</tt> may be used for the common
# cases of <tt>1*</tt> and <tt>*1</tt> respectively, e.g.:
#
#     expr+
#     expr?
#
class Repeat
  include Predicate

  # Takes the +min+ and +max+ number of repetitions and the +rule+ to repeat.
  # Raises an ArgumentError if +min+ is greater than +max+.
  def initialize(min=1, max=Infinity, rule='')
    raise ArgumentError, "Min cannot be greater than max" if min > max
    @range = Range.new(min, max)
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The rule is applied repeatedly, each time where the
  # previous match ended, until it fails or the maximum is reached.
  def match(input, offset=0)
    collected = []
    position = offset
    until collected.length >= @range.end
      m = input.match(rule, position)
      break unless m
      collected << m
      position += m.length
    end
    create_match(collected, offset) if @range.include?(collected.length)
  end

  # Returns the operator this rule uses as a string. Will be one of
  # <tt>+</tt>, <tt>?</tt>, or <tt>N*M</tt>.
  def operator
    @operator ||= begin
      # A bound of 0 or Infinity is left out of the notation.
      low, high = [@range.begin, @range.end].map do |n|
        (n == 0 || n == Infinity) ? '' : n.to_s
      end
      if low == '' && high == '1'
        '?'
      elsif low == '1' && high == ''
        '+'
      else
        [low, high].join('*')
      end
    end
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule.embed + operator
  end
end
747
+
748
# A List is a Nonterminal that contains any number of other rules and tests
# them for matches in sequential order.
module List
  include Nonterminal

  # A list needs parentheses when embedded if it holds more than one rule.
  def paren?
    rules.length > 1
  end
end
757
+
758
# A Choice is a List where only one rule must match. The PEG notation is two
# or more expressions separated by a vertical bar, e.g.:
#
#     expr | expr
#
class Choice
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The rules are tried in order and the first one that
  # matches wins.
  def match(input, offset=0)
    rules.each do |candidate|
      hit = input.match(candidate, offset)
      next unless hit
      return create_match([hit], offset)
    end
    nil
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map(&:embed).join(' | ')
  end
end
781
+
782
# A Sequence is a List where all rules must match. The PEG notation is two or
# more expressions separated by a space, e.g.:
#
#     expr expr
#
class Sequence
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. Each rule is applied where the previous one ended;
  # the first rule that fails makes the whole sequence fail.
  def match(input, offset=0)
    position = offset
    collected = []
    rules.each do |part|
      m = input.match(part, position)
      return nil unless m
      collected << m
      position += m.length
    end
    create_match(collected, offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map(&:embed).join(' ')
  end
end
809
+
810
# The base class for all matches. Matches are organized into a tree where any
# match may contain any number of other matches. This class provides several
# convenient tree traversal methods that help when examining parse results.
class Match
  # Creates a match at +offset+ from +data+: a String becomes the text of the
  # match, a MatchData supplies both the text and the captures, and an Array
  # is taken as the list of sub-matches.
  def initialize(data, offset=0)
    case data
    when String
      @text = data
    when MatchData
      @text = data[0]
      @captures = data.captures
    when Array
      @matches = data
    end

    @offset = offset
  end

  # The name by which this match can be accessed from a parent match. This
  # will be the name of the rule that generated the match in most cases.
  # However, if the match is the result of a Label this will be the value of
  # the label.
  attr_accessor :name

  # The offset in the input at which this match occurred.
  attr_reader :offset

  # An array of all sub-matches of this match.
  def matches
    @matches ||= []
  end

  # An array of substrings returned by MatchData#captures if this match was
  # created by an Expression.
  def captures
    @captures ||= []
  end

  # Returns the raw text value of this match, which may simply be an
  # aggregate of the text of all sub-matches if this match is not #terminal?.
  def text
    @text ||= matches.inject('') {|s, m| s << m.text }
  end

  alias to_s text

  # Returns the length of this match's #text value as an Integer.
  def length
    text.length
  end

  # Passes all arguments to the #text of this match.
  def [](*args)
    text.__send__(:[], *args)
  end

  # Returns an array of all sub-matches with the given +name+. If +deep+ is
  # +false+, returns only sub-matches that are immediate descendants of this
  # match.
  def find(name, deep=true)
    sym = name.to_sym
    ms = matches.select {|m| sym == m.name }
    # Every nested #find already returns a flat array, so joining them one
    # level deep is enough and never has to probe a Match for +to_ary+.
    ms.concat(matches.flat_map {|m| m.find(name, deep) }) if deep
    ms
  end

  # A shortcut for retrieving the first immediate sub-match of this match. If
  # +name+ is given, attempts to retrieve the first immediate sub-match named
  # +name+.
  def first(name=nil)
    name.nil? ? matches.first : find(name, false).first
  end

  # Returns +true+ if this match has no descendants (was created from a
  # Terminal).
  def terminal?
    matches.length == 0
  end

  # Checks equality by comparing this match's #text value to +obj+.
  def ==(obj)
    text == obj
  end

  alias eql? ==

  # Uses #first to allow immediate sub-matches of this match to be called by
  # name as instance methods. Raises a RuntimeError if there is no sub-match
  # with that name.
  def method_missing(sym, *args)
    m = first(sym)
    return m if m
    raise 'No match named "%s" in %s (%s)' % [sym, self, name]
  end

  # Reports the names handled by #method_missing, i.e. the names of immediate
  # sub-matches. Without this, implicit conversion probes made by Ruby itself
  # (+to_ary+ from Array#flatten, +to_str+ from Kernel#puts, ...) would reach
  # #method_missing and raise.
  def respond_to_missing?(sym, include_all=false)
    !first(sym).nil? || super
  end
end
904
+ end