citrus 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/Rakefile +67 -0
- data/citrus.gemspec +29 -0
- data/examples/calc.citrus +103 -0
- data/examples/calc.rb +95 -0
- data/examples/calc_sugar.rb +94 -0
- data/lib/citrus.rb +904 -0
- data/lib/citrus/debug.rb +37 -0
- data/lib/citrus/peg.rb +375 -0
- data/lib/citrus/sugar.rb +25 -0
- data/test/alias_test.rb +66 -0
- data/test/and_predicate_test.rb +27 -0
- data/test/calc_peg_test.rb +6 -0
- data/test/calc_sugar_test.rb +6 -0
- data/test/calc_test.rb +6 -0
- data/test/choice_test.rb +62 -0
- data/test/expression_test.rb +29 -0
- data/test/fixed_width_test.rb +37 -0
- data/test/grammar_test.rb +129 -0
- data/test/helper.rb +143 -0
- data/test/label_test.rb +26 -0
- data/test/match_test.rb +76 -0
- data/test/not_predicate_test.rb +27 -0
- data/test/peg_test.rb +663 -0
- data/test/repeat_test.rb +93 -0
- data/test/rule_test.rb +49 -0
- data/test/sequence_test.rb +53 -0
- data/test/super_test.rb +66 -0
- metadata +133 -0
data/lib/citrus.rb
ADDED
@@ -0,0 +1,904 @@
|
|
1
|
+
# Citrus is a compact and powerful parsing library for Ruby that combines the
|
2
|
+
# elegance and expressiveness of the language with the simplicity and power of
|
3
|
+
# parsing expression grammars.
|
4
|
+
#
|
5
|
+
# http://github.com/mjijackson/citrus
|
6
|
+
module Citrus
|
7
|
+
VERSION = [1, 0, 0]
|
8
|
+
|
9
|
+
Infinity = 1.0 / 0
|
10
|
+
|
11
|
+
autoload 'PEG', 'citrus/peg'
|
12
|
+
|
13
|
+
# Returns the current version of Citrus as a dotted string, built by joining
# the components of VERSION with ".".
def self.version
  VERSION.join('.')
end
|
17
|
+
|
18
|
+
# Loads the grammar from the given +file+ into the global scope using #eval.
#
# If +file+ does not name an existing file, the ".citrus" extension is
# appended before the lookup. Raises a RuntimeError if the resulting path does
# not exist or cannot be read. Returns whatever #eval returns for the file's
# contents.
def self.load(file)
  # Build a new string rather than appending with <<, which would modify the
  # caller's argument in place (and raise on a frozen string).
  file = "#{file}.citrus" unless File.file?(file)
  raise "Cannot find file #{file}" unless File.file?(file)
  raise "Cannot read file #{file}" unless File.readable?(file)
  self.eval(File.read(file))
end
|
25
|
+
|
26
|
+
# Evaluates the given Citrus parsing expression grammar +code+ in the global
# scope. The code is parsed with PEG.parse and the +value+ of the resulting
# match is returned, which is an array of any grammar modules that were
# created.
def self.eval(code)
  PEG.parse(code).value
end
|
32
|
+
|
33
|
+
# This error is raised whenever a parse fails. It derives from StandardError
# so that an ordinary +rescue+ clause is able to catch it.
class ParseError < StandardError
  # Builds the error message from the given +input+, which must respond to
  # +max_offset+ and +[]+. The message reports the furthest offset reached and
  # quotes (at most) the last 40 characters that were consumed before it.
  def initialize(input)
    @input = input
    c = consumed
    # Only the tail of the consumed text is shown to keep the message short.
    s = [0, c.length - 40].max
    msg = "Failed to parse input at offset %d" % max_offset
    msg += ", just after %s" % c[s, c.length].inspect + "\n"
    super(msg)
  end

  # The Input object that was used for the parse.
  attr_reader :input

  # Returns the maximum offset that was reached before the error occurred.
  def max_offset
    input.max_offset
  end

  # Returns the portion of the input string that was successfully consumed
  # before the parse failed.
  def consumed
    input[0, max_offset]
  end
end
|
58
|
+
|
59
|
+
# Inclusion of this module into another extends the receiver with the grammar
# helper methods in GrammarMethods. Although this module does not actually
# provide any methods, constants, or variables to modules that include it, the
# mere act of inclusion provides a useful lookup mechanism to determine if a
# module is in fact a grammar.
module Grammar
  # Creates a new anonymous module that includes Grammar. If a +block+ is
  # provided, it will be called with the new module as its first argument if
  # its +arity+ is 1 or +instance_eval+'d in the context of the new module
  # otherwise. See http://blog.grayproductions.net/articles/dsl_block_styles
  # for the rationale behind this decision.
  #
  # Grammars created with this method may be assigned a name by being assigned
  # to some constant, e.g.:
  #
  #     Calc = Grammar.new {}
  #
  # Returns the new module.
  def self.new(&block)
    mod = Module.new { include Grammar }
    if block
      block.arity == 1 ? block.call(mod) : mod.instance_eval(&block)
    end
    mod
  end

  # Extends all modules that +include Grammar+ with GrammarMethods and
  # exposes Module#include.
  def self.included(mod)
    mod.extend(GrammarMethods)
    class << mod; public :include end
  end
end
|
89
|
+
|
90
|
+
# Contains methods that are available to Grammar modules at the class level.
module GrammarMethods
  # Returns the name of this grammar as a string.
  def name
    super.to_s
  end

  # Returns an array of all grammars that have been included in this grammar
  # in the reverse order they were included.
  def included_grammars
    included_modules.select {|mod| mod.include?(Grammar) }
  end

  # Returns an array of all names of rules in this grammar as symbols ordered
  # in the same way they were defined (i.e. rules that were defined later
  # appear later in the array).
  def rule_names
    @rule_names ||= []
  end

  # Returns a hash of all Rule objects in this grammar, keyed by rule name.
  def rules
    @rules ||= {}
  end

  # Returns +true+ if this grammar has a rule with the given +name+.
  def has_rule?(name)
    rules.key?(name.to_sym)
  end

  # Loops through the rule tree for the given +rule+ looking for any Super
  # rules. When it finds one, it sets that rule's rule name to the given
  # +name+.
  def setup_super(rule, name) # :nodoc:
    if Nonterminal === rule
      rule.rules.each {|r| setup_super(r, name) }
    elsif Super === rule
      rule.rule_name = name
    end
  end
  private :setup_super

  # Searches the inheritance hierarchy of this grammar for a rule named +name+
  # and returns it on success. Returns +nil+ on failure.
  def super_rule(name)
    sym = name.to_sym
    included_grammars.each do |g|
      r = g.rule(sym)
      return r if r
    end
    nil
  end

  # Gets/sets the rule with the given +name+. If +obj+ is given the rule
  # will be set to the value of +obj+ passed through Rule#create. If a block
  # is given, its return value will be used for the value of +obj+.
  #
  # It is important to note that this method will also check any included
  # grammars for a rule with the given +name+ if one cannot be found in this
  # grammar.
  #
  # Any StandardError raised along the way is re-raised as a RuntimeError
  # whose message is prefixed with the rule name.
  def rule(name, obj=nil)
    sym = name.to_sym

    # Use yield rather than Proc.new, which no longer captures the method's
    # block implicitly on current Ruby versions.
    obj = yield if block_given?

    if obj
      rule = Rule.create(obj)
      rule.name = name
      setup_super(rule, name)
      rule.grammar = self

      # Record the name only once the rule was built successfully so that a
      # failed definition does not leave a dangling entry in rule_names.
      rule_names << sym unless has_rule?(sym)
      rules[sym] = rule
    end

    rules[sym] || super_rule(sym)
  rescue => e
    raise "Cannot create rule \"#{name}\": " + e.message
  end

  # Gets/sets the +name+ of the root rule of this grammar.
  def root(name=nil)
    @root = name.to_sym if name
    # The first rule in a grammar is the default root.
    @root || rule_names.first
  end

  # Creates a new Super for the rule currently being defined in the grammar. A
  # block may be provided to specify semantic behavior (via #ext).
  def sup(&block)
    ext(Super.new, block)
  end

  # Creates a new AndPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def andp(rule, &block)
    ext(AndPredicate.new(rule), block)
  end

  # Creates a new NotPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def notp(rule, &block)
    ext(NotPredicate.new(rule), block)
  end

  # Creates a new Label using the given +rule+ and +label+. A block may be
  # provided to specify semantic behavior (via #ext).
  def label(rule, label, &block)
    ext(Label.new(label, rule), block)
  end

  # Creates a new Repeat using the given +rule+. +min+ and +max+ specify the
  # minimum and maximum number of times the rule must match. A block may be
  # provided to specify semantic behavior (via #ext).
  def rep(rule, min=1, max=Infinity, &block)
    ext(Repeat.new(min, max, rule), block)
  end

  # An alias for #rep.
  def one_or_more(rule, &block)
    rep(rule, &block)
  end

  # An alias for #rep with a minimum of 0.
  def zero_or_more(rule, &block)
    rep(rule, 0, &block)
  end

  # An alias for #rep with a minimum of 0 and a maximum of 1.
  def zero_or_one(rule, &block)
    rep(rule, 0, 1, &block)
  end

  # Creates a new Sequence using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def all(*args, &block)
    ext(Sequence.new(args), block)
  end

  # Creates a new Choice using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def any(*args, &block)
    ext(Choice.new(args), block)
  end

  # Specifies a Module that will be used to extend all matches created with
  # the given +rule+. A block may also be given that will be used to create
  # an anonymous module; when both are supplied the block wins. See Rule#ext=.
  # Returns the rule (after passing it through Rule#create).
  def ext(rule, mod=nil, &block)
    rule = Rule.create(rule)
    # The block is captured explicitly; Proc.new without a block is not
    # supported on current Ruby versions.
    mod = block if block
    rule.ext = mod if mod
    rule
  end

  # Parses the given +string+ from the given +offset+ using the rules in this
  # grammar. A ParseError is raised if there is no match made or if
  # +consume_all+ is +true+ and the entire input string cannot be consumed.
  # A RuntimeError is raised if the grammar has no usable root rule. Returns
  # the resulting match.
  def parse(string, offset=0, enable_memo=false, consume_all=true)
    raise "No root rule specified" unless root

    root_rule = rule(root)
    raise "No rule named \"#{root}\"" unless root_rule

    input = Input.new(string, enable_memo)
    match = input.match(root_rule, offset)

    # The match starts at +offset+, so "everything consumed" means its length
    # equals what remains of the string after that offset.
    if !match || (consume_all && match.length != string.length - offset)
      raise ParseError.new(input)
    end

    match
  end
end
|
265
|
+
|
266
|
+
# This class represents the core of the parsing algorithm. It wraps the input
# string and serves matches to all nonterminals.
class Input
  # Takes the input +string+ that is to be parsed. If +enable_memo+ is +true+
  # a cache is created that holds references to already generated matches.
  def initialize(string, enable_memo=false)
    @string = string
    @max_offset = 0
    if enable_memo
      @cache = {}
      @cache_hits = 0
    end
  end

  # The input string.
  attr_reader :string

  # The maximum offset that has been achieved.
  attr_reader :max_offset

  # A two-level hash of rule id's and offsets to their respective matches.
  # Only present if memoing is enabled.
  attr_reader :cache

  # The number of times the cache was hit. Only present if memoing is enabled.
  attr_reader :cache_hits

  # Sends all arguments to this input's +string+.
  def [](*args)
    @string[*args]
  end

  # Returns the length of this input.
  def length
    @string.length
  end

  # Returns the match for a given +rule+ at +offset+. If memoing is enabled
  # and a match does not already exist for the given rule/offset pair then
  # the rule is executed and the result is cached before returning. See
  # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information
  # on memoing match results (also known as packrat parsing).
  #
  # Every call also raises #max_offset to +offset+ if it is larger than the
  # value seen so far.
  def match(rule, offset=0)
    @max_offset = offset if offset > @max_offset

    return rule.match(self, offset) unless @cache

    c = @cache[rule.id] ||= {}

    # key? is used instead of a truthiness test so that failed attempts
    # (stored as nil) count as cache hits too.
    if c.key?(offset)
      @cache_hits += 1
      c[offset]
    else
      c[offset] = rule.match(self, offset)
    end
  end
end
|
325
|
+
|
326
|
+
# A Rule is an object that is used by a grammar to create matches on the
# Input during parsing.
module Rule
  # Returns a new Rule object depending on the type of object given. Objects
  # that already are rules are returned unchanged. Raises an ArgumentError
  # for any object type that has no rule equivalent.
  def self.create(obj)
    case obj
    when Rule     then obj
    when Symbol   then Alias.new(obj)
    when String   then FixedWidth.new(obj)
    when Regexp   then Expression.new(obj)
    when Array    then Sequence.new(obj)
    when Range    then Choice.new(obj.to_a)
    when Numeric  then FixedWidth.new(obj.to_s)
    else
      raise ArgumentError, "Invalid rule object: #{obj.inspect}"
    end
  end

  @uniq_id = 0

  # Generates a new rule id.
  def self.new_id
    @uniq_id += 1
  end

  # The grammar this rule belongs to.
  attr_accessor :grammar

  # An integer id that is unique to this rule. It is assigned lazily on first
  # access.
  def id
    @id ||= Rule.new_id
  end

  # Sets the name of this rule. The name is stored as a symbol.
  def name=(name)
    @name = name.to_sym
  end

  # The name of this rule.
  attr_reader :name

  # Specifies a module that will be used to extend all Match objects that
  # result from this rule. If +mod+ is a Proc, it is used to create an
  # anonymous module.
  def ext=(mod)
    mod = Module.new(&mod) if Proc === mod
    @ext = mod
  end

  # The module this rule uses to extend new matches.
  attr_reader :ext

  # Returns +true+ if this rule is a Terminal.
  def terminal?
    is_a?(Terminal)
  end

  # Returns +true+ if this rule needs to be surrounded by parentheses when
  # using #embed.
  def paren?
    false
  end

  # Returns a string version of this rule that is suitable to be used in the
  # string representation of another rule. Named rules are embedded by name;
  # anonymous rules are embedded by their own notation, parenthesized when
  # #paren? says so.
  def embed
    name ? name.to_s : (paren? ? '(%s)' % to_s : to_s)
  end

  def inspect # :nodoc:
    to_s
  end

private

  # Extends the given +match+ with this rule's #ext module, if it has one.
  def extend_match(match)
    match.extend(ext) if ext
  end

  # Creates a new Match from +data+ at +offset+, extends it with this rule's
  # #ext module and names it after this rule.
  def create_match(data, offset)
    match = Match.new(data, offset)
    extend_match(match)
    match.name = name
    match
  end
end
|
412
|
+
|
413
|
+
# A Proxy is a Rule that is a placeholder for another rule. It stores the
# name of some other rule in the grammar internally and resolves it to the
# actual Rule object at runtime. This lazy evaluation permits us to create
# Proxy objects for rules that we may not know the definition of yet.
#
# Classes that include this module must supply a private +resolve!+ method
# that returns the target rule.
module Proxy
  include Rule

  def initialize(name='<proxy>')
    self.rule_name = name
  end

  # Sets the name of the rule this rule is proxy for. The name is stored as a
  # symbol.
  def rule_name=(name)
    @rule_name = name.to_sym
  end

  # The name of this proxy's rule.
  attr_reader :rule_name

  # Returns the underlying Rule for this proxy. The rule is resolved on first
  # use and remembered afterwards.
  def rule
    @rule ||= resolve!
  end

  # Returns the Match for this proxy's #rule on +input+ at the given +offset+,
  # +nil+ if no match can be made.
  def match(input, offset=0)
    m = input.match(rule, offset)
    if m
      extend_match(m)
      # If this Proxy has a name then it should rename all of its matches.
      m.name = name if name
      m
    end
  end
end
|
449
|
+
|
450
|
+
# An Alias is a Proxy for a rule in the same grammar. It is used in rule
# definitions when a rule calls some other rule by name. The PEG notation is
# simply the name of another rule without any other punctuation, e.g.:
#
#     name
#
class Alias
  include Proxy

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule_name.to_s
  end

private

  # Searches this proxy's grammar and any included grammars for a rule with
  # this proxy's #rule_name. Raises an error if one cannot be found.
  def resolve!
    rule = grammar.rule(rule_name)
    raise RuntimeError, 'No rule named "%s" in grammar %s' %
      [rule_name, grammar.name] unless rule
    rule
  end
end
|
475
|
+
|
476
|
+
# A Super is a Proxy for a rule of the same name that was defined previously
# in the grammar's inheritance chain. Thus, Supers work like Ruby's +super+,
# only for rules in a grammar instead of methods in a module. The PEG notation
# is the word +super+ without any other punctuation, e.g.:
#
#     super
#
class Super
  include Proxy

  # Returns the PEG notation of this rule as a string.
  def to_s
    'super'
  end

private

  # Searches this proxy's included grammars for a rule with this proxy's
  # #rule_name. Raises an error if one cannot be found.
  def resolve!
    rule = grammar.super_rule(rule_name)
    raise RuntimeError, 'No rule named "%s" in hierarchy of grammar %s' %
      [rule_name, grammar.name] unless rule
    rule
  end
end
|
502
|
+
|
503
|
+
# A Terminal is a Rule that matches directly on the input stream and may not
# contain any other rule.
module Terminal
  include Rule

  # Stores the raw object (+rule+) that is used for matching.
  def initialize(rule)
    @rule = rule
  end

  # The actual String or Regexp object this rule uses to match.
  attr_reader :rule

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule.inspect
  end
end
|
520
|
+
|
521
|
+
# A FixedWidth is a Terminal that matches based on its length. The PEG
# notation is any sequence of characters enclosed in either single or double
# quotes, e.g.:
#
#     'expr'
#     "expr"
#
class FixedWidth
  include Terminal

  # Raises an ArgumentError unless +rule+ is a String.
  def initialize(rule='')
    raise ArgumentError, "FixedWidth must be a String" unless String === rule
    super
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    # The match receives a copy so that it does not share this rule's string.
    create_match(rule.dup, offset) if rule == input[offset, rule.length]
  end
end
|
542
|
+
|
543
|
+
# An Expression is a Terminal that has the same semantics as a regular
# expression in Ruby. The expression must match at the very beginning of the
# remaining input, i.e. exactly at the offset being tried. The PEG notation is
# identical to Ruby's regular expression notation, e.g.:
#
#     /expr/
#
# Character classes and the dot symbol may also be used in PEG notation for
# compatibility with other PEG implementations, e.g.:
#
#     [a-zA-Z]
#     .
#
class Expression
  include Terminal

  # Raises an ArgumentError unless +rule+ is a Regexp.
  def initialize(rule=/^/)
    raise ArgumentError, "Expression must be a Regexp" unless Regexp === rule
    super
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    result = input[offset, input.length - offset].match(rule)
    # A hit further into the remaining input does not count as a match here.
    create_match(result, offset) if result && result.begin(0) == 0
  end
end
|
571
|
+
|
572
|
+
# A Nonterminal is a Rule that augments the matching behavior of one or more
# other rules. Nonterminals may not match directly on the input, but instead
# invoke the rule(s) they contain to determine if a match can be made from
# the collective result.
module Nonterminal
  include Rule

  # Converts every element of +rules+ to a Rule via Rule.create.
  def initialize(rules=[])
    @rules = rules.map {|r| Rule.create(r) }
  end

  # An array of the actual Rule objects this rule uses to match.
  attr_reader :rules

  # Assigns the +grammar+ to this rule and to every rule it contains.
  def grammar=(grammar)
    @rules.each {|r| r.grammar = grammar }
    super
  end
end
|
591
|
+
|
592
|
+
# A Predicate is a Nonterminal that contains one other rule.
module Predicate
  include Nonterminal

  # Wraps the single +rule+ in an array for Nonterminal#initialize.
  def initialize(rule='')
    super([ rule ])
  end

  # Returns the Rule object this rule uses to match.
  def rule
    rules[0]
  end
end
|
605
|
+
|
606
|
+
# An AndPredicate is a Predicate that contains a rule that must match. Upon
# success an empty match is returned and no input is consumed. The PEG
# notation is any expression preceded by an ampersand, e.g.:
#
#     &expr
#
class AndPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    create_match('', offset) if input.match(rule, offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    '&' + rule.embed
  end
end
|
626
|
+
|
627
|
+
# A NotPredicate is a Predicate that contains a rule that must not match. Upon
# success an empty match is returned and no input is consumed. The PEG
# notation is any expression preceded by an exclamation mark, e.g.:
#
#     !expr
#
class NotPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    create_match('', offset) unless input.match(rule, offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    '!' + rule.embed
  end
end
|
647
|
+
|
648
|
+
# A Label is a Predicate that applies a new name to any matches made by its
# rule. The PEG notation is any sequence of word characters (i.e.
# <tt>[a-zA-Z0-9_]</tt>) followed by a colon, followed by any other
# expression, e.g.:
#
#     label:expr
#
class Label
  include Predicate

  # Stores +label+ as a symbol and wraps +rule+.
  def initialize(label='<label>', rule='')
    @label = label.to_sym
    super(rule)
  end

  # The symbol this rule uses to re-name all its matches.
  attr_reader :label

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. When a Label makes a match, it re-names the match to
  # the value of its label.
  def match(input, offset=0)
    # The wrapped rule is invoked directly here, not through Input#match, so
    # this particular lookup does not go through the input's cache.
    m = rule.match(input, offset)
    if m
      extend_match(m)
      m.name = label
      m
    end
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    label.to_s + ':' + rule.embed
  end
end
|
683
|
+
|
684
|
+
# A Repeat is a Predicate that specifies a minimum and maximum number of times
# its rule must match. The PEG notation is an integer, +N+, followed by an
# asterisk, followed by another integer, +M+, all of which follow any other
# expression, e.g.:
#
#     expr N*M
#
# In this notation +N+ specifies the minimum number of times the preceding
# expression must match and +M+ specifies the maximum. If +N+ is omitted,
# it is assumed to be 0. Likewise, if +M+ is omitted, it is assumed to be
# infinity (no maximum). Thus, an expression followed by only an asterisk may
# match any number of times, including zero.
#
# The shorthand notation <tt>+</tt> and <tt>?</tt> may be used for the common
# cases of <tt>1*</tt> and <tt>*1</tt> respectively, e.g.:
#
#     expr+
#     expr?
#
class Repeat
  include Predicate

  # Raises an ArgumentError if +min+ is greater than +max+.
  def initialize(min=1, max=Infinity, rule='')
    raise ArgumentError, "Min cannot be greater than max" if min > max
    @range = Range.new(min, max)
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made.
  def match(input, offset=0)
    matches = []
    os = offset
    while matches.length < @range.end
      m = input.match(rule, os)
      break unless m
      matches << m
      # A zero-width match never advances the offset, so every further
      # attempt would succeed in exactly the same way. Without an upper bound
      # that would loop forever; stop as soon as the minimum is satisfied.
      if m.length == 0 && @range.end == Infinity &&
          matches.length >= @range.begin
        break
      end
      os += m.length
    end
    create_match(matches, offset) if @range.include?(matches.length)
  end

  # Returns the operator this rule uses as a string. Will be one of
  # <tt>+</tt>, <tt>?</tt>, or <tt>N*M</tt>.
  def operator
    unless @operator
      # Bounds of 0 and Infinity are the defaults of the notation and are
      # therefore left out.
      m = [@range.begin, @range.end].map do |n|
        n == 0 || n == Infinity ? '' : n.to_s
      end
      @operator = case m
        when ['', '1'] then '?'
        when ['1', ''] then '+'
        else m.join('*')
        end
    end
    @operator
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule.embed + operator
  end
end
|
747
|
+
|
748
|
+
# A List is a Nonterminal that contains any number of other rules and tests
# them for matches in sequential order.
module List
  include Nonterminal

  # A list needs parentheses when embedded as soon as it holds more than one
  # rule.
  def paren?
    rules.length > 1
  end
end
|
757
|
+
|
758
|
+
# A Choice is a List where only one rule must match. The PEG notation is two
# or more expressions separated by a vertical bar, e.g.:
#
#     expr | expr
#
class Choice
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The alternatives are tried in order and the first
  # one that matches wins.
  def match(input, offset=0)
    rules.each do |rule|
      m = input.match(rule, offset)
      return create_match([m], offset) if m
    end
    nil
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map {|r| r.embed }.join(' | ')
  end
end
|
781
|
+
|
782
|
+
# A Sequence is a List where all rules must match. The PEG notation is two or
# more expressions separated by a space, e.g.:
#
#     expr expr
#
class Sequence
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. Each rule is tried where the previous one stopped.
  def match(input, offset=0)
    matches = []
    os = offset
    rules.each do |rule|
      m = input.match(rule, os)
      break unless m
      matches << m
      os += m.length
    end
    create_match(matches, offset) if matches.length == rules.length
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map {|r| r.embed }.join(' ')
  end
end
|
809
|
+
|
810
|
+
# The base class for all matches. Matches are organized into a tree where any
# match may contain any number of other matches. This class provides several
# convenient tree traversal methods that help when examining parse results.
class Match
  # Creates a match from +data+ at the given +offset+. A String becomes the
  # match's text, a MatchData supplies text and captures, and an Array is
  # taken as the list of sub-matches. Any other kind of data is ignored.
  def initialize(data, offset=0)
    case data
    when String
      @text = data
    when MatchData
      @text = data[0]
      @captures = data.captures
    when Array
      @matches = data
    end

    @offset = offset
  end

  # The name by which this match can be accessed from a parent match. This
  # will be the name of the rule that generated the match in most cases.
  # However, if the match is the result of a Label this will be the value of
  # the label.
  attr_accessor :name

  # The offset in the input at which this match occurred.
  attr_reader :offset

  # An array of all sub-matches of this match.
  def matches
    @matches ||= []
  end

  # An array of substrings returned by MatchData#captures if this match was
  # created by an Expression.
  def captures
    @captures ||= []
  end

  # Returns the raw text value of this match, which may simply be an
  # aggregate of the text of all sub-matches if this match is not #terminal?.
  def text
    @text ||= matches.map {|m| m.text }.join
  end

  alias to_s text

  # Returns the length of this match's #text value as an Integer.
  def length
    text.length
  end

  # Passes all arguments to the #text of this match.
  def [](*args)
    text[*args]
  end

  # Returns an array of all sub-matches with the given +name+. If +deep+ is
  # +false+, returns only sub-matches that are immediate descendants of this
  # match. Immediate sub-matches come first, followed by the results found
  # below each sub-match in order.
  def find(name, deep=true)
    sym = name.to_sym
    ms = matches.select {|m| sym == m.name }
    # Results are appended one child at a time; flattening an array of Match
    # objects would make Ruby probe each of them for #to_ary.
    matches.each {|m| ms.concat(m.find(name, deep)) } if deep
    ms
  end

  # A shortcut for retrieving the first immediate sub-match of this match. If
  # +name+ is given, attempts to retrieve the first immediate sub-match named
  # +name+.
  def first(name=nil)
    name.nil? ? matches.first : find(name, false).first
  end

  # Returns +true+ if this match has no descendants (was created from a
  # Terminal).
  def terminal?
    matches.length == 0
  end

  # Checks equality by comparing this match's #text value to +obj+.
  def ==(obj)
    text == obj
  end

  alias eql? ==

  # Uses #first to allow sub-matches of this match to be called by name as
  # instance methods. Raises a RuntimeError if there is no immediate
  # sub-match with that name.
  def method_missing(sym, *args)
    m = first(sym)
    return m if m
    raise 'No match named "%s" in %s (%s)' % [sym, self, name]
  end

  # Reports only the names of existing immediate sub-matches as callable.
  # This keeps Ruby's implicit conversion checks (e.g. for #to_ary or
  # #to_str) from being routed into #method_missing and raising.
  def respond_to_missing?(sym, include_private=false)
    !first(sym).nil? || super
  end
end
|
904
|
+
end
|