citrus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/Rakefile +67 -0
- data/citrus.gemspec +29 -0
- data/examples/calc.citrus +103 -0
- data/examples/calc.rb +95 -0
- data/examples/calc_sugar.rb +94 -0
- data/lib/citrus.rb +904 -0
- data/lib/citrus/debug.rb +37 -0
- data/lib/citrus/peg.rb +375 -0
- data/lib/citrus/sugar.rb +25 -0
- data/test/alias_test.rb +66 -0
- data/test/and_predicate_test.rb +27 -0
- data/test/calc_peg_test.rb +6 -0
- data/test/calc_sugar_test.rb +6 -0
- data/test/calc_test.rb +6 -0
- data/test/choice_test.rb +62 -0
- data/test/expression_test.rb +29 -0
- data/test/fixed_width_test.rb +37 -0
- data/test/grammar_test.rb +129 -0
- data/test/helper.rb +143 -0
- data/test/label_test.rb +26 -0
- data/test/match_test.rb +76 -0
- data/test/not_predicate_test.rb +27 -0
- data/test/peg_test.rb +663 -0
- data/test/repeat_test.rb +93 -0
- data/test/rule_test.rb +49 -0
- data/test/sequence_test.rb +53 -0
- data/test/super_test.rb +66 -0
- metadata +133 -0
data/lib/citrus.rb
ADDED
@@ -0,0 +1,904 @@
|
|
1
|
+
# Citrus is a compact and powerful parsing library for Ruby that combines the
|
2
|
+
# elegance and expressiveness of the language with the simplicity and power of
|
3
|
+
# parsing expression grammars.
|
4
|
+
#
|
5
|
+
# http://github.com/mjijackson/citrus
|
6
|
+
module Citrus
|
7
|
+
# Version components in the order they are joined by Citrus.version. Frozen so
# that no caller can mutate the shared constant.
VERSION = [1, 0, 0].freeze

# Positive floating point infinity; the default upper bound for repetitions.
Infinity = 1.0 / 0

# Registers citrus/peg to be required the first time PEG is referenced.
autoload :PEG, 'citrus/peg'
|
12
|
+
|
13
|
+
# Returns the current version of Citrus as a string.
|
14
|
+
# Builds the version string by joining each VERSION component with a dot.
def self.version
  VERSION.map { |part| part.to_s }.join('.')
end
|
17
|
+
|
18
|
+
# Loads the grammar from the given +file+ into the global scope using #eval.
|
19
|
+
# Reads the grammar file at +file+ and passes its contents to ::eval. When
# +file+ does not name an existing file, ".citrus" is appended to the path
# before it is checked again.
#
# The caller's string is never modified (it may be frozen or reused).
#
# Raises a RuntimeError if the resolved path is not an existing, readable
# file. Returns whatever ::eval returns.
def self.load(file)
  # Build a new path instead of appending to the argument in place.
  path = File.file?(file) ? file : "#{file}.citrus"
  raise "Cannot find file #{path}" unless File.file?(path)
  raise "Cannot read file #{path}" unless File.readable?(path)
  self.eval(File.read(path))
end
|
25
|
+
|
26
|
+
# Evaluates the given Citrus parsing expression grammar +code+ in the global
|
27
|
+
# scope. Returns an array of any grammar modules that were created.
|
28
|
+
# Parses +code+ with the PEG grammar and returns the #value of the resulting
# parse tree.
def self.eval(code)
  PEG.parse(code).value
end
|
32
|
+
|
33
|
+
# This error is raised whenever a parse fails.
|
34
|
+
# Raised when a parse fails. Derives from StandardError (rather than directly
# from Exception) so that an ordinary +rescue+ clause catches it; rescuing
# ParseError or Exception explicitly keeps working.
class ParseError < StandardError
  # Takes the +input+ object used for the failed parse. The message reports
  # the maximum offset reached and quotes at most the last 40 characters of
  # the input consumed up to that offset.
  def initialize(input)
    @input = input
    seen = consumed
    tail = seen[[0, seen.length - 40].max, seen.length]
    super("Failed to parse input at offset %d, just after %s\n" %
      [max_offset, tail.inspect])
  end

  # The Input object that was used for the parse.
  attr_reader :input

  # Returns the maximum offset that was reached before the error occurred.
  def max_offset
    input.max_offset
  end

  # Returns the portion of the input, from its start up to #max_offset, that
  # was consumed before the parse failed.
  def consumed
    input[0, max_offset]
  end
end
|
58
|
+
|
59
|
+
# Inclusion of this module into another extends the receiver with the grammar
|
60
|
+
# helper methods in GrammarMethods. Although this module does not actually
|
61
|
+
# provide any methods, constants, or variables to modules that include it, the
|
62
|
+
# mere act of inclusion provides a useful lookup mechanism to determine if a
|
63
|
+
# module is in fact a grammar.
|
64
|
+
module Grammar
  # Builds and returns a fresh anonymous module that includes Grammar.
  #
  # When a +block+ is supplied it is used to configure the new module: a block
  # whose +arity+ is 1 is called with the module as its argument, any other
  # block is +instance_eval+'d with the module as +self+. See
  # http://blog.grayproductions.net/articles/dsl_block_styles for the
  # rationale behind supporting both styles.
  #
  # Assigning the result to a constant gives the grammar a name, e.g.:
  #
  #   Calc = Grammar.new {}
  #
  def self.new(&block)
    mod = Module.new
    mod.module_eval { include Grammar }

    if block
      if block.arity == 1
        block.call(mod)
      else
        mod.instance_eval(&block)
      end
    end

    mod
  end

  # Hook invoked when a module includes Grammar: extends that module with
  # GrammarMethods and makes its +include+ method public.
  def self.included(mod)
    mod.extend(GrammarMethods)
    class << mod
      public :include
    end
  end
end
|
89
|
+
|
90
|
+
# Contains methods that are available to Grammar modules at the class level.
|
91
|
+
module GrammarMethods
  # Returns the name of this grammar as a string (whatever the inherited
  # +name+ returns, converted with +to_s+).
  def name
    super.to_s
  end

  # Returns an array of all grammars that have been included in this grammar
  # in the reverse order they were included.
  def included_grammars
    included_modules.select {|mod| mod.include?(Grammar) }
  end

  # Returns an array of all names of rules in this grammar as symbols ordered
  # in the same way they were defined (i.e. rules that were defined later
  # appear later in the array).
  def rule_names
    @rule_names ||= []
  end

  # Returns a hash of all Rule objects in this grammar, keyed by rule name.
  def rules
    @rules ||= {}
  end

  # Returns +true+ if this grammar has a rule with the given +name+.
  def has_rule?(name)
    rules.key?(name.to_sym)
  end

  # Walks the rule tree below +rule+ looking for Super rules and sets the
  # rule name of each one found to +name+.
  def setup_super(rule, name) # :nodoc:
    if Nonterminal === rule
      rule.rules.each {|r| setup_super(r, name) }
    elsif Super === rule
      rule.rule_name = name
    end
  end
  private :setup_super

  # Searches the included grammars of this grammar for a rule named +name+
  # and returns the first one found. Returns +nil+ on failure.
  def super_rule(name)
    sym = name.to_sym
    included_grammars.each do |g|
      r = g.rule(sym)
      return r if r
    end
    nil
  end

  # Gets/sets the rule with the given +name+. If +obj+ is given the rule
  # will be set to the value of +obj+ passed through Rule#create. If a block
  # is given, its return value will be used for the value of +obj+.
  #
  # When no rule named +name+ exists in this grammar the included grammars
  # are searched as well (see #super_rule).
  #
  # Any StandardError raised along the way is re-raised as a RuntimeError
  # whose message is prefixed with the rule name.
  def rule(name, obj=nil, &block)
    sym = name.to_sym

    # The block is captured explicitly: calling Proc.new without a block is
    # not supported by current Ruby versions.
    obj = block.call if block

    if obj
      rule_names << sym unless has_rule?(sym)

      rule = Rule.create(obj)
      rule.name = name
      setup_super(rule, name)
      rule.grammar = self

      rules[sym] = rule
    end

    rules[sym] || super_rule(sym)
  rescue => e
    raise "Cannot create rule \"#{name}\": " + e.message
  end

  # Gets/sets the +name+ of the root rule of this grammar.
  def root(name=nil)
    @root = name.to_sym if name
    # The first rule in a grammar is the default root.
    @root || rule_names.first
  end

  # Creates a new Super for the rule currently being defined in the grammar. A
  # block may be provided to specify semantic behavior (via #ext).
  def sup(&block)
    ext(Super.new, block)
  end

  # Creates a new AndPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def andp(rule, &block)
    ext(AndPredicate.new(rule), block)
  end

  # Creates a new NotPredicate using the given +rule+. A block may be provided
  # to specify semantic behavior (via #ext).
  def notp(rule, &block)
    ext(NotPredicate.new(rule), block)
  end

  # Creates a new Label using the given +rule+ and +label+. A block may be
  # provided to specify semantic behavior (via #ext).
  def label(rule, label, &block)
    ext(Label.new(label, rule), block)
  end

  # Creates a new Repeat using the given +rule+. +min+ and +max+ specify the
  # minimum and maximum number of times the rule must match. A block may be
  # provided to specify semantic behavior (via #ext).
  def rep(rule, min=1, max=Infinity, &block)
    ext(Repeat.new(min, max, rule), block)
  end

  # An alias for #rep.
  def one_or_more(rule, &block)
    rep(rule, &block)
  end

  # An alias for #rep with a minimum of 0.
  def zero_or_more(rule, &block)
    rep(rule, 0, &block)
  end

  # An alias for #rep with a minimum of 0 and a maximum of 1.
  def zero_or_one(rule, &block)
    rep(rule, 0, 1, &block)
  end

  # Creates a new Sequence using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def all(*args, &block)
    ext(Sequence.new(args), block)
  end

  # Creates a new Choice using all arguments. A block may be provided to
  # specify semantic behavior (via #ext).
  def any(*args, &block)
    ext(Choice.new(args), block)
  end

  # Passes +rule+ through Rule#create and, if +mod+ or a block is given,
  # assigns it to the rule's +ext+ (a block takes precedence over +mod+).
  # Returns the rule. See Rule#ext=.
  def ext(rule, mod=nil, &block)
    rule = Rule.create(rule)
    mod = block if block
    rule.ext = mod if mod
    rule
  end

  # Parses the given +string+ from the given +offset+ using the rules in this
  # grammar. A ParseError is raised if there is no match made or if
  # +consume_all+ is +true+ and the match does not reach the end of the
  # string. A RuntimeError is raised if the grammar has no usable root rule.
  def parse(string, offset=0, enable_memo=false, consume_all=true)
    raise "No root rule specified" unless root

    root_rule = rule(root)
    raise "No rule named \"#{root}\"" unless root_rule

    input = Input.new(string, enable_memo)
    match = input.match(root_rule, offset)

    # The match starts at +offset+, so it consumed the rest of the string
    # only when offset + length lands exactly on the end of the string.
    if !match || (consume_all && offset + match.length != string.length)
      raise ParseError.new(input)
    end

    match
  end
end
|
265
|
+
|
266
|
+
# This class represents the core of the parsing algorithm. It wraps the input
|
267
|
+
# string and serves matches to all nonterminals.
|
268
|
+
class Input
  # Wraps the +string+ that is to be parsed. When +enable_memo+ is true an
  # (initially empty) match cache and a hit counter are set up as well.
  def initialize(string, enable_memo=false)
    @string = string
    @max_offset = 0
    return unless enable_memo

    @cache = {}
    @cache_hits = 0
  end

  # The input string.
  attr_reader :string

  # The largest offset that #match has been asked to match at so far.
  attr_reader :max_offset

  # A two-level hash mapping rule ids, then offsets, to their respective
  # matches. Only present if memoing is enabled.
  attr_reader :cache

  # The number of times the cache was hit. Only present if memoing is enabled.
  attr_reader :cache_hits

  # Forwards all arguments to String#[] on this input's +string+.
  def [](*args)
    @string[*args]
  end

  # Returns the length of this input.
  def length
    @string.length
  end

  # Returns the result of matching +rule+ at +offset+. Without memoing the
  # rule is simply executed. With memoing enabled the result is looked up by
  # rule id and offset first; on a miss the rule is executed and its result
  # (including a failed match) is stored before being returned. See
  # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information
  # on memoing match results (also known as packrat parsing).
  def match(rule, offset=0)
    @max_offset = [@max_offset, offset].max
    return rule.match(self, offset) unless @cache

    by_offset = (@cache[rule.id] ||= {})
    if by_offset.key?(offset)
      @cache_hits += 1
      return by_offset[offset]
    end

    by_offset[offset] = rule.match(self, offset)
  end
end
|
325
|
+
|
326
|
+
# A Rule is an object that is used by a grammar to create matches on the
|
327
|
+
# Input during parsing.
|
328
|
+
module Rule
  # Converts +obj+ into a Rule. Rules are returned untouched; symbols become
  # an Alias, strings a FixedWidth, regular expressions an Expression, arrays
  # a Sequence, ranges a Choice over their elements and numbers a FixedWidth
  # of their string form. Anything else raises an ArgumentError.
  def self.create(obj)
    return obj if Rule === obj

    case obj
    when Symbol
      Alias.new(obj)
    when String
      FixedWidth.new(obj)
    when Regexp
      Expression.new(obj)
    when Array
      Sequence.new(obj)
    when Range
      Choice.new(obj.to_a)
    when Numeric
      FixedWidth.new(obj.to_s)
    else
      raise ArgumentError, "Invalid rule object: #{obj.inspect}"
    end
  end

  # Counter backing ::new_id.
  @uniq_id = 0

  # Increments the counter and returns its new value as a fresh rule id.
  def self.new_id
    @uniq_id += 1
  end

  # The grammar this rule belongs to.
  attr_accessor :grammar

  # The name of this rule.
  attr_reader :name

  # The module this rule uses to extend new matches.
  attr_reader :ext

  # An integer id that is unique to this rule. Assigned on first use.
  def id
    @id ||= Rule.new_id
  end

  # Sets the name of this rule (stored as a symbol).
  def name=(name)
    @name = name.to_sym
  end

  # Specifies a module that will be used to extend all Match objects that
  # result from this rule. A Proc is turned into an anonymous module by
  # using it as the module body.
  def ext=(mod)
    @ext = Proc === mod ? Module.new(&mod) : mod
  end

  # Returns +true+ if this rule is a Terminal.
  def terminal?
    is_a?(Terminal)
  end

  # Returns +true+ if this rule needs to be surrounded by parentheses when
  # using #embed. Always +false+ here.
  def paren?
    false
  end

  # Returns a string version of this rule that is suitable to be used in the
  # string representation of another rule: the rule's name when it has one,
  # otherwise its #to_s form, parenthesized if #paren? says so.
  def embed
    return name.to_s if name

    paren? ? '(%s)' % to_s : to_s
  end

  def inspect # :nodoc:
    to_s
  end

  private

  # Extends +match+ with this rule's #ext module, if one is set.
  def extend_match(match)
    match.extend(ext) if ext
  end

  # Builds a Match from +data+ at +offset+, extends it (see #extend_match),
  # names it after this rule and returns it.
  def create_match(data, offset)
    created = Match.new(data, offset)
    extend_match(created)
    created.name = name
    created
  end
end
|
412
|
+
|
413
|
+
# A Proxy is a Rule that is a placeholder for another rule. It stores the
|
414
|
+
# name of some other rule in the grammar internally and resolves it to the
|
415
|
+
# actual Rule object at runtime. This lazy evaluation permits us to create
|
416
|
+
# Proxy objects for rules that we may not know the definition of yet.
|
417
|
+
module Proxy
  include Rule

  # Takes the +name+ of the rule this proxy stands in for.
  def initialize(name='<proxy>')
    self.rule_name = name
  end

  # The name of this proxy's rule.
  attr_reader :rule_name

  # Sets the name of the rule this rule is proxy for (stored as a symbol).
  def rule_name=(name)
    @rule_name = name.to_sym
  end

  # Returns the underlying Rule for this proxy. It is resolved via #resolve!
  # on first use and remembered afterwards.
  def rule
    @rule ||= resolve!
  end

  # Returns the Match for this proxy's #rule on +input+ at the given +offset+,
  # +nil+ if no match can be made. A successful match is extended with this
  # proxy's extension module and, if this proxy has a name, renamed to it.
  def match(input, offset=0)
    found = input.match(rule, offset)
    return nil unless found

    extend_match(found)
    found.name = name if name
    found
  end
end
|
449
|
+
|
450
|
+
# An Alias is a Proxy for a rule in the same grammar. It is used in rule
|
451
|
+
# definitions when a rule calls some other rule by name. The PEG notation is
|
452
|
+
# simply the name of another rule without any other punctuation, e.g.:
|
453
|
+
#
|
454
|
+
# name
|
455
|
+
#
|
456
|
+
class Alias
  include Proxy

  # Returns the PEG notation of this rule as a string, i.e. the name of the
  # rule it refers to.
  def to_s
    rule_name.to_s
  end

  private

  # Asks this proxy's grammar for the rule named #rule_name (GrammarMethods#rule
  # also consults included grammars). Raises a RuntimeError if there is none.
  def resolve!
    target = grammar.rule(rule_name)
    return target if target

    raise RuntimeError, 'No rule named "%s" in grammar %s' %
      [rule_name, grammar.name]
  end
end
|
475
|
+
|
476
|
+
# A Super is a Proxy for a rule of the same name that was defined previously
|
477
|
+
# in the grammar's inheritance chain. Thus, Supers work like Ruby's +super+,
|
478
|
+
# only for rules in a grammar instead of methods in a module. The PEG notation
|
479
|
+
# is the word +super+ without any other punctuation, e.g.:
|
480
|
+
#
|
481
|
+
# super
|
482
|
+
#
|
483
|
+
class Super
  include Proxy

  # Returns the PEG notation of this rule as a string.
  def to_s
    'super'
  end

  private

  # Asks this proxy's grammar for the rule named #rule_name among its
  # included grammars (GrammarMethods#super_rule). Raises a RuntimeError if
  # there is none.
  def resolve!
    target = grammar.super_rule(rule_name)
    return target if target

    raise RuntimeError, 'No rule named "%s" in hierarchy of grammar %s' %
      [rule_name, grammar.name]
  end
end
|
502
|
+
|
503
|
+
# A Terminal is a Rule that matches directly on the input stream and may not
|
504
|
+
# contain any other rule.
|
505
|
+
module Terminal
  include Rule

  # The actual String or Regexp object this rule uses to match.
  attr_reader :rule

  # Stores the object to match with.
  def initialize(rule)
    @rule = rule
  end

  # Returns the PEG notation of this rule as a string: the #inspect form of
  # the underlying object.
  def to_s
    @rule.inspect
  end
end
|
520
|
+
|
521
|
+
# A FixedWidth is a Terminal that matches based on its length. The PEG
|
522
|
+
# notation is any sequence of characters enclosed in either single or double
|
523
|
+
# quotes, e.g.:
|
524
|
+
#
|
525
|
+
# 'expr'
|
526
|
+
# "expr"
|
527
|
+
#
|
528
|
+
class FixedWidth
  include Terminal

  # Takes the string to match. Raises an ArgumentError for anything that is
  # not a String.
  def initialize(rule='')
    unless rule.is_a?(String)
      raise ArgumentError, "FixedWidth must be a String"
    end
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The match is made when the slice of the input that
  # starts at +offset+ and is as long as the rule's string equals that
  # string; the match is created from a copy of the string.
  def match(input, offset=0)
    slice = input[offset, rule.length]
    return nil unless rule == slice

    create_match(rule.dup, offset)
  end
end
|
542
|
+
|
543
|
+
# An Expression is a Terminal that has the same semantics as a regular
|
544
|
+
# expression in Ruby. The expression must match at the beginning of the input
|
545
|
+
# (index 0). The PEG notation is identical to Ruby's regular expression
|
546
|
+
# notation, e.g.:
|
547
|
+
#
|
548
|
+
# /expr/
|
549
|
+
#
|
550
|
+
# Character classes and the dot symbol may also be used in PEG notation for
|
551
|
+
# compatibility with other PEG implementations, e.g.:
|
552
|
+
#
|
553
|
+
# [a-zA-Z]
|
554
|
+
# .
|
555
|
+
#
|
556
|
+
class Expression
  include Terminal

  # Takes the regular expression to match. Raises an ArgumentError for
  # anything that is not a Regexp.
  def initialize(rule=/^/)
    unless rule.is_a?(Regexp)
      raise ArgumentError, "Expression must be a Regexp"
    end
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The expression is applied to the part of the input
  # that starts at +offset+ and only counts when it matches at the very
  # beginning of that part.
  def match(input, offset=0)
    remainder = input[offset, input.length - offset]
    result = remainder.match(rule)
    return nil unless result && result.begin(0) == 0

    create_match(result, offset)
  end
end
|
571
|
+
|
572
|
+
# A Nonterminal is a Rule that augments the matching behavior of one or more
|
573
|
+
# other rules. Nonterminals may not match directly on the input, but instead
|
574
|
+
# invoke the rule(s) they contain to determine if a match can be made from
|
575
|
+
# the collective result.
|
576
|
+
module Nonterminal
  include Rule

  # An array of the actual Rule objects this rule uses to match.
  attr_reader :rules

  # Takes the objects this rule is built from; each one is converted with
  # Rule.create.
  def initialize(rules=[])
    @rules = rules.map { |obj| Rule.create(obj) }
  end

  # Assigns +grammar+ to every contained rule first and then to this rule.
  def grammar=(grammar)
    @rules.each { |child| child.grammar = grammar }
    super(grammar)
  end
end
|
591
|
+
|
592
|
+
# A Predicate is a Nonterminal that contains one other rule.
|
593
|
+
module Predicate
  include Nonterminal

  # Takes the single rule this predicate wraps.
  def initialize(rule='')
    super([rule])
  end

  # Returns the Rule object this rule uses to match (the only entry of
  # #rules).
  def rule
    rules.first
  end
end
|
605
|
+
|
606
|
+
# An AndPredicate is a Predicate that contains a rule that must match. Upon
|
607
|
+
# success an empty match is returned and no input is consumed. The PEG
|
608
|
+
# notation is any expression preceded by an ampersand, e.g.:
|
609
|
+
#
|
610
|
+
# &expr
|
611
|
+
#
|
612
|
+
class AndPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The returned match is created from an empty
  # string, whatever the wrapped rule matched.
  def match(input, offset=0)
    return nil unless input.match(rule, offset)

    create_match('', offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    '&' + rule.embed
  end
end
|
626
|
+
|
627
|
+
# A NotPredicate is a Predicate that contains a rule that must not match. Upon
|
628
|
+
# success an empty match is returned and no input is consumed. The PEG
|
629
|
+
# notation is any expression preceded by an exclamation mark, e.g.:
|
630
|
+
#
|
631
|
+
# !expr
|
632
|
+
#
|
633
|
+
class NotPredicate
  include Predicate

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The match (created from an empty string) is made
  # exactly when the wrapped rule fails to match.
  def match(input, offset=0)
    return nil if input.match(rule, offset)

    create_match('', offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    '!' + rule.embed
  end
end
|
647
|
+
|
648
|
+
# A Label is a Predicate that applies a new name to any matches made by its
|
649
|
+
# rule. The PEG notation is any sequence of word characters (i.e.
|
650
|
+
# <tt>[a-zA-Z0-9_]</tt>) followed by a colon, followed by any other
|
651
|
+
# expression, e.g.:
|
652
|
+
#
|
653
|
+
# label:expr
|
654
|
+
#
|
655
|
+
class Label
  include Predicate

  # The symbol this rule uses to re-name all its matches.
  attr_reader :label

  # Takes the +label+ (stored as a symbol) and the +rule+ it applies to.
  def initialize(label='<label>', rule='')
    @label = label.to_sym
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The wrapped rule is asked to match directly; a
  # successful match is extended with this rule's extension module and
  # re-named to the value of the label.
  def match(input, offset=0)
    found = rule.match(input, offset)
    return nil unless found

    extend_match(found)
    found.name = label
    found
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    [label.to_s, rule.embed].join(':')
  end
end
|
683
|
+
|
684
|
+
# A Repeat is a Predicate that specifies a minimum and maximum number of times
|
685
|
+
# its rule must match. The PEG notation is an integer, +N+, followed by an
|
686
|
+
# asterisk, followed by another integer, +M+, all of which follow any other
|
687
|
+
# expression, e.g.:
|
688
|
+
#
|
689
|
+
# expr N*M
|
690
|
+
#
|
691
|
+
# In this notation +N+ specifies the minimum number of times the preceding
|
692
|
+
# expression must match and +M+ specifies the maximum. If +N+ is omitted,
|
693
|
+
# it is assumed to be 0. Likewise, if +M+ is omitted, it is assumed to be
|
694
|
+
# infinity (no maximum). Thus, an expression followed by only an asterisk may
|
695
|
+
# match any number of times, including zero.
|
696
|
+
#
|
697
|
+
# The shorthand notation <tt>+</tt> and <tt>?</tt> may be used for the common
|
698
|
+
# cases of <tt>1*</tt> and <tt>*1</tt> respectively, e.g.:
|
699
|
+
#
|
700
|
+
# expr+
|
701
|
+
# expr?
|
702
|
+
#
|
703
|
+
class Repeat
  include Predicate

  # Takes the minimum and maximum number of times +rule+ must match. Raises
  # an ArgumentError if +min+ is greater than +max+.
  def initialize(min=1, max=Infinity, rule='')
    raise ArgumentError, "Min cannot be greater than max" if min > max
    @range = Range.new(min, max)
    super(rule)
  end

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The wrapped rule is matched repeatedly, each time
  # at the end of the previous match, until it fails or the maximum is
  # reached; a match is made if the number of repetitions lies within the
  # configured range.
  def match(input, offset=0)
    matches = []
    os = offset
    unbounded = @range.end == Infinity

    while matches.length < @range.end
      m = input.match(rule, os)
      break unless m
      matches << m

      # A match that consumes nothing leaves the offset where it was, so the
      # same rule would be matched at the same offset again on every
      # following pass. Without an upper bound that loop never ends, so stop
      # as soon as the minimum is satisfied. Bounded repeats are left alone:
      # they terminate at the maximum on their own.
      break if unbounded && m.length == 0 && matches.length >= @range.begin

      os += m.length
    end

    create_match(matches, offset) if @range.include?(matches.length)
  end

  # Returns the operator this rule uses as a string. Will be one of
  # <tt>+</tt>, <tt>?</tt>, or <tt>N*M</tt>, where a minimum of 0 and a
  # maximum of infinity are left out. The result is computed once.
  def operator
    unless @operator
      m = [@range.begin, @range.end].map do |n|
        n == 0 || n == Infinity ? '' : n.to_s
      end
      @operator = case m
        when ['', '1'] then '?'
        when ['1', ''] then '+'
        else m.join('*')
        end
    end
    @operator
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rule.embed + operator
  end
end
|
747
|
+
|
748
|
+
# A List is a Nonterminal that contains any number of other rules and tests
|
749
|
+
# them for matches in sequential order.
|
750
|
+
module List
  include Nonterminal

  # A list needs parentheses when embedded in another rule's notation as
  # soon as it contains more than one rule.
  def paren?
    rules.length > 1
  end
end
|
757
|
+
|
758
|
+
# A Choice is a List where only one rule must match. The PEG notation is two
|
759
|
+
# or more expressions separated by a vertical bar, e.g.:
|
760
|
+
#
|
761
|
+
# expr | expr
|
762
|
+
#
|
763
|
+
class Choice
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. The alternatives are tried in order and the first
  # one that matches wins; its match becomes the only sub-match of the
  # result.
  def match(input, offset=0)
    rules.each do |alternative|
      found = input.match(alternative, offset)
      next unless found

      return create_match([found], offset)
    end

    nil
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map { |alternative| alternative.embed }.join(' | ')
  end
end
|
781
|
+
|
782
|
+
# A Sequence is a List where all rules must match. The PEG notation is two or
|
783
|
+
# more expressions separated by a space, e.g.:
|
784
|
+
#
|
785
|
+
# expr expr
|
786
|
+
#
|
787
|
+
class Sequence
  include List

  # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
  # no match can be made. Each rule is matched where the previous one
  # stopped; the first failure makes the whole sequence fail.
  def match(input, offset=0)
    cursor = offset
    collected = []

    rules.each do |part|
      found = input.match(part, cursor)
      return nil unless found

      collected << found
      cursor += found.length
    end

    create_match(collected, offset)
  end

  # Returns the PEG notation of this rule as a string.
  def to_s
    rules.map { |part| part.embed }.join(' ')
  end
end
|
809
|
+
|
810
|
+
# The base class for all matches. Matches are organized into a tree where any
|
811
|
+
# match may contain any number of other matches. This class provides several
|
812
|
+
# convenient tree traversal methods that help when examining parse results.
|
813
|
+
class Match
  # Builds a match from +data+ found at +offset+. A String becomes the text
  # of the match, a MatchData supplies both text (the whole match) and
  # captures, and an Array is taken as the list of sub-matches.
  def initialize(data, offset=0)
    case data
    when String
      @text = data
    when MatchData
      @text = data[0]
      @captures = data.captures
    when Array
      @matches = data
    end

    @offset = offset
  end

  # The name by which this match can be accessed from a parent match. This
  # will be the name of the rule that generated the match in most cases.
  # However, if the match is the result of a Label this will be the value of
  # the label.
  attr_accessor :name

  # The offset in the input at which this match occurred.
  attr_reader :offset

  # An array of all sub-matches of this match.
  def matches
    @matches ||= []
  end

  # An array of substrings returned by MatchData#captures if this match was
  # created from a MatchData, empty otherwise.
  def captures
    @captures ||= []
  end

  # Returns the raw text value of this match. If no text was given at
  # creation it is the concatenated text of all sub-matches, computed once.
  def text
    @text ||= matches.map {|m| m.text }.join
  end

  alias to_s text

  # Returns the length of this match's #text value as an Integer.
  def length
    text.length
  end

  # Passes all arguments to String#[] on the #text of this match.
  def [](*args)
    text[*args]
  end

  # Returns an array of all sub-matches with the given +name+. If +deep+ is
  # +false+, returns only sub-matches that are immediate descendants of this
  # match. Immediate sub-matches come first in the result.
  def find(name, deep=true)
    sym = name.to_sym
    ms = matches.select {|m| sym == m.name }
    # flat_map joins the per-child result arrays without probing the Match
    # objects inside them for an array conversion.
    ms.concat(matches.flat_map {|m| m.find(sym, deep) }) if deep
    ms
  end

  # A shortcut for retrieving the first immediate sub-match of this match. If
  # +name+ is given, attempts to retrieve the first immediate sub-match named
  # +name+.
  def first(name=nil)
    name.nil? ? matches.first : find(name, false).first
  end

  # Returns +true+ if this match has no sub-matches.
  def terminal?
    matches.empty?
  end

  # Checks equality by comparing this match's #text value to +obj+, or to the
  # #text of +obj+ when it is a Match itself (so a match equals itself and
  # any other match with the same text).
  def ==(obj)
    text == (Match === obj ? obj.text : obj)
  end

  alias eql? ==

  # Hash value derived from #text, in line with #eql?.
  def hash
    text.hash
  end

  # Uses #first to allow immediate sub-matches of this match to be called by
  # name as instance methods. Raises a RuntimeError if there is no sub-match
  # with that name.
  def method_missing(sym, *args)
    m = first(sym)
    return m if m
    raise 'No match named "%s" in %s (%s)' % [sym, self, name]
  end

  # Reports the sub-match names served by #method_missing to #respond_to?.
  # This also keeps Ruby's implicit conversion probes (such as the +to_ary+
  # check made by Array#flatten) from ending up in #method_missing.
  def respond_to_missing?(sym, include_private=false)
    !first(sym).nil? || super
  end
end
|
904
|
+
end
|