ebnf 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,222 @@
1
+ module EBNF::PEG
2
+ # Behavior for parsing a PEG rule
3
+ module Rule
4
+ ##
5
+ # Initialized by parser when loading rules.
6
+ # Used for finding rules and invoking elements of the parse process.
7
+ #
8
+ # @return [EBNF::PEG::Parser] parser
9
+ attr_accessor :parser
10
+
11
+ ##
12
+ # Parse a rule or terminal, invoking callbacks, as appropriate
13
+
14
+ # If there are `start_production` and/or `production` callbacks,
15
+ # they are invoked with a `prod_data` stack, the input stream and offset.
16
+ # Otherwise, the results are added as an array value
17
+ # to a hash indexed by the rule name.
18
+ #
19
+ # If matched, the input position is updated and the results returned in a Hash.
20
+ #
21
+ # * `alt`: returns the value of the matched production or `:unmatched`
22
+ # * `diff`: returns the string value matched, or `:unmatched`
23
+ # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
24
+ # * `opt`: returns the matched production, or `nil` if unmatched.
25
+ # * `plus`: returns an array of the matches for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
+ # * `range`: returns a string composed of the character matching the range, or `:unmatched`.
27
+ # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values.
28
+ # * `star`: returns an array of the matches for the specified production. For Terminals, these are concatenated into a single string.
29
+ # @param [Scanner] input
30
+ # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
31
+ def parse(input)
32
+ # Save position and linenumber for backtracking
33
+ pos, lineno = input.pos, input.lineno
34
+
35
+ parser.packrat[sym] ||= {}
36
+ if parser.packrat[sym][pos]
37
+ parser.debug("#{sym}(:memo)", lineno: lineno) { "#{parser.packrat[sym][pos].inspect}(@#{pos})"}
38
+ input.pos, input.lineno = parser.packrat[sym][pos][:pos], parser.packrat[sym][pos][:lineno]
39
+ return parser.packrat[sym][pos][:result]
40
+ end
41
+
42
+ if terminal?
43
+ # If the terminal is defined with a regular expression,
44
+ # use that to match the input,
45
+ # otherwise, fall through and evaluate the terminal's expression below.
46
+ if regexp = parser.find_terminal_regexp(sym)
47
+ matched = input.scan(regexp)
48
+ result = (matched ? parser.onTerminal(sym, matched) : :unmatched)
49
+ # Update furthest failure for strings and terminals
50
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
51
+ parser.packrat[sym][pos] = {
52
+ pos: input.pos,
53
+ lineno: input.lineno,
54
+ result: result
55
+ }
56
+ return parser.packrat[sym][pos][:result]
57
+ end
58
+ else
59
+ eat_whitespace(input)
60
+ end
61
+ parser.onStart(sym)
62
+
63
+ result = case expr.first
64
+ when :alt
65
+ # Return the first expression to match.
66
+ # Result is either :unmatched, or the value of the matching rule
67
+ alt = :unmatched
68
+ expr[1..-1].each do |prod|
69
+ alt = case prod
70
+ when Symbol
71
+ rule = parser.find_rule(prod)
72
+ raise "No rule found for #{prod}" unless rule
73
+ rule.parse(input)
74
+ when String
75
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
76
+ end
77
+ if alt == :unmatched
78
+ # Update furthest failure for strings and terminals
79
+ parser.update_furthest_failure(input.pos, input.lineno, prod) if prod.is_a?(String) || rule.terminal?
80
+ else
81
+ break
82
+ end
83
+ end
84
+ alt
85
+ when :diff
86
+ # matches any string that matches A but does not match B.
87
+ # XXX: Should this work for arbitrary rules?
88
+ re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
89
+ matched = input.scan(re1)
90
+ if !matched || re2.match?(matched)
91
+ # Update furthest failure for terminals
92
+ parser.update_furthest_failure(input.pos, input.lineno, sym)
93
+ :unmatched
94
+ else
95
+ matched
96
+ end
97
+ when :hex
98
+ # Matches the given hex character if expression matches the character whose number (code point) in ISO/IEC 10646 is N. The number of leading zeros in the #xN form is insignificant.
99
+ input.scan(to_regexp) || begin
100
+ # Update furthest failure for terminals
101
+ parser.update_furthest_failure(input.pos, input.lineno, expr.last)
102
+ :unmatched
103
+ end
104
+ when :opt
105
+ # Always matches
106
+ opt = case prod = expr[1]
107
+ when Symbol
108
+ rule = parser.find_rule(prod)
109
+ raise "No rule found for #{prod}" unless rule
110
+ rule.parse(input)
111
+ when String
112
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
113
+ end
114
+ if opt == :unmatched
115
+ # Update furthest failure for terminals
116
+ parser.update_furthest_failure(input.pos, input.lineno, prod) if terminal?
117
+ nil
118
+ else
119
+ opt
120
+ end
121
+ when :plus
122
+ # Result is an array of all expressions while they match,
123
+ # at least one must match
124
+ prod, plus = expr[1], []
125
+ case prod
126
+ when Symbol
127
+ rule = parser.find_rule(prod)
128
+ raise "No rule found for #{prod}" unless rule
129
+ while (res = rule.parse(input)) != :unmatched
130
+ eat_whitespace(input)
131
+ plus << res
132
+ end
133
+ when String
134
+ while res = input.scan(Regexp.new(Regexp.quote(prod)))
135
+ eat_whitespace(input)
136
+ plus << res
137
+ end
138
+ end
139
+ # Update furthest failure for strings and terminals
140
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
141
+ plus.empty? ? :unmatched : (terminal? ? plus.compact.join("") : plus.compact)
142
+ when :range
143
+ # Matches the specified character range
144
+ input.scan(to_regexp) || begin
145
+ # Update furthest failure for strings and terminals
146
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1])
147
+ :unmatched
148
+ end
149
+ when :seq
150
+ # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
151
+ seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
152
+ eat_whitespace(input) unless accumulator.empty?
153
+ res = case prod
154
+ when Symbol
155
+ rule = parser.find_rule(prod)
156
+ raise "No rule found for #{prod}" unless rule
157
+ rule.parse(input)
158
+ when String
159
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
160
+ end
161
+ if res == :unmatched
162
+ # Update furthest failure for strings and terminals
163
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
164
+ break :unmatched
165
+ end
166
+ accumulator << {prod.to_sym => res}
167
+ end
168
+ seq == :unmatched ?
169
+ :unmatched :
170
+ (terminal? ?
171
+ seq.map(&:values).compact.join("") : # Concat values for terminal production
172
+ seq)
173
+ when :star
174
+ # Result is an array of all expressions while they match,
175
+ # an empty array if none match
176
+ prod, star = expr[1], []
177
+ case prod
178
+ when Symbol
179
+ rule = parser.find_rule(prod)
180
+ raise "No rule found for #{prod}" unless rule
181
+ while (res = rule.parse(input)) != :unmatched
182
+ eat_whitespace(input)
183
+ star << res
184
+ end
185
+ when String
186
+ while res = input.scan(Regexp.new(Regexp.quote(prod)))
187
+ eat_whitespace(input)
188
+ star << res
189
+ end
190
+ end
191
+ # Update furthest failure for strings and terminals
192
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
193
+ star.compact
194
+ else
195
+ raise "attempt to parse unknown rule type: #{expr.first}"
196
+ end
197
+
198
+ if result == :unmatched
199
+ input.pos, input.lineno = pos, lineno
200
+ end
201
+
202
+ result = parser.onFinish(result)
203
+ (parser.packrat[sym] ||= {})[pos] = {
204
+ pos: input.pos,
205
+ lineno: input.lineno,
206
+ result: result
207
+ }
208
+ return parser.packrat[sym][pos][:result]
209
+ end
210
+
211
+ ##
212
+ # Eat whitespace between non-terminal rules
213
+ def eat_whitespace(input)
214
+ if parser.whitespace.is_a?(Regexp)
215
+ # Eat whitespace before a non-terminal
216
+ input.skip(parser.whitespace)
217
+ elsif parser.whitespace.is_a?(Rule)
218
+ parser.whitespace.parse(input) # throw away result
219
+ end
220
+ end
221
+ end
222
+ end
@@ -1,7 +1,9 @@
1
+ require 'scanf'
2
+
1
3
  module EBNF
2
4
  # Represent individual parsed rules
3
5
  class Rule
4
- # Operations which are flattened to seprate rules in to_bnf
6
+ # Operations which are flattened to separate rules in to_bnf.
5
7
  BNF_OPS = %w{
6
8
  alt opt plus seq star
7
9
  }.map(&:to_sym).freeze
@@ -57,16 +59,16 @@ module EBNF
57
59
  # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
58
60
  attr_accessor :cleanup
59
61
 
60
- # @param [Integer] id
61
62
  # @param [Symbol] sym
63
+ # @param [Integer] id
62
64
  # @param [Array] expr
63
- # @param [Symbol] :kind
64
- # @param [String] :ebnf
65
- # @param [Array] :first
66
- # @param [Array] :follow
67
- # @param [Boolean] :start
68
- # @param [Rule] :top_rule
69
- # @param [Boolean] :cleanup
65
+ # @param [Symbol] kind (nil)
66
+ # @param [String] ebnf (nil)
67
+ # @param [Array] first (nil)
68
+ # @param [Array] follow (nil)
69
+ # @param [Boolean] start (nil)
70
+ # @param [Rule] top_rule (nil)
71
+ # @param [Boolean] cleanup (nil)
70
72
  def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
71
73
  @sym, @id = sym, id
72
74
  @expr = expr.is_a?(Array) ? expr : [:seq, expr]
@@ -87,7 +89,7 @@ module EBNF
87
89
  # (rule ebnf "1" (star (alt declaration rule)))
88
90
  # (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
89
91
  #
90
- # Also may have (first ...), (follow ...), or (start #t)
92
+ # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
91
93
  #
92
94
  # @param [Array] sxp
93
95
  # @return [Rule]
@@ -102,26 +104,28 @@ module EBNF
102
104
  start = sxp.any? {|e| e.is_a?(Array) && e.first.to_sym == :start}
103
105
  sym = sxp[1] if sxp[1].is_a?(Symbol)
104
106
  id = sxp[2] if sxp[2].is_a?(String)
105
- Rule.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
107
+ self.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
106
108
  end
107
109
 
108
110
  # Build a new rule creating a symbol and numbering from the current rule
109
- # Symbol and number creation is handled by the top-most rule in such a chain
111
+ # Symbol and number creation is handled by the top-most rule in such a chain.
110
112
  #
111
113
  # @param [Array] expr
114
+ # @param [Symbol] kind (nil)
115
+ # @param [Hash{Symbol => Symbol}] cleanup (nil)
112
116
  # @param [Hash{Symbol => Object}] options
113
- # @param [Symbol] :kind
114
117
  def build(expr, kind: nil, cleanup: nil, **options)
115
118
  new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
116
- Rule.new(new_sym, new_id, expr,
117
- kind: kind,
118
- ebnf: @ebnf,
119
- top_rule: (@top_rule || self),
120
- cleanup: cleanup,
121
- **options)
119
+ self.class.new(new_sym, new_id, expr,
120
+ kind: kind,
121
+ ebnf: @ebnf,
122
+ top_rule: (@top_rule || self),
123
+ cleanup: cleanup,
124
+ **options)
122
125
  end
123
126
 
124
- # Return representation for building S-Expressions
127
+ # Return representation for building S-Expressions.
128
+ #
125
129
  # @return [Array]
126
130
  def for_sxp
127
131
  elements = [kind, sym]
@@ -143,7 +147,8 @@ module EBNF
143
147
 
144
148
  alias_method :to_s, :to_sxp
145
149
 
146
- # Serializes this rule to an Turtle
150
+ # Serializes this rule to Turtle.
151
+ #
147
152
  # @return [String]
148
153
  def to_ttl
149
154
  @ebnf.debug("to_ttl") {inspect} if @ebnf
@@ -161,17 +166,24 @@ module EBNF
161
166
  "\n" + statements.join("\n")
162
167
  end
163
168
 
169
+ # Return a Ruby representation of this rule
170
+ # @return [String]
171
+ def to_ruby
172
+ "EBNF::Rule.new(#{sym.inspect}, #{id.inspect}, #{expr.inspect}#{', kind: ' + kind.inspect unless kind == :rule})"
173
+ end
174
+
164
175
  ##
165
176
  # Transform EBNF rule to BNF rules:
166
177
  #
167
- # * Transform (a [n] rule (op1 (op2))) into two rules:
168
- # (a [n] rule (op1 _a_1))
169
- # (_a_1 [n.1] rule (op2))
170
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
171
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
172
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
178
+ # * Transform (rule a "n" (op1 (op2))) into two rules:
179
+ # (rule a "n" (op1 _a_1))
180
+ # (rule _a_1 "n.1" (op2))
181
+ # * Transform (rule a (opt b)) into (rule a (alt _empty b))
182
+ # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
183
+ # * Transform (rule a (plus b)) into (rule a (seq b (star b)))
184
+ #
185
+ # Transformation includes information used to re-construct non-transformed.
173
186
  #
174
- # Transformation includes information used to re-construct non-transformed
175
187
  # AST representation
176
188
  # @return [Array<Rule>]
177
189
  def to_bnf
@@ -198,19 +210,19 @@ module EBNF
198
210
  new_rules = new_rules.map {|r| r.to_bnf}.flatten
199
211
  elsif expr.first == :opt
200
212
  this = dup
201
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
213
+ # * Transform (rule a (opt b)) into (rule a (alt _empty b))
202
214
  this.expr = [:alt, :_empty, expr.last]
203
215
  this.cleanup = :opt
204
216
  new_rules = this.to_bnf
205
217
  elsif expr.first == :star
206
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
218
+ # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
207
219
  this = dup
208
220
  this.cleanup = :star
209
221
  new_rule = this.build([:seq, expr.last, this.sym], cleanup: :merge)
210
222
  this.expr = [:alt, :_empty, new_rule.sym]
211
223
  new_rules = [this] + new_rule.to_bnf
212
224
  elsif expr.first == :plus
213
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
225
+ # * Transform (rule a (plus b)) into (rule a (seq b (star b)))
214
226
  this = dup
215
227
  this.cleanup = :plus
216
228
  this.expr = [:seq, expr.last, [:star, expr.last]]
@@ -230,8 +242,61 @@ module EBNF
230
242
  return new_rules
231
243
  end
232
244
 
245
+ ##
246
+ # Transform EBNF rule for PEG:
247
+ #
248
+ # * Transform (rule a "n" (op1 ... (op2 y) ...z)) into two rules:
249
+ # (rule a "n" (op1 ... _a_1 ... z))
250
+ # (rule _a_1 "n.1" (op2 y))
251
+ #
252
+ # @return [Array<Rule>]
253
+ def to_peg
254
+ new_rules = []
255
+
256
+ # Look for rules containing sub-sequences
257
+ if expr.any? {|e| e.is_a?(Array) && e.first.is_a?(Symbol)}
258
+ # duplicate ourselves for rewriting
259
+ this = dup
260
+ new_rules << this
261
+
262
+ expr.each_with_index do |e, index|
263
+ next unless e.is_a?(Array) && e.first.is_a?(Symbol)
264
+ new_rule = build(e)
265
+ this.expr[index] = new_rule.sym
266
+ new_rules << new_rule
267
+ end
268
+
269
+ # Return new rules after recursively applying #to_peg
270
+ new_rules = new_rules.map {|r| r.to_peg}.flatten
271
+ elsif [:diff, :hex, :range].include?(expr.first)
272
+ # These rules are fine, they just need to be terminals
273
+ raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
274
+ new_rules << self
275
+ else
276
+ new_rules << self
277
+ end
278
+
279
+ return new_rules.map {|r| r.extend(EBNF::PEG::Rule)}
280
+ end
281
+
282
+ ##
283
+ # For :hex or :range, create a regular expression.
284
+ #
285
+ # @return [Regexp]
286
+ def to_regexp
287
+ case expr.first
288
+ when :hex
289
+ Regexp.new(translate_codepoints(expr[1]))
290
+ when :range
291
+ Regexp.new("[#{translate_codepoints(expr[1])}]")
292
+ else
293
+ raise "Can't turn #{expr.inspect} into a regexp"
294
+ end
295
+ end
296
+
233
297
  # Return the non-terminals for this rule. For seq, this is the first
234
- # non-terminals in the seq. For alt, this is every non-terminal ni the alt
298
+ # non-terminal in the sequence. For alt, this is every non-terminal in the alt.
299
+ #
235
300
  # @param [Array<Rule>] ast
236
301
  # The set of rules, used to turn symbols into rules
237
302
  # @return [Array<Rule>]
@@ -248,7 +313,8 @@ module EBNF
248
313
  end
249
314
 
250
315
  # Return the terminals for this rule. For seq, this is the first
251
- # terminals or strings in the seq. For alt, this is every non-terminal ni the alt
316
+ # terminals or strings in the seq. For alt, this is every non-terminal in the alt.
317
+ #
252
318
  # @param [Array<Rule>] ast
253
319
  # The set of rules, used to turn symbols into rules
254
320
  # @return [Array<Rule>]
@@ -267,8 +333,9 @@ module EBNF
267
333
  end
268
334
 
269
335
  # Does this rule start with a sym? It does if expr is that sym,
270
- # expr starts with alt and contains that sym, or
271
- # expr starts with seq and the next element is that sym
336
+ # expr starts with alt and contains that sym,
337
+ # or expr starts with seq and the next element is that sym.
338
+ #
272
339
  # @param [Symbol, class] sym
273
340
  # Symbol matching any start element, or if it is String, any start element which is a String
274
341
  # @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
@@ -283,12 +350,14 @@ module EBNF
283
350
  end
284
351
 
285
352
  # Do the firsts of this rule include the empty string?
353
+ #
286
354
  # @return [Boolean]
287
355
  def first_includes_eps?
288
356
  @first && @first.include?(:_eps)
289
357
  end
290
358
 
291
- # Add terminal as proceding this rule
359
+ # Add terminal as proceding this rule.
360
+ #
292
361
  # @param [Array<Rule, Symbol, String>] terminals
293
362
  # @return [Integer] if number of terminals added
294
363
  def add_first(terminals)
@@ -313,6 +382,7 @@ module EBNF
313
382
  end
314
383
 
315
384
  # Is this a terminal?
385
+ #
316
386
  # @return [Boolean]
317
387
  def terminal?
318
388
  kind == :terminal
@@ -351,7 +421,8 @@ module EBNF
351
421
  ">"
352
422
  end
353
423
 
354
- # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}
424
+ # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
425
+ #
355
426
  # @param [Rule] other
356
427
  # @return [Boolean]
357
428
  def ==(other)
@@ -360,26 +431,12 @@ module EBNF
360
431
  expr == other.expr
361
432
  end
362
433
 
363
- # Two rules are equivalent if they have the same {#expr}
434
+ # Two rules are equivalent if they have the same {#expr}.
435
+ #
364
436
  # @param [Rule] other
365
437
  # @return [Boolean]
366
438
  def equivalent?(other)
367
- expr == other.expr
368
- end
369
-
370
- # Rewrite the rule substituting src_rule for dst_rule wherever
371
- # it is used in the production (first level only).
372
- # @param [Rule] src_rule
373
- # @param [Rule] dst_rule
374
- # @return [Rule]
375
- def rewrite(src_rule, dst_rule)
376
- case @expr
377
- when Array
378
- @expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e}
379
- else
380
- @expr = dst_rule.sym if @expr == src_rule.sym
381
- end
382
- self
439
+ expr == other.expr
383
440
  end
384
441
 
385
442
  # Rules compare using their ids
@@ -391,6 +448,12 @@ module EBNF
391
448
  end
392
449
  end
393
450
 
451
+ ##
452
+ # Utility function to translate code points of the form '#xN' into ruby unicode characters
453
+ def translate_codepoints(str)
454
+ str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
455
+ end
456
+
394
457
  private
395
458
  def ttl_expr(expr, pfx, depth, is_obj = true)
396
459
  indent = ' ' * depth
@@ -413,7 +476,7 @@ module EBNF
413
476
  statements << %{#{indent}#{bra}#{pfx}:#{op} }
414
477
  statements += ttl_expr(expr.first, pfx, depth + 1)
415
478
  statements << %{#{indent} #{ket}} unless ket.empty?
416
- when :_empty, :_eps, :_empty
479
+ when :_empty, :_eps
417
480
  statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
418
481
  when :"'"
419
482
  statements << %{#{indent}"#{esc(expr)}"}