ebnf 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ module EBNF::PEG
2
+ # Behavior for parsing a PEG rule
3
+ module Rule
4
+ ##
5
+ # Initialized by parser when loading rules.
6
+ # Used for finding rules and invoking elements of the parse process.
7
+ #
8
+ # @return [EBNF::PEG::Parser] parser
9
+ attr_accessor :parser
10
+
11
+ ##
12
+ # Parse a rule or terminal, invoking callbacks, as appropriate
13
+
14
+ # If there are `start_production` and/or `production`,
15
+ # they are invoked with a `prod_data` stack, the input stream and offset.
16
+ # Otherwise, the results are added as an array value
17
+ # to a hash indexed by the rule name.
18
+ #
19
+ # If matched, the input position is updated and the results returned in a Hash.
20
+ #
21
+ # * `alt`: returns the value of the matched production or `:unmatched`
22
+ # * `diff`: returns the string value matched, or `:unmatched`
23
+ # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
24
+ # * `opt`: returns the matched production, or `nil` if unmatched.
25
+ # * `plus`: returns an array of the matches for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
+ # * `range`: returns a string composed of the character matching the range, or `:unmatched`.
27
+ # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values.
28
+ # * `star`: returns an array of the matches for the specified production. For Terminals, these are concatenated into a single string.
29
+ # @param [Scanner] input
30
+ # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
31
+ def parse(input)
32
+ # Save position and linenumber for backtracking
33
+ pos, lineno = input.pos, input.lineno
34
+
35
+ parser.packrat[sym] ||= {}
36
+ if parser.packrat[sym][pos]
37
+ parser.debug("#{sym}(:memo)", lineno: lineno) { "#{parser.packrat[sym][pos].inspect}(@#{pos})"}
38
+ input.pos, input.lineno = parser.packrat[sym][pos][:pos], parser.packrat[sym][pos][:lineno]
39
+ return parser.packrat[sym][pos][:result]
40
+ end
41
+
42
+ if terminal?
43
+ # If the terminal is defined with a regular expression,
44
+ # use that to match the input,
45
+ # otherwise, fall through and evaluate the rule's expression below.
46
+ if regexp = parser.find_terminal_regexp(sym)
47
+ matched = input.scan(regexp)
48
+ result = (matched ? parser.onTerminal(sym, matched) : :unmatched)
49
+ # Update furthest failure for strings and terminals
50
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
51
+ parser.packrat[sym][pos] = {
52
+ pos: input.pos,
53
+ lineno: input.lineno,
54
+ result: result
55
+ }
56
+ return parser.packrat[sym][pos][:result]
57
+ end
58
+ else
59
+ eat_whitespace(input)
60
+ end
61
+ parser.onStart(sym)
62
+
63
+ result = case expr.first
64
+ when :alt
65
+ # Return the first expression to match.
66
+ # Result is either :unmatched, or the value of the matching rule
67
+ alt = :unmatched
68
+ expr[1..-1].each do |prod|
69
+ alt = case prod
70
+ when Symbol
71
+ rule = parser.find_rule(prod)
72
+ raise "No rule found for #{prod}" unless rule
73
+ rule.parse(input)
74
+ when String
75
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
76
+ end
77
+ if alt == :unmatched
78
+ # Update furthest failure for strings and terminals
79
+ parser.update_furthest_failure(input.pos, input.lineno, prod) if prod.is_a?(String) || rule.terminal?
80
+ else
81
+ break
82
+ end
83
+ end
84
+ alt
85
+ when :diff
86
+ # matches any string that matches A but does not match B.
87
+ # XXX: Should this work for arbitrary rules?
88
+ re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
89
+ matched = input.scan(re1)
90
+ if !matched || re2.match?(matched)
91
+ # Update furthest failure for terminals
92
+ parser.update_furthest_failure(input.pos, input.lineno, sym)
93
+ :unmatched
94
+ else
95
+ matched
96
+ end
97
+ when :hex
98
+ # Matches the given hex character if expression matches the character whose number (code point) in ISO/IEC 10646 is N. The number of leading zeros in the #xN form is insignificant.
99
+ input.scan(to_regexp) || begin
100
+ # Update furthest failure for terminals
101
+ parser.update_furthest_failure(input.pos, input.lineno, expr.last)
102
+ :unmatched
103
+ end
104
+ when :opt
105
+ # Always matches
106
+ opt = case prod = expr[1]
107
+ when Symbol
108
+ rule = parser.find_rule(prod)
109
+ raise "No rule found for #{prod}" unless rule
110
+ rule.parse(input)
111
+ when String
112
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
113
+ end
114
+ if opt == :unmatched
115
+ # Update furthest failure for terminals
116
+ parser.update_furthest_failure(input.pos, input.lineno, prod) if terminal?
117
+ nil
118
+ else
119
+ opt
120
+ end
121
+ when :plus
122
+ # Result is an array of all expressions while they match,
123
+ # at least one must match
124
+ prod, plus = expr[1], []
125
+ case prod
126
+ when Symbol
127
+ rule = parser.find_rule(prod)
128
+ raise "No rule found for #{prod}" unless rule
129
+ while (res = rule.parse(input)) != :unmatched
130
+ eat_whitespace(input)
131
+ plus << res
132
+ end
133
+ when String
134
+ while res = input.scan(Regexp.new(Regexp.quote(prod)))
135
+ eat_whitespace(input)
136
+ plus << res
137
+ end
138
+ end
139
+ # Update furthest failure for strings and terminals
140
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
141
+ plus.empty? ? :unmatched : (terminal? ? plus.compact.join("") : plus.compact)
142
+ when :range
143
+ # Matches the specified character range
144
+ input.scan(to_regexp) || begin
145
+ # Update furthest failure for strings and terminals
146
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1])
147
+ :unmatched
148
+ end
149
+ when :seq
150
+ # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
151
+ seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
152
+ eat_whitespace(input) unless accumulator.empty?
153
+ res = case prod
154
+ when Symbol
155
+ rule = parser.find_rule(prod)
156
+ raise "No rule found for #{prod}" unless rule
157
+ rule.parse(input)
158
+ when String
159
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
160
+ end
161
+ if res == :unmatched
162
+ # Update furthest failure for strings and terminals
163
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
164
+ break :unmatched
165
+ end
166
+ accumulator << {prod.to_sym => res}
167
+ end
168
+ seq == :unmatched ?
169
+ :unmatched :
170
+ (terminal? ?
171
+ seq.map(&:values).compact.join("") : # Concat values for terminal production
172
+ seq)
173
+ when :star
174
+ # Result is an array of all expressions while they match,
175
+ # an empty array if none match
176
+ prod, star = expr[1], []
177
+ case prod
178
+ when Symbol
179
+ rule = parser.find_rule(prod)
180
+ raise "No rule found for #{prod}" unless rule
181
+ while (res = rule.parse(input)) != :unmatched
182
+ eat_whitespace(input)
183
+ star << res
184
+ end
185
+ when String
186
+ while res = input.scan(Regexp.new(Regexp.quote(prod)))
187
+ eat_whitespace(input)
188
+ star << res
189
+ end
190
+ end
191
+ # Update furthest failure for strings and terminals
192
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
193
+ star.compact
194
+ else
195
+ raise "attempt to parse unknown rule type: #{expr.first}"
196
+ end
197
+
198
+ if result == :unmatched
199
+ input.pos, input.lineno = pos, lineno
200
+ end
201
+
202
+ result = parser.onFinish(result)
203
+ (parser.packrat[sym] ||= {})[pos] = {
204
+ pos: input.pos,
205
+ lineno: input.lineno,
206
+ result: result
207
+ }
208
+ return parser.packrat[sym][pos][:result]
209
+ end
210
+
211
+ ##
212
+ # Eat whitespace between non-terminal rules
213
+ def eat_whitespace(input)
214
+ if parser.whitespace.is_a?(Regexp)
215
+ # Eat whitespace before a non-terminal
216
+ input.skip(parser.whitespace)
217
+ elsif parser.whitespace.is_a?(Rule)
218
+ parser.whitespace.parse(input) # throw away result
219
+ end
220
+ end
221
+ end
222
+ end
@@ -1,7 +1,9 @@
1
+ require 'scanf'
2
+
1
3
  module EBNF
2
4
  # Represent individual parsed rules
3
5
  class Rule
4
- # Operations which are flattened to seprate rules in to_bnf
6
+ # Operations which are flattened to separate rules in to_bnf.
5
7
  BNF_OPS = %w{
6
8
  alt opt plus seq star
7
9
  }.map(&:to_sym).freeze
@@ -57,16 +59,16 @@ module EBNF
57
59
  # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
58
60
  attr_accessor :cleanup
59
61
 
60
- # @param [Integer] id
61
62
  # @param [Symbol] sym
63
+ # @param [Integer] id
62
64
  # @param [Array] expr
63
- # @param [Symbol] :kind
64
- # @param [String] :ebnf
65
- # @param [Array] :first
66
- # @param [Array] :follow
67
- # @param [Boolean] :start
68
- # @param [Rule] :top_rule
69
- # @param [Boolean] :cleanup
65
+ # @param [Symbol] kind (nil)
66
+ # @param [String] ebnf (nil)
67
+ # @param [Array] first (nil)
68
+ # @param [Array] follow (nil)
69
+ # @param [Boolean] start (nil)
70
+ # @param [Rule] top_rule (nil)
71
+ # @param [Boolean] cleanup (nil)
70
72
  def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
71
73
  @sym, @id = sym, id
72
74
  @expr = expr.is_a?(Array) ? expr : [:seq, expr]
@@ -87,7 +89,7 @@ module EBNF
87
89
  # (rule ebnf "1" (star (alt declaration rule)))
88
90
  # (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
89
91
  #
90
- # Also may have (first ...), (follow ...), or (start #t)
92
+ # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
91
93
  #
92
94
  # @param [Array] sxp
93
95
  # @return [Rule]
@@ -102,26 +104,28 @@ module EBNF
102
104
  start = sxp.any? {|e| e.is_a?(Array) && e.first.to_sym == :start}
103
105
  sym = sxp[1] if sxp[1].is_a?(Symbol)
104
106
  id = sxp[2] if sxp[2].is_a?(String)
105
- Rule.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
107
+ self.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
106
108
  end
107
109
 
108
110
  # Build a new rule creating a symbol and numbering from the current rule
109
- # Symbol and number creation is handled by the top-most rule in such a chain
111
+ # Symbol and number creation is handled by the top-most rule in such a chain.
110
112
  #
111
113
  # @param [Array] expr
114
+ # @param [Symbol] kind (nil)
115
+ # @param [Hash{Symbol => Symbol}] cleanup (nil)
112
116
  # @param [Hash{Symbol => Object}] options
113
- # @param [Symbol] :kind
114
117
  def build(expr, kind: nil, cleanup: nil, **options)
115
118
  new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
116
- Rule.new(new_sym, new_id, expr,
117
- kind: kind,
118
- ebnf: @ebnf,
119
- top_rule: (@top_rule || self),
120
- cleanup: cleanup,
121
- **options)
119
+ self.class.new(new_sym, new_id, expr,
120
+ kind: kind,
121
+ ebnf: @ebnf,
122
+ top_rule: (@top_rule || self),
123
+ cleanup: cleanup,
124
+ **options)
122
125
  end
123
126
 
124
- # Return representation for building S-Expressions
127
+ # Return representation for building S-Expressions.
128
+ #
125
129
  # @return [Array]
126
130
  def for_sxp
127
131
  elements = [kind, sym]
@@ -143,7 +147,8 @@ module EBNF
143
147
 
144
148
  alias_method :to_s, :to_sxp
145
149
 
146
- # Serializes this rule to an Turtle
150
+ # Serializes this rule to Turtle.
151
+ #
147
152
  # @return [String]
148
153
  def to_ttl
149
154
  @ebnf.debug("to_ttl") {inspect} if @ebnf
@@ -161,17 +166,24 @@ module EBNF
161
166
  "\n" + statements.join("\n")
162
167
  end
163
168
 
169
+ # Return a Ruby representation of this rule
170
+ # @return [String]
171
+ def to_ruby
172
+ "EBNF::Rule.new(#{sym.inspect}, #{id.inspect}, #{expr.inspect}#{', kind: ' + kind.inspect unless kind == :rule})"
173
+ end
174
+
164
175
  ##
165
176
  # Transform EBNF rule to BNF rules:
166
177
  #
167
- # * Transform (a [n] rule (op1 (op2))) into two rules:
168
- # (a [n] rule (op1 _a_1))
169
- # (_a_1 [n.1] rule (op2))
170
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
171
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
172
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
178
+ # * Transform (rule a "n" (op1 (op2))) into two rules:
179
+ # (rule a "n" (op1 _a_1))
180
+ # (rule _a_1 "n.1" (op2))
181
+ # * Transform (rule a (opt b)) into (rule a (alt _empty b))
182
+ # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
183
+ # * Transform (rule a (plus b)) into (rule a (seq b (star b)))
184
+ #
185
+ # Transformation includes information used to re-construct non-transformed.
173
186
  #
174
- # Transformation includes information used to re-construct non-transformed
175
187
  # AST representation
176
188
  # @return [Array<Rule>]
177
189
  def to_bnf
@@ -198,19 +210,19 @@ module EBNF
198
210
  new_rules = new_rules.map {|r| r.to_bnf}.flatten
199
211
  elsif expr.first == :opt
200
212
  this = dup
201
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
213
+ # * Transform (rule a (opt b)) into (rule a (alt _empty b))
202
214
  this.expr = [:alt, :_empty, expr.last]
203
215
  this.cleanup = :opt
204
216
  new_rules = this.to_bnf
205
217
  elsif expr.first == :star
206
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
218
+ # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
207
219
  this = dup
208
220
  this.cleanup = :star
209
221
  new_rule = this.build([:seq, expr.last, this.sym], cleanup: :merge)
210
222
  this.expr = [:alt, :_empty, new_rule.sym]
211
223
  new_rules = [this] + new_rule.to_bnf
212
224
  elsif expr.first == :plus
213
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
225
+ # * Transform (rule a (plus b)) into (rule a (seq b (star b)))
214
226
  this = dup
215
227
  this.cleanup = :plus
216
228
  this.expr = [:seq, expr.last, [:star, expr.last]]
@@ -230,8 +242,61 @@ module EBNF
230
242
  return new_rules
231
243
  end
232
244
 
245
+ ##
246
+ # Transform EBNF rule for PEG:
247
+ #
248
+ # * Transform (rule a "n" (op1 ... (op2 y) ...z)) into two rules:
249
+ # (rule a "n" (op1 ... _a_1 ... z))
250
+ # (rule _a_1 "n.1" (op2 y))
251
+ #
252
+ # @return [Array<Rule>]
253
+ def to_peg
254
+ new_rules = []
255
+
256
+ # Look for rules containing sub-sequences
257
+ if expr.any? {|e| e.is_a?(Array) && e.first.is_a?(Symbol)}
258
+ # duplicate ourselves for rewriting
259
+ this = dup
260
+ new_rules << this
261
+
262
+ expr.each_with_index do |e, index|
263
+ next unless e.is_a?(Array) && e.first.is_a?(Symbol)
264
+ new_rule = build(e)
265
+ this.expr[index] = new_rule.sym
266
+ new_rules << new_rule
267
+ end
268
+
269
+ # Return new rules after recursively applying #to_peg
270
+ new_rules = new_rules.map {|r| r.to_peg}.flatten
271
+ elsif [:diff, :hex, :range].include?(expr.first)
272
+ # These rules are fine, they just need to be terminals
273
+ raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
274
+ new_rules << self
275
+ else
276
+ new_rules << self
277
+ end
278
+
279
+ return new_rules.map {|r| r.extend(EBNF::PEG::Rule)}
280
+ end
281
+
282
+ ##
283
+ # For :hex or :range, create a regular expression.
284
+ #
285
+ # @return [Regexp]
286
+ def to_regexp
287
+ case expr.first
288
+ when :hex
289
+ Regexp.new(translate_codepoints(expr[1]))
290
+ when :range
291
+ Regexp.new("[#{translate_codepoints(expr[1])}]")
292
+ else
293
+ raise "Can't turn #{expr.inspect} into a regexp"
294
+ end
295
+ end
296
+
233
297
  # Return the non-terminals for this rule. For seq, this is the first
234
- # non-terminals in the seq. For alt, this is every non-terminal ni the alt
298
+ # non-terminal in the sequence. For alt, this is every non-terminal in the alt.
299
+ #
235
300
  # @param [Array<Rule>] ast
236
301
  # The set of rules, used to turn symbols into rules
237
302
  # @return [Array<Rule>]
@@ -248,7 +313,8 @@ module EBNF
248
313
  end
249
314
 
250
315
  # Return the terminals for this rule. For seq, this is the first
251
- # terminals or strings in the seq. For alt, this is every non-terminal ni the alt
316
+ # terminals or strings in the seq. For alt, this is every non-terminal in the alt.
317
+ #
252
318
  # @param [Array<Rule>] ast
253
319
  # The set of rules, used to turn symbols into rules
254
320
  # @return [Array<Rule>]
@@ -267,8 +333,9 @@ module EBNF
267
333
  end
268
334
 
269
335
  # Does this rule start with a sym? It does if expr is that sym,
270
- # expr starts with alt and contains that sym, or
271
- # expr starts with seq and the next element is that sym
336
+ # expr starts with alt and contains that sym,
337
+ # or expr starts with seq and the next element is that sym.
338
+ #
272
339
  # @param [Symbol, class] sym
273
340
  # Symbol matching any start element, or if it is String, any start element which is a String
274
341
  # @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
@@ -283,12 +350,14 @@ module EBNF
283
350
  end
284
351
 
285
352
  # Do the firsts of this rule include the empty string?
353
+ #
286
354
  # @return [Boolean]
287
355
  def first_includes_eps?
288
356
  @first && @first.include?(:_eps)
289
357
  end
290
358
 
291
- # Add terminal as proceding this rule
359
+ # Add terminal as proceding this rule.
360
+ #
292
361
  # @param [Array<Rule, Symbol, String>] terminals
293
362
  # @return [Integer] if number of terminals added
294
363
  def add_first(terminals)
@@ -313,6 +382,7 @@ module EBNF
313
382
  end
314
383
 
315
384
  # Is this a terminal?
385
+ #
316
386
  # @return [Boolean]
317
387
  def terminal?
318
388
  kind == :terminal
@@ -351,7 +421,8 @@ module EBNF
351
421
  ">"
352
422
  end
353
423
 
354
- # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}
424
+ # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
425
+ #
355
426
  # @param [Rule] other
356
427
  # @return [Boolean]
357
428
  def ==(other)
@@ -360,26 +431,12 @@ module EBNF
360
431
  expr == other.expr
361
432
  end
362
433
 
363
- # Two rules are equivalent if they have the same {#expr}
434
+ # Two rules are equivalent if they have the same {#expr}.
435
+ #
364
436
  # @param [Rule] other
365
437
  # @return [Boolean]
366
438
  def equivalent?(other)
367
- expr == other.expr
368
- end
369
-
370
- # Rewrite the rule substituting src_rule for dst_rule wherever
371
- # it is used in the production (first level only).
372
- # @param [Rule] src_rule
373
- # @param [Rule] dst_rule
374
- # @return [Rule]
375
- def rewrite(src_rule, dst_rule)
376
- case @expr
377
- when Array
378
- @expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e}
379
- else
380
- @expr = dst_rule.sym if @expr == src_rule.sym
381
- end
382
- self
439
+ expr == other.expr
383
440
  end
384
441
 
385
442
  # Rules compare using their ids
@@ -391,6 +448,12 @@ module EBNF
391
448
  end
392
449
  end
393
450
 
451
+ ##
452
+ # Utility function to translate code points of the form '#xN' into ruby unicode characters
453
+ def translate_codepoints(str)
454
+ str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
455
+ end
456
+
394
457
  private
395
458
  def ttl_expr(expr, pfx, depth, is_obj = true)
396
459
  indent = ' ' * depth
@@ -413,7 +476,7 @@ module EBNF
413
476
  statements << %{#{indent}#{bra}#{pfx}:#{op} }
414
477
  statements += ttl_expr(expr.first, pfx, depth + 1)
415
478
  statements << %{#{indent} #{ket}} unless ket.empty?
416
- when :_empty, :_eps, :_empty
479
+ when :_empty, :_eps
417
480
  statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
418
481
  when :"'"
419
482
  statements << %{#{indent}"#{esc(expr)}"}