ebnf 2.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,7 +31,7 @@ module EBNF
31
31
  def to_ruby_peg(output, **options)
32
32
  output.puts " RULES = ["
33
33
  ast.each do |rule|
34
- output.puts " " + rule.to_ruby + '.extend(EBNF::PEG::Rule),'
34
+ output.puts " " + rule.to_ruby + (rule.is_a?(EBNF::PEG::Rule) ? '.extend(EBNF::PEG::Rule)' : '') + ','
35
35
  end
36
36
  output.puts " ]"
37
37
  end
@@ -51,6 +51,7 @@ module EBNF::PEG
51
51
  # DSL for creating terminals and productions
52
52
  module ClassMethods
53
53
  def start_handlers; (@start_handlers ||= {}); end
54
+ def start_options; (@start_hoptions ||= {}); end
54
55
  def production_handlers; (@production_handlers ||= {}); end
55
56
  def terminal_handlers; (@terminal_handlers ||= {}); end
56
57
  def terminal_regexps; (@terminal_regexps ||= {}); end
@@ -97,6 +98,10 @@ module EBNF::PEG
97
98
  #
98
99
  # @param [Symbol] term
99
100
  # The rule name
101
+ # @param [Hash{Symbol => Object}] options
102
+ # Options which are returned from {Parser#onStart}.
103
+ # @option options [Boolean] :as_hash (false)
104
+ # If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
100
105
  # @yield [data, block]
101
106
  # @yieldparam [Hash] data
102
107
  # A Hash defined for the current production, during :start
@@ -106,8 +111,9 @@ module EBNF::PEG
106
111
  # Block passed to initialization for yielding to calling parser.
107
112
  # Should conform to the yield specs for #initialize
108
113
  # Yield to generate a triple
109
- def start_production(term, &block)
114
+ def start_production(term, **options, &block)
110
115
  start_handlers[term] = block
116
+ start_options[term] = options.freeze
111
117
  end
112
118
 
113
119
  ##
@@ -204,6 +210,7 @@ module EBNF::PEG
204
210
  @whitespace = case options[:whitespace]
205
211
  when Regexp then options[:whitespace]
206
212
  when Symbol then @rules[options[:whitespace]]
213
+ else options[:whitespace]
207
214
  end ||
208
215
  @rules.values.detect(&:pass?) ||
209
216
  /(?:\s|(?:#[^x][^\n\r]*))+/m.freeze
@@ -329,19 +336,30 @@ module EBNF::PEG
329
336
  # @option options [Integer] :depth
330
337
  # Recursion depth for indenting output
331
338
  # @yieldreturn [String] additional string appended to `message`.
332
- def debug(*args)
339
+ def debug(*args, &block)
333
340
  return unless @options[:logger]
334
341
  options = args.last.is_a?(Hash) ? args.pop : {}
335
342
  lineno = options[:lineno] || (scanner.lineno if scanner)
336
343
  level = options.fetch(:level, 0)
337
-
338
344
  depth = options[:depth] || self.depth
339
- args << yield if block_given?
340
- @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
345
+
346
+ if self.respond_to?(:log_debug)
347
+ level = [:debug, :info, :warn, :error, :fatal][level]
348
+ log_debug(*args, **options.merge(level: level, lineno: lineno, depth: depth), &block)
349
+ elsif @options[:logger].respond_to?(:add)
350
+ args << yield if block_given?
351
+ @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
352
+ elsif @options[:logger].respond_to?(:<<)
353
+ args << yield if block_given?
354
+ @options[:logger] << "[#{lineno}]" + (" " * depth) + args.join(" ")
355
+ end
341
356
  end
342
357
 
343
358
  # Start for production
344
359
  # Adds data available during the processing of the production
360
+ #
361
+ # @return [Hash] composed of production options. Currently only `as_hash` is supported.
362
+ # @see ClassMethods#start_production
345
363
  def onStart(prod)
346
364
  handler = self.class.start_handlers[prod]
347
365
  @productions << prod
@@ -367,6 +385,7 @@ module EBNF::PEG
367
385
  # explicit start handler
368
386
  @prod_data << {}
369
387
  end
388
+ return self.class.start_options.fetch(prod, {}) # any options on this production
370
389
  end
371
390
 
372
391
  # Finish of production
@@ -18,14 +18,15 @@ module EBNF::PEG
18
18
  #
19
19
  # If matched, the input position is updated and the results returned in a Hash.
20
20
  #
21
- # * `alt`: returns the value of the matched production or `:unmatched`
22
- # * `diff`: returns the string value matched, or `:unmatched`
21
+ # * `alt`: returns the value of the matched production or `:unmatched`.
22
+ # * `diff`: returns the value matched, or `:unmatched`.
23
23
  # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
24
- # * `opt`: returns the matched production, or `nil` if unmatched.
25
- # * `plus`: returns an array of the matches for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
- # * `range`: returns a string composed of the character matching the range, or `:unmatched`.
27
- # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values.
28
- # * `star`: returns an array of the matches for the specified production.For Terminals, these are concatenated into a single string.
24
+ # * `opt`: returns the value matched, or `nil` if unmatched.
25
+ # * `plus`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
+ # * `range`: returns a string composed of the values matched, or `:unmatched`, if less than `min` are matched.
27
+ # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values. Via option in a `production` or definition, the result can be a single hash with values for each matched production; note that this is not always possible due to the possibility of repeated productions within the sequence.
28
+ # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string.
29
+ #
29
30
  # @param [Scanner] input
30
31
  # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
31
32
  def parse(input)
@@ -45,7 +46,7 @@ module EBNF::PEG
45
46
  # otherwise,
46
47
  if regexp = parser.find_terminal_regexp(sym)
47
48
  matched = input.scan(regexp)
48
- result = (matched ? parser.onTerminal(sym, matched) : :unmatched)
49
+ result = parser.onTerminal(sym, (matched ? matched : :unmatched))
49
50
  # Update furthest failure for strings and terminals
50
51
  parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
51
52
  parser.packrat[sym][pos] = {
@@ -58,7 +59,7 @@ module EBNF::PEG
58
59
  else
59
60
  eat_whitespace(input)
60
61
  end
61
- parser.onStart(sym)
62
+ start_options = parser.onStart(sym)
62
63
 
63
64
  result = case expr.first
64
65
  when :alt
@@ -84,7 +85,8 @@ module EBNF::PEG
84
85
  alt
85
86
  when :diff
86
87
  # matches any string that matches A but does not match B.
87
- # XXX: Should this work for arbitrary rules?
88
+ # (Note, this is only used for Terminal rules, non-terminals will use :not)
89
+ raise "Diff used on non-terminal #{prod}" unless terminal?
88
90
  re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
89
91
  matched = input.scan(re1)
90
92
  if !matched || re2.match?(matched)
@@ -101,9 +103,9 @@ module EBNF::PEG
101
103
  parser.update_furthest_failure(input.pos, input.lineno, expr.last)
102
104
  :unmatched
103
105
  end
104
- when :opt
105
- # Always matches
106
- opt = case prod = expr[1]
106
+ when :not
107
+ # matches any string that does not match B.
108
+ res = case prod = expr[1]
107
109
  when Symbol
108
110
  rule = parser.find_rule(prod)
109
111
  raise "No rule found for #{prod}" unless rule
@@ -111,35 +113,29 @@ module EBNF::PEG
111
113
  when String
112
114
  input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
113
115
  end
114
- if opt == :unmatched
116
+ if res != :unmatched
115
117
  # Update furthest failure for terminals
116
- parser.update_furthest_failure(input.pos, input.lineno, prod) if terminal?
117
- nil
118
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if terminal?
119
+ :unmatched
118
120
  else
119
- opt
121
+ nil
120
122
  end
123
+ when :opt
124
+ # Result is the matched value or nil
125
+ opt = rept(input, 0, 1, expr[1])
126
+
127
+ # Update furthest failure for strings and terminals
128
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
129
+ opt.first
121
130
  when :plus
122
131
  # Result is an array of all expressions while they match,
123
132
  # at least one must match
124
- prod, plus = expr[1], []
125
- case prod
126
- when Symbol
127
- rule = parser.find_rule(prod)
128
- raise "No rule found for #{prod}" unless rule
129
- while (res = rule.parse(input)) != :unmatched
130
- eat_whitespace(input)
131
- plus << res
132
- end
133
- when String
134
- while res = input.scan(Regexp.new(Regexp.quote(prod)))
135
- eat_whitespace(input)
136
- plus << res
137
- end
138
- end
133
+ plus = rept(input, 1, '*', expr[1])
134
+
139
135
  # Update furthest failure for strings and terminals
140
- parser.update_furthest_failure(input.pos, input.lineno, prod)
141
- plus.empty? ? :unmatched : (terminal? ? plus.compact.join("") : plus.compact)
142
- when :range
136
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
137
+ plus.is_a?(Array) && terminal? ? plus.join("") : plus
138
+ when :range, :istr
143
139
  # Matches the specified character range
144
140
  input.scan(to_regexp) || begin
145
141
  # Update furthest failure for strings and terminals
@@ -149,7 +145,7 @@ module EBNF::PEG
149
145
  when :seq
150
146
  # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
151
147
  seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
152
- eat_whitespace(input) unless accumulator.empty?
148
+ eat_whitespace(input) unless accumulator.empty? || terminal?
153
149
  res = case prod
154
150
  when Symbol
155
151
  rule = parser.find_rule(prod)
@@ -165,32 +161,23 @@ module EBNF::PEG
165
161
  end
166
162
  accumulator << {prod.to_sym => res}
167
163
  end
168
- seq == :unmatched ?
169
- :unmatched :
170
- (terminal? ?
171
- seq.map(&:values).compact.join("") : # Concat values for terminal production
172
- seq)
164
+ if seq == :unmatched
165
+ :unmatched
166
+ elsif terminal?
167
+ seq.map(&:values).compact.join("") # Concat values for terminal production
168
+ elsif start_options[:as_hash]
169
+ seq.inject {|memo, h| memo.merge(h)}
170
+ else
171
+ seq
172
+ end
173
173
  when :star
174
174
  # Result is an array of all expressions while they match,
175
175
  # an empty array if none match
176
- prod, star = expr[1], []
177
- case prod
178
- when Symbol
179
- rule = parser.find_rule(prod)
180
- raise "No rule found for #{prod}" unless rule
181
- while (res = rule.parse(input)) != :unmatched
182
- eat_whitespace(input)
183
- star << res
184
- end
185
- when String
186
- while res = input.scan(Regexp.new(Regexp.quote(prod)))
187
- eat_whitespace(input)
188
- star << res
189
- end
190
- end
176
+ star = rept(input, 0, '*', expr[1])
177
+
191
178
  # Update furthest failure for strings and terminals
192
- parser.update_furthest_failure(input.pos, input.lineno, prod)
193
- star.compact
179
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
180
+ star.is_a?(Array) && terminal? ? star.join("") : star
194
181
  else
195
182
  raise "attempt to parse unknown rule type: #{expr.first}"
196
183
  end
@@ -208,6 +195,38 @@ module EBNF::PEG
208
195
  return parser.packrat[sym][pos][:result]
209
196
  end
210
197
 
198
+ ##
199
+ # Repetition, 0-1, 0-n, 1-n, ...
200
+ #
201
+ # Note, nil results are removed from the result, but count towards min/max calculations
202
+ #
203
+ # @param [Scanner] input
204
+ # @param [Integer] min
205
+ # @param [Integer] max
206
+ # If it is an integer, it stops matching after max entries; the string `'*'` means there is no upper bound.
207
+ # @param [Symbol, String] prod
208
+ # @return [:unmatched, Array]
209
+ def rept(input, min, max, prod)
210
+ result = []
211
+
212
+ case prod
213
+ when Symbol
214
+ rule = parser.find_rule(prod)
215
+ raise "No rule found for #{prod}" unless rule
216
+ while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched
217
+ eat_whitespace(input) unless terminal?
218
+ result << res
219
+ end
220
+ when String
221
+ while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
222
+ eat_whitespace(input) unless terminal?
223
+ result << res
224
+ end
225
+ end
226
+
227
+ result.length < min ? :unmatched : result.compact
228
+ end
229
+
211
230
  ##
212
231
  # Eat whitespace between non-terminal rules
213
232
  def eat_whitespace(input)
@@ -1,17 +1,33 @@
1
1
  require 'scanf'
2
+ require 'strscan'
2
3
 
3
4
  module EBNF
4
5
  # Represent individual parsed rules
5
6
  class Rule
6
7
  # Operations which are flattened to separate rules in to_bnf.
7
8
  BNF_OPS = %w{
8
- alt opt plus seq star
9
+ alt diff not opt plus rept seq star
9
10
  }.map(&:to_sym).freeze
10
11
 
11
12
  TERM_OPS = %w{
12
- diff hex range
13
+ hex istr range
13
14
  }.map(&:to_sym).freeze
14
15
 
16
+ # The number of arguments expected per operator. `nil` for unspecified
17
+ OP_ARGN = {
18
+ alt: nil,
19
+ diff: 2,
20
+ hex: 1,
21
+ istr: 1,
22
+ not: 1,
23
+ opt: 1,
24
+ plus: 1,
25
+ range: 1,
26
+ rept: 3,
27
+ seq: nil,
28
+ star: 1
29
+ }
30
+
15
31
  # Symbol of rule
16
32
  #
17
33
  # @return [Symbol]
@@ -28,7 +44,7 @@ module EBNF
28
44
 
29
45
  # Kind of rule
30
46
  #
31
- # @return [:rule, :terminal, or :pass]
47
+ # @return [:rule, :terminal, :terminals, or :pass]
32
48
  attr_accessor :kind
33
49
 
34
50
  # Rule expression
@@ -59,19 +75,38 @@ module EBNF
59
75
  # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
60
76
  attr_accessor :cleanup
61
77
 
62
- # @param [Symbol] sym
63
- # @param [Integer] id
78
+ # @param [Symbol, nil] sym
79
+ # `nil` is allowed only for @pass or @terminals
80
+ # @param [String, nil] id
64
81
  # @param [Array] expr
65
- # @param [Symbol] kind (nil)
82
+ # The expression is an internal-representation of an S-Expression with one of the following operators:
83
+ #
84
+ # * `alt` – A list of alternative rules, which are attempted in order. It terminates with the first matching rule, or is terminated as unmatched, if no such rule is found.
85
+ # * `diff` – matches any string that matches `A` but does not match `B`.
86
+ # * `hex` – A single character represented using the hexadecimal notation `#xnn`.
87
+ # * `istr` – A string which matches in a case-insensitive manner, so that `(istr "fOo")` will match either of the strings `"foo"`, `"FOO"` or any other combination.
88
+ # * `opt` – An optional rule or terminal. It either results in the matching rule or returns `nil`.
89
+ # * `plus` – A sequence of one or more of the matching rule. If there is no such rule, it is terminated as unmatched; otherwise, the result is an array containing all matched input.
90
+ # * `range` – A range of characters, possibly repeated, of the form `(range "a-z")`. May also use hexadecimal notation.
91
+ # * `rept m n` – A sequence of at least `m` and at most `n` of the matching rule. It will always return an array.
92
+ # * `seq` – A sequence of rules or terminals. If any (other than `opt` or `star`) do not parse, the rule is terminated as unmatched.
93
+ # * `star` – A sequence of zero or more of the matching rule. It will always return an array.
94
+ # @param [:rule, :terminal, :terminals, :pass] kind (nil)
66
95
  # @param [String] ebnf (nil)
96
+ # When parsing, records the EBNF string used to create the rule.
67
97
  # @param [Array] first (nil)
98
+ # Recorded set of terminals that can precede this rule (LL(1))
68
99
  # @param [Array] follow (nil)
100
+ # Recorded set of terminals that can follow this rule (LL(1))
69
101
  # @param [Boolean] start (nil)
102
+ # Is this the starting rule for the grammar?
70
103
  # @param [Rule] top_rule (nil)
104
+ # The top-most rule. All expressed rules are top-rules, derived rules have the original rule as their top-rule.
71
105
  # @param [Boolean] cleanup (nil)
106
+ # Records information useful for cleaning up converted :plus, and :star expansions (LL(1)).
72
107
  def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
73
108
  @sym, @id = sym, id
74
- @expr = expr.is_a?(Array) ? expr : [:seq, expr]
109
+ @expr = expr.is_a?(Array) ? expr : [:seq, expr].compact
75
110
  @ebnf, @kind, @first, @follow, @start, @cleanup, @top_rule = ebnf, kind, first, follow, start, cleanup, top_rule
76
111
  @top_rule ||= self
77
112
  @kind ||= case
@@ -79,21 +114,53 @@ module EBNF
79
114
  when !BNF_OPS.include?(@expr.first) then :terminal
80
115
  else :rule
81
116
  end
117
+
118
+ # Allow @pass and @terminals to not be named
119
+ @sym ||= :_pass if @kind == :pass
120
+ @sym ||= :_terminals if @kind == :terminals
121
+
122
+ raise ArgumentError, "Rule sym must be a symbol, was #{@sym.inspect}" unless @sym.is_a?(Symbol)
123
+ raise ArgumentError, "Rule id must be a string or nil, was #{@id.inspect}" unless (@id || "").is_a?(String)
124
+ raise ArgumentError, "Rule kind must be one of :rule, :terminal, :terminals, or :pass, was #{@kind.inspect}" unless
125
+ @kind.is_a?(Symbol) && %w(rule terminal terminals pass).map(&:to_sym).include?(@kind)
126
+
127
+ case @expr.first
128
+ when :alt
129
+ raise ArgumentError, "#{@expr.first} operation must have at least one operand, had #{@expr.length - 1}" unless @expr.length > 1
130
+ when :diff
131
+ raise ArgumentError, "#{@expr.first} operation must have exactly two operands, had #{@expr.length - 1}" unless @expr.length == 3
132
+ when :hex, :istr, :not, :opt, :plus, :range, :star
133
+ raise ArgumentError, "#{@expr.first} operation must have exactly one operand, had #{@expr.length - 1}" unless @expr.length == 2
134
+ when :rept
135
+ raise ArgumentError, "#{@expr.first} operation must have exactly three, had #{@expr.length - 1}" unless @expr.length == 4
136
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer minimum, was #{@expr[1]}" unless
137
+ @expr[1].is_a?(Integer) && @expr[1] >= 0
138
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer maximum or '*', was #{@expr[2]}" unless
139
+ @expr[2] == '*' || @expr[2].is_a?(Integer) && @expr[2] >= 0
140
+ when :seq
141
+ # It's legal to have a zero-length sequence
142
+ else
143
+ raise ArgumentError, "Rule expression must be an array using a known operator, was #{@expr.first}"
144
+ end
82
145
  end
83
146
 
84
147
  ##
85
148
  # Return a rule from its SXP representation:
86
149
  #
87
150
  # @example inputs
88
- # (pass (plus (range "#x20\\t\\r\\n")))
151
+ # (pass _pass (plus (range "#x20\\t\\r\\n")))
89
152
  # (rule ebnf "1" (star (alt declaration rule)))
90
- # (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
153
+ # (terminal R_CHAR "19" (diff CHAR (alt "]" "-")))
91
154
  #
92
155
  # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
93
156
  #
94
- # @param [Array] sxp
157
+ # @param [String, Array] sxp
95
158
  # @return [Rule]
96
159
  def self.from_sxp(sxp)
160
+ if sxp.is_a?(String)
161
+ require 'sxp' unless defined?(SXP)
162
+ sxp = SXP.parse(sxp)
163
+ end
97
164
  expr = sxp.detect {|e| e.is_a?(Array) && ![:first, :follow, :start].include?(e.first.to_sym)}
98
165
  first = sxp.detect {|e| e.is_a?(Array) && e.first.to_sym == :first}
99
166
  first = first[1..-1] if first
@@ -115,11 +182,11 @@ module EBNF
115
182
  # @param [Hash{Symbol => Symbol}] cleanup (nil)
116
183
  # @param [Hash{Symbol => Object}] options
117
184
  def build(expr, kind: nil, cleanup: nil, **options)
118
- new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
185
+ new_sym, new_id = @top_rule.send(:make_sym_id)
119
186
  self.class.new(new_sym, new_id, expr,
120
187
  kind: kind,
121
188
  ebnf: @ebnf,
122
- top_rule: (@top_rule || self),
189
+ top_rule: @top_rule,
123
190
  cleanup: cleanup,
124
191
  **options)
125
192
  end
@@ -152,15 +219,16 @@ module EBNF
152
219
  # @return [String]
153
220
  def to_ttl
154
221
  @ebnf.debug("to_ttl") {inspect} if @ebnf
155
- comment = orig.to_s.strip.
156
- gsub(/"""/, '\"\"\"').
157
- gsub("\\", "\\\\").
158
- sub(/^\"/, '\"').
159
- sub(/\"$/m, '\"')
160
- statements = [
161
- %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
162
- %{ rdfs:comment #{comment.inspect};},
163
- ]
222
+ statements = [%{:#{sym} rdfs:label "#{sym}";}]
223
+ if orig
224
+ comment = orig.to_s.strip.
225
+ gsub(/"""/, '\"\"\"').
226
+ gsub("\\", "\\\\").
227
+ sub(/^\"/, '\"').
228
+ sub(/\"$/m, '\"')
229
+ statements << %{ rdfs:comment #{comment.inspect};}
230
+ end
231
+ statements << %{ dc:identifier "#{id}";} if id
164
232
 
165
233
  statements += ttl_expr(expr, terminal? ? "re" : "g", 1, false)
166
234
  "\n" + statements.join("\n")
@@ -175,12 +243,13 @@ module EBNF
175
243
  ##
176
244
  # Transform EBNF rule to BNF rules:
177
245
  #
178
- # * Transform (rule a "n" (op1 (op2))) into two rules:
179
- # (rule a "n" (op1 _a_1))
180
- # (rule _a_1 "n.1" (op2))
181
- # * Transform (rule a (opt b)) into (rule a (alt _empty b))
182
- # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
183
- # * Transform (rule a (plus b)) into (rule a (seq b (star b)
246
+ # * Transform `(rule a "n" (op1 (op2)))` into two rules:
247
+ #
248
+ # (rule a "n" (op1 _a_1))
249
+ # (rule _a_1 "n.1" (op2))
250
+ # * Transform `(rule a (opt b))` into `(rule a (alt _empty b))`
251
+ # * Transform `(rule a (star b))` into `(rule a (alt _empty (seq b a)))`
252
+ # * Transform `(rule a (plus b))` into `(rule a (seq b (star b)))`
184
253
  #
185
254
  # Transformation includes information used to re-construct non-transformed.
186
255
  #
@@ -231,7 +300,7 @@ module EBNF
231
300
  # Otherwise, no further transformation necessary
232
301
  new_rules << self
233
302
  elsif [:diff, :hex, :range].include?(expr.first)
234
- # This rules are fine, the just need to be terminals
303
+ # These rules are fine, they just need to be terminals
235
304
  raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
236
305
  new_rules << self
237
306
  else
@@ -245,9 +314,14 @@ module EBNF
245
314
  ##
246
315
  # Transform EBNF rule for PEG:
247
316
  #
248
- # * Transform (rule a "n" (op1 ... (op2 y) ...z)) into two rules:
249
- # (rule a "n" (op1 ... _a_1 ... z))
250
- # (rule _a_1 "n.1" (op2 y))
317
+ # * Transform `(rule a "n" (op1 ... (op2 y) ...z))` into two rules:
318
+ #
319
+ # (rule a "n" (op1 ... _a_1 ... z))
320
+ # (rule _a_1 "n.1" (op2 y))
321
+ # * Transform `(rule a "n" (diff op1 op2))` into two rules:
322
+ #
323
+ # (rule a "n" (seq _a_1 op1))
324
+ # (rule _a_1 "n.1" (not op2))
251
325
  #
252
326
  # @return [Array<Rule>]
253
327
  def to_peg
@@ -268,8 +342,14 @@ module EBNF
268
342
 
269
343
  # Return new rules after recursively applying #to_bnf
270
344
  new_rules = new_rules.map {|r| r.to_peg}.flatten
271
- elsif [:diff, :hex, :range].include?(expr.first)
272
- # This rules are fine, the just need to be terminals
345
+ elsif expr.first == :diff && !terminal?
346
+ this = dup
347
+ new_rule = build([:not, expr[2]])
348
+ this.expr = [:seq, new_rule.sym, expr[1]]
349
+ new_rules << this
350
+ new_rules << new_rule
351
+ elsif [:hex, :istr, :range].include?(expr.first)
352
+ # These rules are fine, they just need to be terminals
273
353
  raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
274
354
  new_rules << self
275
355
  else
@@ -287,6 +367,8 @@ module EBNF
287
367
  case expr.first
288
368
  when :hex
289
369
  Regexp.new(translate_codepoints(expr[1]))
370
+ when :istr
371
+ /#{expr.last}/ui
290
372
  when :range
291
373
  Regexp.new("[#{translate_codepoints(expr[1])}]")
292
374
  else
@@ -294,45 +376,170 @@ module EBNF
294
376
  end
295
377
  end
296
378
 
297
- # Return the non-terminals for this rule. For seq, this is the first
298
- # non-terminal in the sequence. For alt, this is every non-terminal in the alt.
379
+ # Is this a terminal?
380
+ #
381
+ # @return [Boolean]
382
+ def terminal?
383
+ kind == :terminal
384
+ end
385
+
386
+ # Is this a pass?
387
+ # @return [Boolean]
388
+ def pass?
389
+ kind == :pass
390
+ end
391
+
392
+ # Is this a rule?
393
+ # @return [Boolean]
394
+ def rule?
395
+ kind == :rule
396
+ end
397
+
398
+ # Is this rule of the form (alt ...)?
399
+ def alt?
400
+ expr.is_a?(Array) && expr.first == :alt
401
+ end
402
+
403
+ # Is this rule of the form (seq ...)?
404
+ def seq?
405
+ expr.is_a?(Array) && expr.first == :seq
406
+ end
407
+
408
+ def inspect
409
+ "#<EBNF::Rule:#{object_id} " +
410
+ {sym: sym, id: id, kind: kind, expr: expr}.inspect +
411
+ ">"
412
+ end
413
+
414
+ # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
415
+ #
416
+ # @param [Rule] other
417
+ # @return [Boolean]
418
+ def ==(other)
419
+ sym == other.sym &&
420
+ kind == other.kind &&
421
+ expr == other.expr
422
+ end
423
+
424
+ # Two rules are equivalent if they have the same {#expr}.
425
+ #
426
+ # @param [Rule] other
427
+ # @return [Boolean]
428
+ def eql?(other)
429
+ expr == other.expr
430
+ end
431
+
432
+ # Rules compare using their ids
433
+ def <=>(other)
434
+ if id && other.id
435
+ if id == other.id
436
+ id.to_s <=> other.id.to_s
437
+ else
438
+ id.to_f <=> other.id.to_f
439
+ end
440
+ else
441
+ sym.to_s <=> other.sym.to_s
442
+ end
443
+ end
444
+
445
+ ##
446
+ # Utility function to translate code points of the form '#xN' into ruby unicode characters
447
+ def translate_codepoints(str)
448
+ str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
449
+ end
450
+
451
+ # Return the non-terminals for this rule.
452
+ #
453
+ # * `alt` => this is every non-terminal.
454
+ # * `diff` => this is every non-terminal.
455
+ # * `hex` => nil
456
+ # * `istr` => nil
457
+ # * `not` => this is the last expression, if any.
458
+ # * `opt` => this is the last expression, if any.
459
+ # * `plus` => this is the last expression, if any.
460
+ # * `range` => nil
461
+ # * `rept` => this is the last expression, if any.
462
+ # * `seq` => this is the first expression in the sequence, if any.
463
+ # * `star` => this is the last expression, if any.
299
464
  #
300
465
  # @param [Array<Rule>] ast
301
466
  # The set of rules, used to turn symbols into rules
467
+ # @param [Array<Symbol,String,Array>] expr (@expr)
468
+ # The expression to check, defaults to the rule expression.
469
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
302
470
  # @return [Array<Rule>]
303
- def non_terminals(ast)
304
- @non_terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
471
+ # @note this is used for LL(1) transformation, so rule types are limited
472
+ def non_terminals(ast, expr = @expr)
473
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
305
474
  case sym
306
475
  when Symbol
307
476
  r = ast.detect {|r| r.sym == sym}
308
477
  r if r && r.rule?
478
+ when Array
479
+ non_terminals(ast, sym)
309
480
  else
310
481
  nil
311
482
  end
312
- end.compact
483
+ end.flatten.compact.uniq
313
484
  end
314
485
 
315
- # Return the terminals for this rule. For seq, this is the first
316
- # terminals or strings in the seq. For alt, this is every non-terminal ni the alt.
486
+ # Return the terminals for this rule.
487
+ #
488
+ # * `alt` => this is every terminal.
489
+ # * `diff` => this is every terminal.
490
+ # * `hex` => nil
491
+ # * `istr` => nil
492
+ # * `not` => this is the last expression, if any.
493
+ # * `opt` => this is the last expression, if any.
494
+ # * `plus` => this is the last expression, if any.
495
+ # * `range` => nil
496
+ # * `rept` => this is the last expression, if any.
497
+ # * `seq` => this is the first expression in the sequence, if any.
498
+ # * `star` => this is the last expression, if any.
317
499
  #
318
500
  # @param [Array<Rule>] ast
319
501
  # The set of rules, used to turn symbols into rules
502
+ # @param [Array<Symbol,String,Array>] expr (@expr)
503
+ # The expression to check, defaults to the rule expression.
504
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
320
505
  # @return [Array<Rule>]
321
- def terminals(ast)
322
- @terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
506
+ # @note this is used for LL(1) transformation, so rule types are limited
507
+ def terminals(ast, expr = @expr)
508
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
323
509
  case sym
324
510
  when Symbol
325
511
  r = ast.detect {|r| r.sym == sym}
326
512
  r if r && r.terminal?
327
513
  when String
328
514
  sym
329
- else
330
- nil
515
+ when Array
516
+ terminals(ast, sym)
331
517
  end
332
- end.compact
518
+ end.flatten.compact.uniq
333
519
  end
334
520
 
335
- # Does this rule start with a sym? It does if expr is that sym,
521
+ # Return the symbols used in the rule.
522
+ #
523
+ # @param [Array<Symbol,String,Array>] expr (@expr)
524
+ # The expression to check, defaults to the rule expression.
525
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
526
+ # @return [Array<Symbol>]
527
+ def symbols(expr = @expr)
528
+ expr[1..-1].map do |sym|
529
+ case sym
530
+ when Symbol
531
+ sym
532
+ when Array
533
+ symbols(sym)
534
+ end
535
+ end.flatten.compact.uniq
536
+ end
537
+
538
+ ##
539
+ # The following are used for LL(1) transformation.
540
+ ##
541
+
542
+ # Does this rule start with `sym`? It does if expr is that sym,
336
543
  # expr starts with alt and contains that sym,
337
544
  # or expr starts with seq and the next element is that sym.
338
545
  #
@@ -349,6 +556,92 @@ module EBNF
349
556
  end
350
557
  end
351
558
 
559
+ ##
560
+ # Validate the rule, with respect to an AST.
561
+ #
562
+ # @param [Array<Rule>] ast
563
+ # The set of rules, used to turn symbols into rules
564
+ # @param [Array<Symbol,String,Array>] expr (@expr)
565
+ # The expression to check, defaults to the rule expression.
566
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
567
+ # @raise [SyntaxError]
568
+ def validate!(ast, expr = @expr)
569
+ op = expr.first
570
+ raise SyntaxError, "Unknown operator: #{op}" unless OP_ARGN.key?(op)
571
+ raise SyntaxError, "Argument count missmatch on operator #{op}, had #{expr.length - 1} expected #{OP_ARGN[op]}" if
572
+ OP_ARGN[op] && OP_ARGN[op] != expr.length - 1
573
+
574
+ # alt needs at least one operand; rept needs a valid min and max
575
+ if op == :alt
576
+ raise SyntaxError, "alt operation must have at least one operand, had #{expr.length - 1}" unless expr.length > 1
577
+ elsif op == :rept
578
+ raise SyntaxError, "rept operation must an non-negative integer minimum, was #{expr[1]}" unless
579
+ expr[1].is_a?(Integer) && expr[1] >= 0
580
+ raise SyntaxError, "rept operation must an non-negative integer maximum or '*', was #{expr[2]}" unless
581
+ expr[2] == '*' || expr[2].is_a?(Integer) && expr[2] >= 0
582
+ end
583
+
584
+ case op
585
+ when :hex
586
+ raise SyntaxError, "Hex operand must be of form '#xN+': #{sym}" unless expr.last.match?(/^#x\h+$/)
587
+ when :range
588
+ str = expr.last.dup
589
+ str = str[1..-1] if str.start_with?('^')
590
+ str = str[0..-2] if str.end_with?('-') # Allowed at end of range
591
+ scanner = StringScanner.new(str)
592
+ hex = rchar = in_range = false
593
+ while !scanner.eos?
594
+ begin
595
+ if scanner.scan(Terminals::HEX)
596
+ raise SyntaxError if in_range && rchar
597
+ rchar = in_range = false
598
+ hex = true
599
+ elsif scanner.scan(Terminals::R_CHAR)
600
+ raise SyntaxError if in_range && hex
601
+ hex = in_range = false
602
+ rchar = true
603
+ else
604
+ raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
605
+ end
606
+
607
+ if scanner.scan(/\-/)
608
+ raise SyntaxError if in_range
609
+ in_range = true
610
+ end
611
+ rescue SyntaxError
612
+ raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
613
+ end
614
+ end
615
+ else
616
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).each do |sym|
617
+ case sym
618
+ when Symbol
619
+ r = ast.detect {|r| r.sym == sym}
620
+ raise SyntaxError, "No rule found for #{sym}" unless r
621
+ when Array
622
+ validate!(ast, sym)
623
+ when String
624
+ raise SyntaxError, "String must be of the form CHAR*" unless sym.match?(/^#{Terminals::CHAR}*$/)
625
+ end
626
+ end
627
+ end
628
+ end
629
+
630
+ ##
631
+ # Validate the rule, with respect to an AST.
632
+ #
633
+ # Uses `#validate!` and catches `SyntaxError`
634
+ #
635
+ # @param [Array<Rule>] ast
636
+ # The set of rules, used to turn symbols into rules
637
+ # @return [Boolean]
638
+ def valid?(ast)
639
+ validate!(ast)
640
+ true
641
+ rescue SyntaxError
642
+ false
643
+ end
644
+
352
645
  # Do the firsts of this rule include the empty string?
353
646
  #
354
647
  # @return [Boolean]
@@ -381,79 +674,6 @@ module EBNF
381
674
  terminals.length
382
675
  end
383
676
 
384
- # Is this a terminal?
385
- #
386
- # @return [Boolean]
387
- def terminal?
388
- kind == :terminal
389
- end
390
-
391
- # Is this a pass?
392
- # @return [Boolean]
393
- def pass?
394
- kind == :pass
395
- end
396
-
397
- # Is this a rule?
398
- # @return [Boolean]
399
- def rule?
400
- kind == :rule
401
- end
402
-
403
- # Is this rule of the form (alt ...)?
404
- def alt?
405
- expr.is_a?(Array) && expr.first == :alt
406
- end
407
-
408
- # Is this rule of the form (seq ...)?
409
- def seq?
410
- expr.is_a?(Array) && expr.first == :seq
411
- end
412
-
413
- # Is this rule of the form (alt ...)?
414
- def alt?
415
- expr.is_a?(Array) && expr.first == :alt
416
- end
417
-
418
- def inspect
419
- "#<EBNF::Rule:#{object_id} " +
420
- {sym: sym, id: id, kind: kind, expr: expr}.inspect +
421
- ">"
422
- end
423
-
424
- # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
425
- #
426
- # @param [Rule] other
427
- # @return [Boolean]
428
- def ==(other)
429
- sym == other.sym &&
430
- kind == other.kind &&
431
- expr == other.expr
432
- end
433
-
434
- # Two rules are equivalent if they have the same {#expr}.
435
- #
436
- # @param [Rule] other
437
- # @return [Boolean]
438
- def equivalent?(other)
439
- expr == other.expr
440
- end
441
-
442
- # Rules compare using their ids
443
- def <=>(other)
444
- if id.to_i == other.id.to_i
445
- id.to_s <=> other.id.to_s
446
- else
447
- id.to_i <=> other.id.to_i
448
- end
449
- end
450
-
451
- ##
452
- # Utility function to translate code points of the form '#xN' into ruby unicode characters
453
- def translate_codepoints(str)
454
- str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
455
- end
456
-
457
677
  private
458
678
  def ttl_expr(expr, pfx, depth, is_obj = true)
459
679
  indent = ' ' * depth
@@ -469,17 +689,28 @@ module EBNF
469
689
 
470
690
  case op
471
691
  when :seq, :alt, :diff
692
+ # Multiple operands
472
693
  statements << %{#{indent}#{bra}#{pfx}:#{op} (}
473
694
  expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
474
695
  statements << %{#{indent} )#{ket}}
475
- when :opt, :plus, :star
696
+ when :opt, :plus, :star, :not
697
+ # Single operand
476
698
  statements << %{#{indent}#{bra}#{pfx}:#{op} }
477
699
  statements += ttl_expr(expr.first, pfx, depth + 1)
478
700
  statements << %{#{indent} #{ket}} unless ket.empty?
701
+ when :rept
702
+ # Three operands (min, max and expr)
703
+ statements << %{ #{indent}#{pfx}:min #{expr[0].inspect};}
704
+ statements << %{ #{indent}#{pfx}:max #{expr[1].inspect};}
705
+ statements << %{#{indent}#{bra}#{pfx}:#{op} }
706
+ statements += ttl_expr(expr.last, pfx, depth + 1)
707
+ statements << %{#{indent} #{ket}} unless ket.empty?
479
708
  when :_empty, :_eps
480
709
  statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
481
710
  when :"'"
482
711
  statements << %{#{indent}"#{esc(expr)}"}
712
+ when :istr
713
+ statements << %{#{indent}#{bra} re:matches #{expr.first.inspect} #{ket}}
483
714
  when :range
484
715
  statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
485
716
  when :hex
@@ -535,7 +766,7 @@ module EBNF
535
766
  def make_sym_id(variation = nil)
536
767
  @id_seq ||= 0
537
768
  @id_seq += 1
538
- ["_#{@sym}_#{@id_seq}#{variation}".to_sym, "#{@id}.#{@id_seq}#{variation}"]
769
+ ["_#{@sym}_#{@id_seq}#{variation}".to_sym, ("#{@id}.#{@id_seq}#{variation}" if @id)]
539
770
  end
540
771
  end
541
772
  end