ebnf 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,7 @@ module EBNF
31
31
  def to_ruby_peg(output, **options)
32
32
  output.puts " RULES = ["
33
33
  ast.each do |rule|
34
- output.puts " " + rule.to_ruby + '.extend(EBNF::PEG::Rule),'
34
+ output.puts " " + rule.to_ruby + (rule.is_a?(EBNF::PEG::Rule) ? '.extend(EBNF::PEG::Rule)' : '') + ','
35
35
  end
36
36
  output.puts " ]"
37
37
  end
@@ -51,6 +51,7 @@ module EBNF::PEG
51
51
  # DSL for creating terminals and productions
52
52
  module ClassMethods
53
53
  def start_handlers; (@start_handlers ||= {}); end
54
+ def start_options; (@start_hoptions ||= {}); end
54
55
  def production_handlers; (@production_handlers ||= {}); end
55
56
  def terminal_handlers; (@terminal_handlers ||= {}); end
56
57
  def terminal_regexps; (@terminal_regexps ||= {}); end
@@ -97,6 +98,10 @@ module EBNF::PEG
97
98
  #
98
99
  # @param [Symbol] term
99
100
  # The rule name
101
+ # @param [Hash{Symbol => Object}] options
102
+ # Options which are returned from {Parser#onStart}.
103
+ # @option options [Boolean] :as_hash (false)
104
+ # If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
100
105
  # @yield [data, block]
101
106
  # @yieldparam [Hash] data
102
107
  # A Hash defined for the current production, during :start
@@ -106,8 +111,9 @@ module EBNF::PEG
106
111
  # Block passed to initialization for yielding to calling parser.
107
112
  # Should conform to the yield specs for #initialize
108
113
  # Yield to generate a triple
109
- def start_production(term, &block)
114
+ def start_production(term, **options, &block)
110
115
  start_handlers[term] = block
116
+ start_options[term] = options.freeze
111
117
  end
112
118
 
113
119
  ##
@@ -204,6 +210,7 @@ module EBNF::PEG
204
210
  @whitespace = case options[:whitespace]
205
211
  when Regexp then options[:whitespace]
206
212
  when Symbol then @rules[options[:whitespace]]
213
+ else options[:whitespace]
207
214
  end ||
208
215
  @rules.values.detect(&:pass?) ||
209
216
  /(?:\s|(?:#[^x][^\n\r]*))+/m.freeze
@@ -329,19 +336,30 @@ module EBNF::PEG
329
336
  # @option options [Integer] :depth
330
337
  # Recursion depth for indenting output
331
338
  # @yieldreturn [String] additional string appended to `message`.
332
- def debug(*args)
339
+ def debug(*args, &block)
333
340
  return unless @options[:logger]
334
341
  options = args.last.is_a?(Hash) ? args.pop : {}
335
342
  lineno = options[:lineno] || (scanner.lineno if scanner)
336
343
  level = options.fetch(:level, 0)
337
-
338
344
  depth = options[:depth] || self.depth
339
- args << yield if block_given?
340
- @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
345
+
346
+ if self.respond_to?(:log_debug)
347
+ level = [:debug, :info, :warn, :error, :fatal][level]
348
+ log_debug(*args, **options.merge(level: level, lineno: lineno, depth: depth), &block)
349
+ elsif @options[:logger].respond_to?(:add)
350
+ args << yield if block_given?
351
+ @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
352
+ elsif @options[:logger].respond_to?(:<<)
353
+ args << yield if block_given?
354
+ @options[:logger] << "[#{lineno}]" + (" " * depth) + args.join(" ")
355
+ end
341
356
  end
342
357
 
343
358
  # Start for production
344
359
  # Adds data available during the processing of the production
360
+ #
361
+ # @return [Hash] composed of production options. Currently only `as_hash` is supported.
362
+ # @see ClassMethods#start_production
345
363
  def onStart(prod)
346
364
  handler = self.class.start_handlers[prod]
347
365
  @productions << prod
@@ -367,6 +385,7 @@ module EBNF::PEG
367
385
  # explicit start handler
368
386
  @prod_data << {}
369
387
  end
388
+ return self.class.start_options.fetch(prod, {}) # any options on this production
370
389
  end
371
390
 
372
391
  # Finish of production
@@ -18,14 +18,15 @@ module EBNF::PEG
18
18
  #
19
19
  # If matched, the input position is updated and the results returned in a Hash.
20
20
  #
21
- # * `alt`: returns the value of the matched production or `:unmatched`
22
- # * `diff`: returns the string value matched, or `:unmatched`
21
+ # * `alt`: returns the value of the matched production or `:unmatched`.
22
+ # * `diff`: returns the value matched, or `:unmatched`.
23
23
  # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
24
- # * `opt`: returns the matched production, or `nil` if unmatched.
25
- # * `plus`: returns an array of the matches for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
- # * `range`: returns a string composed of the character matching the range, or `:unmatched`.
27
- # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values.
28
- # * `star`: returns an array of the matches for the specified production.For Terminals, these are concatenated into a single string.
24
+ # * `opt`: returns the value matched, or `nil` if unmatched.
25
+ # * `plus`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
+ # * `range`: returns a string composed of the values matched, or `:unmatched`, if less than `min` are matched.
27
+ # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values. Via option in a `production` or definition, the result can be a single hash with values for each matched production; note that this is not always possible due to the possibility of repeated productions within the sequence.
28
+ # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string.
29
+ #
29
30
  # @param [Scanner] input
30
31
  # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
31
32
  def parse(input)
@@ -45,7 +46,7 @@ module EBNF::PEG
45
46
  # otherwise,
46
47
  if regexp = parser.find_terminal_regexp(sym)
47
48
  matched = input.scan(regexp)
48
- result = (matched ? parser.onTerminal(sym, matched) : :unmatched)
49
+ result = parser.onTerminal(sym, (matched ? matched : :unmatched))
49
50
  # Update furthest failure for strings and terminals
50
51
  parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
51
52
  parser.packrat[sym][pos] = {
@@ -58,7 +59,7 @@ module EBNF::PEG
58
59
  else
59
60
  eat_whitespace(input)
60
61
  end
61
- parser.onStart(sym)
62
+ start_options = parser.onStart(sym)
62
63
 
63
64
  result = case expr.first
64
65
  when :alt
@@ -84,7 +85,8 @@ module EBNF::PEG
84
85
  alt
85
86
  when :diff
86
87
  # matches any string that matches A but does not match B.
87
- # XXX: Should this work for arbitrary rules?
88
+ # (Note, this is only used for Terminal rules, non-terminals will use :not)
89
+ raise "Diff used on non-terminal #{prod}" unless terminal?
88
90
  re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
89
91
  matched = input.scan(re1)
90
92
  if !matched || re2.match?(matched)
@@ -101,9 +103,9 @@ module EBNF::PEG
101
103
  parser.update_furthest_failure(input.pos, input.lineno, expr.last)
102
104
  :unmatched
103
105
  end
104
- when :opt
105
- # Always matches
106
- opt = case prod = expr[1]
106
+ when :not
107
+ # matches any string that does not match B.
108
+ res = case prod = expr[1]
107
109
  when Symbol
108
110
  rule = parser.find_rule(prod)
109
111
  raise "No rule found for #{prod}" unless rule
@@ -111,35 +113,29 @@ module EBNF::PEG
111
113
  when String
112
114
  input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
113
115
  end
114
- if opt == :unmatched
116
+ if res != :unmatched
115
117
  # Update furthest failure for terminals
116
- parser.update_furthest_failure(input.pos, input.lineno, prod) if terminal?
117
- nil
118
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if terminal?
119
+ :unmatched
118
120
  else
119
- opt
121
+ nil
120
122
  end
123
+ when :opt
124
+ # Result is the matched value or nil
125
+ opt = rept(input, 0, 1, expr[1])
126
+
127
+ # Update furthest failure for strings and terminals
128
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
129
+ opt.first
121
130
  when :plus
122
131
  # Result is an array of all expressions while they match,
123
132
  # at least one must match
124
- prod, plus = expr[1], []
125
- case prod
126
- when Symbol
127
- rule = parser.find_rule(prod)
128
- raise "No rule found for #{prod}" unless rule
129
- while (res = rule.parse(input)) != :unmatched
130
- eat_whitespace(input)
131
- plus << res
132
- end
133
- when String
134
- while res = input.scan(Regexp.new(Regexp.quote(prod)))
135
- eat_whitespace(input)
136
- plus << res
137
- end
138
- end
133
+ plus = rept(input, 1, '*', expr[1])
134
+
139
135
  # Update furthest failure for strings and terminals
140
- parser.update_furthest_failure(input.pos, input.lineno, prod)
141
- plus.empty? ? :unmatched : (terminal? ? plus.compact.join("") : plus.compact)
142
- when :range
136
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
137
+ plus.is_a?(Array) && terminal? ? plus.join("") : plus
138
+ when :range, :istr
143
139
  # Matches the specified character range
144
140
  input.scan(to_regexp) || begin
145
141
  # Update furthest failure for strings and terminals
@@ -149,7 +145,7 @@ module EBNF::PEG
149
145
  when :seq
150
146
  # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
151
147
  seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
152
- eat_whitespace(input) unless accumulator.empty?
148
+ eat_whitespace(input) unless accumulator.empty? || terminal?
153
149
  res = case prod
154
150
  when Symbol
155
151
  rule = parser.find_rule(prod)
@@ -165,32 +161,23 @@ module EBNF::PEG
165
161
  end
166
162
  accumulator << {prod.to_sym => res}
167
163
  end
168
- seq == :unmatched ?
169
- :unmatched :
170
- (terminal? ?
171
- seq.map(&:values).compact.join("") : # Concat values for terminal production
172
- seq)
164
+ if seq == :unmatched
165
+ :unmatched
166
+ elsif terminal?
167
+ seq.map(&:values).compact.join("") # Concat values for terminal production
168
+ elsif start_options[:as_hash]
169
+ seq.inject {|memo, h| memo.merge(h)}
170
+ else
171
+ seq
172
+ end
173
173
  when :star
174
174
  # Result is an array of all expressions while they match,
175
175
  # an empty array if none match
176
- prod, star = expr[1], []
177
- case prod
178
- when Symbol
179
- rule = parser.find_rule(prod)
180
- raise "No rule found for #{prod}" unless rule
181
- while (res = rule.parse(input)) != :unmatched
182
- eat_whitespace(input)
183
- star << res
184
- end
185
- when String
186
- while res = input.scan(Regexp.new(Regexp.quote(prod)))
187
- eat_whitespace(input)
188
- star << res
189
- end
190
- end
176
+ star = rept(input, 0, '*', expr[1])
177
+
191
178
  # Update furthest failure for strings and terminals
192
- parser.update_furthest_failure(input.pos, input.lineno, prod)
193
- star.compact
179
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
180
+ star.is_a?(Array) && terminal? ? star.join("") : star
194
181
  else
195
182
  raise "attempt to parse unknown rule type: #{expr.first}"
196
183
  end
@@ -208,6 +195,38 @@ module EBNF::PEG
208
195
  return parser.packrat[sym][pos][:result]
209
196
  end
210
197
 
198
+ ##
199
+ # Repetition, 0-1, 0-n, 1-n, ...
200
+ #
201
+ # Note, nil results are removed from the result, but count towards min/max calculations
202
+ #
203
+ # @param [Scanner] input
204
+ # @param [Integer] min
205
+ # @param [Integer] max
206
+ # If it is an integer, it stops matching after max entries.
207
+ # @param [Symbol, String] prod
208
+ # @return [:unmatched, Array]
209
+ def rept(input, min, max, prod)
210
+ result = []
211
+
212
+ case prod
213
+ when Symbol
214
+ rule = parser.find_rule(prod)
215
+ raise "No rule found for #{prod}" unless rule
216
+ while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched
217
+ eat_whitespace(input) unless terminal?
218
+ result << res
219
+ end
220
+ when String
221
+ while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
222
+ eat_whitespace(input) unless terminal?
223
+ result << res
224
+ end
225
+ end
226
+
227
+ result.length < min ? :unmatched : result.compact
228
+ end
229
+
211
230
  ##
212
231
  # Eat whitespace between non-terminal rules
213
232
  def eat_whitespace(input)
@@ -1,17 +1,33 @@
1
1
  require 'scanf'
2
+ require 'strscan'
2
3
 
3
4
  module EBNF
4
5
  # Represent individual parsed rules
5
6
  class Rule
6
7
  # Operations which are flattened to separate rules in to_bnf.
7
8
  BNF_OPS = %w{
8
- alt opt plus seq star
9
+ alt diff not opt plus rept seq star
9
10
  }.map(&:to_sym).freeze
10
11
 
11
12
  TERM_OPS = %w{
12
- diff hex range
13
+ hex istr range
13
14
  }.map(&:to_sym).freeze
14
15
 
16
+ # The number of arguments expected per operator. `nil` for unspecified
17
+ OP_ARGN = {
18
+ alt: nil,
19
+ diff: 2,
20
+ hex: 1,
21
+ istr: 1,
22
+ not: 1,
23
+ opt: 1,
24
+ plus: 1,
25
+ range: 1,
26
+ rept: 3,
27
+ seq: nil,
28
+ star: 1
29
+ }
30
+
15
31
  # Symbol of rule
16
32
  #
17
33
  # @return [Symbol]
@@ -28,7 +44,7 @@ module EBNF
28
44
 
29
45
  # Kind of rule
30
46
  #
31
- # @return [:rule, :terminal, or :pass]
47
+ # @return [:rule, :terminal, :terminals, or :pass]
32
48
  attr_accessor :kind
33
49
 
34
50
  # Rule expression
@@ -59,19 +75,38 @@ module EBNF
59
75
  # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
60
76
  attr_accessor :cleanup
61
77
 
62
- # @param [Symbol] sym
63
- # @param [Integer] id
78
+ # @param [Symbol, nil] sym
79
+ # `nil` is allowed only for @pass or @terminals
80
+ # @param [Integer, nil] id
64
81
  # @param [Array] expr
65
- # @param [Symbol] kind (nil)
82
+ # The expression is an internal-representation of an S-Expression with one of the following operators:
83
+ #
84
+ # * `alt` – A list of alternative rules, which are attempted in order. It terminates with the first matching rule, or is terminated as unmatched, if no such rule is found.
85
+ # * `diff` – matches any string that matches `A` but does not match `B`.
86
+ # * `hex` – A single character represented using the hexadecimal notation `#xnn`.
87
+ # * `istr` – A string which matches in a case-insensitive manner, so that `(istr "fOo")` will match either of the strings `"foo"`, `"FOO"` or any other combination.
88
+ # * `opt` – An optional rule or terminal. It either results in the matching rule or returns `nil`.
89
+ # * `plus` – A sequence of one or more of the matching rule. If there is no such rule, it is terminated as unmatched; otherwise, the result is an array containing all matched input.
90
+ # * `range` – A range of characters, possibly repeated, of the form `(range "a-z")`. May also use hexadecimal notation.
91
+ # * `rept m n` – A sequence of at least `m` and at most `n` of the matching rule. It will always return an array.
92
+ # * `seq` – A sequence of rules or terminals. If any (other than `opt` or `star`) do not parse, the rule is terminated as unmatched.
93
+ # * `star` – A sequence of zero or more of the matching rule. It will always return an array.
94
+ # @param [:rule, :terminal, :terminals, :pass] kind (nil)
66
95
  # @param [String] ebnf (nil)
96
+ # When parsing, records the EBNF string used to create the rule.
67
97
  # @param [Array] first (nil)
98
+ # Recorded set of terminals that can precede this rule (LL(1))
68
99
  # @param [Array] follow (nil)
100
+ # Recorded set of terminals that can follow this rule (LL(1))
69
101
  # @param [Boolean] start (nil)
102
+ # Is this the starting rule for the grammar?
70
103
  # @param [Rule] top_rule (nil)
104
+ # The top-most rule. All expressed rules are top-rules, derived rules have the original rule as their top-rule.
71
105
  # @param [Boolean] cleanup (nil)
106
+ # Records information useful for cleaning up converted :plus, and :star expansions (LL(1)).
72
107
  def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
73
108
  @sym, @id = sym, id
74
- @expr = expr.is_a?(Array) ? expr : [:seq, expr]
109
+ @expr = expr.is_a?(Array) ? expr : [:seq, expr].compact
75
110
  @ebnf, @kind, @first, @follow, @start, @cleanup, @top_rule = ebnf, kind, first, follow, start, cleanup, top_rule
76
111
  @top_rule ||= self
77
112
  @kind ||= case
@@ -79,21 +114,53 @@ module EBNF
79
114
  when !BNF_OPS.include?(@expr.first) then :terminal
80
115
  else :rule
81
116
  end
117
+
118
+ # Allow @pass and @terminals to not be named
119
+ @sym ||= :_pass if @kind == :pass
120
+ @sym ||= :_terminals if @kind == :terminals
121
+
122
+ raise ArgumentError, "Rule sym must be a symbol, was #{@sym.inspect}" unless @sym.is_a?(Symbol)
123
+ raise ArgumentError, "Rule id must be a string or nil, was #{@id.inspect}" unless (@id || "").is_a?(String)
124
+ raise ArgumentError, "Rule kind must be one of :rule, :terminal, :terminals, or :pass, was #{@kind.inspect}" unless
125
+ @kind.is_a?(Symbol) && %w(rule terminal terminals pass).map(&:to_sym).include?(@kind)
126
+
127
+ case @expr.first
128
+ when :alt
129
+ raise ArgumentError, "#{@expr.first} operation must have at least one operand, had #{@expr.length - 1}" unless @expr.length > 1
130
+ when :diff
131
+ raise ArgumentError, "#{@expr.first} operation must have exactly two operands, had #{@expr.length - 1}" unless @expr.length == 3
132
+ when :hex, :istr, :not, :opt, :plus, :range, :star
133
+ raise ArgumentError, "#{@expr.first} operation must have exactly one operand, had #{@expr.length - 1}" unless @expr.length == 2
134
+ when :rept
135
+ raise ArgumentError, "#{@expr.first} operation must have exactly three, had #{@expr.length - 1}" unless @expr.length == 4
136
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer minimum, was #{@expr[1]}" unless
137
+ @expr[1].is_a?(Integer) && @expr[1] >= 0
138
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer maximum or '*', was #{@expr[2]}" unless
139
+ @expr[2] == '*' || @expr[2].is_a?(Integer) && @expr[2] >= 0
140
+ when :seq
141
+ # It's legal to have a zero-length sequence
142
+ else
143
+ raise ArgumentError, "Rule expression must be an array using a known operator, was #{@expr.first}"
144
+ end
82
145
  end
83
146
 
84
147
  ##
85
148
  # Return a rule from its SXP representation:
86
149
  #
87
150
  # @example inputs
88
- # (pass (plus (range "#x20\\t\\r\\n")))
151
+ # (pass _pass (plus (range "#x20\\t\\r\\n")))
89
152
  # (rule ebnf "1" (star (alt declaration rule)))
90
- # (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
153
+ # (terminal R_CHAR "19" (diff CHAR (alt "]" "-")))
91
154
  #
92
155
  # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
93
156
  #
94
- # @param [Array] sxp
157
+ # @param [String, Array] sxp
95
158
  # @return [Rule]
96
159
  def self.from_sxp(sxp)
160
+ if sxp.is_a?(String)
161
+ require 'sxp' unless defined?(SXP)
162
+ sxp = SXP.parse(sxp)
163
+ end
97
164
  expr = sxp.detect {|e| e.is_a?(Array) && ![:first, :follow, :start].include?(e.first.to_sym)}
98
165
  first = sxp.detect {|e| e.is_a?(Array) && e.first.to_sym == :first}
99
166
  first = first[1..-1] if first
@@ -115,11 +182,11 @@ module EBNF
115
182
  # @param [Hash{Symbol => Symbol}] cleanup (nil)
116
183
  # @param [Hash{Symbol => Object}] options
117
184
  def build(expr, kind: nil, cleanup: nil, **options)
118
- new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
185
+ new_sym, new_id = @top_rule.send(:make_sym_id)
119
186
  self.class.new(new_sym, new_id, expr,
120
187
  kind: kind,
121
188
  ebnf: @ebnf,
122
- top_rule: (@top_rule || self),
189
+ top_rule: @top_rule,
123
190
  cleanup: cleanup,
124
191
  **options)
125
192
  end
@@ -152,15 +219,16 @@ module EBNF
152
219
  # @return [String]
153
220
  def to_ttl
154
221
  @ebnf.debug("to_ttl") {inspect} if @ebnf
155
- comment = orig.to_s.strip.
156
- gsub(/"""/, '\"\"\"').
157
- gsub("\\", "\\\\").
158
- sub(/^\"/, '\"').
159
- sub(/\"$/m, '\"')
160
- statements = [
161
- %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
162
- %{ rdfs:comment #{comment.inspect};},
163
- ]
222
+ statements = [%{:#{sym} rdfs:label "#{sym}";}]
223
+ if orig
224
+ comment = orig.to_s.strip.
225
+ gsub(/"""/, '\"\"\"').
226
+ gsub("\\", "\\\\").
227
+ sub(/^\"/, '\"').
228
+ sub(/\"$/m, '\"')
229
+ statements << %{ rdfs:comment #{comment.inspect};}
230
+ end
231
+ statements << %{ dc:identifier "#{id}";} if id
164
232
 
165
233
  statements += ttl_expr(expr, terminal? ? "re" : "g", 1, false)
166
234
  "\n" + statements.join("\n")
@@ -175,12 +243,13 @@ module EBNF
175
243
  ##
176
244
  # Transform EBNF rule to BNF rules:
177
245
  #
178
- # * Transform (rule a "n" (op1 (op2))) into two rules:
179
- # (rule a "n" (op1 _a_1))
180
- # (rule _a_1 "n.1" (op2))
181
- # * Transform (rule a (opt b)) into (rule a (alt _empty b))
182
- # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
183
- # * Transform (rule a (plus b)) into (rule a (seq b (star b)
246
+ # * Transform `(rule a "n" (op1 (op2)))` into two rules:
247
+ #
248
+ # (rule a "n" (op1 _a_1))
249
+ # (rule _a_1 "n.1" (op2))
250
+ # * Transform `(rule a (opt b))` into `(rule a (alt _empty b))`
251
+ # * Transform `(rule a (star b))` into `(rule a (alt _empty (seq b a)))`
252
+ # * Transform `(rule a (plus b))` into `(rule a (seq b (star b)))`
184
253
  #
185
254
  # Transformation includes information used to re-construct non-transformed.
186
255
  #
@@ -231,7 +300,7 @@ module EBNF
231
300
  # Otherwise, no further transformation necessary
232
301
  new_rules << self
233
302
  elsif [:diff, :hex, :range].include?(expr.first)
234
- # This rules are fine, the just need to be terminals
303
+ # These rules are fine, they just need to be terminals
235
304
  raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
236
305
  new_rules << self
237
306
  else
@@ -245,9 +314,14 @@ module EBNF
245
314
  ##
246
315
  # Transform EBNF rule for PEG:
247
316
  #
248
- # * Transform (rule a "n" (op1 ... (op2 y) ...z)) into two rules:
249
- # (rule a "n" (op1 ... _a_1 ... z))
250
- # (rule _a_1 "n.1" (op2 y))
317
+ # * Transform `(rule a "n" (op1 ... (op2 y) ...z))` into two rules:
318
+ #
319
+ # (rule a "n" (op1 ... _a_1 ... z))
320
+ # (rule _a_1 "n.1" (op2 y))
321
+ # * Transform `(rule a "n" (diff op1 op2))` into two rules:
322
+ #
323
+ # (rule a "n" (seq _a_1 op1))
324
+ # (rule _a_1 "n.1" (not op1))
251
325
  #
252
326
  # @return [Array<Rule>]
253
327
  def to_peg
@@ -268,8 +342,14 @@ module EBNF
268
342
 
269
343
  # Return new rules after recursively applying #to_bnf
270
344
  new_rules = new_rules.map {|r| r.to_peg}.flatten
271
- elsif [:diff, :hex, :range].include?(expr.first)
272
- # This rules are fine, the just need to be terminals
345
+ elsif expr.first == :diff && !terminal?
346
+ this = dup
347
+ new_rule = build([:not, expr[2]])
348
+ this.expr = [:seq, new_rule.sym, expr[1]]
349
+ new_rules << this
350
+ new_rules << new_rule
351
+ elsif [:hex, :istr, :range].include?(expr.first)
352
+ # These rules are fine, they just need to be terminals
273
353
  raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
274
354
  new_rules << self
275
355
  else
@@ -287,6 +367,8 @@ module EBNF
287
367
  case expr.first
288
368
  when :hex
289
369
  Regexp.new(translate_codepoints(expr[1]))
370
+ when :istr
371
+ /#{expr.last}/ui
290
372
  when :range
291
373
  Regexp.new("[#{translate_codepoints(expr[1])}]")
292
374
  else
@@ -294,45 +376,170 @@ module EBNF
294
376
  end
295
377
  end
296
378
 
297
- # Return the non-terminals for this rule. For seq, this is the first
298
- # non-terminal in the sequence. For alt, this is every non-terminal in the alt.
379
+ # Is this a terminal?
380
+ #
381
+ # @return [Boolean]
382
+ def terminal?
383
+ kind == :terminal
384
+ end
385
+
386
+ # Is this a pass?
387
+ # @return [Boolean]
388
+ def pass?
389
+ kind == :pass
390
+ end
391
+
392
+ # Is this a rule?
393
+ # @return [Boolean]
394
+ def rule?
395
+ kind == :rule
396
+ end
397
+
398
+ # Is this rule of the form (alt ...)?
399
+ def alt?
400
+ expr.is_a?(Array) && expr.first == :alt
401
+ end
402
+
403
+ # Is this rule of the form (seq ...)?
404
+ def seq?
405
+ expr.is_a?(Array) && expr.first == :seq
406
+ end
407
+
408
+ def inspect
409
+ "#<EBNF::Rule:#{object_id} " +
410
+ {sym: sym, id: id, kind: kind, expr: expr}.inspect +
411
+ ">"
412
+ end
413
+
414
+ # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
415
+ #
416
+ # @param [Rule] other
417
+ # @return [Boolean]
418
+ def ==(other)
419
+ sym == other.sym &&
420
+ kind == other.kind &&
421
+ expr == other.expr
422
+ end
423
+
424
+ # Two rules are equivalent if they have the same {#expr}.
425
+ #
426
+ # @param [Rule] other
427
+ # @return [Boolean]
428
+ def eql?(other)
429
+ expr == other.expr
430
+ end
431
+
432
+ # Rules compare using their ids
433
+ def <=>(other)
434
+ if id && other.id
435
+ if id == other.id
436
+ id.to_s <=> other.id.to_s
437
+ else
438
+ id.to_f <=> other.id.to_f
439
+ end
440
+ else
441
+ sym.to_s <=> other.sym.to_s
442
+ end
443
+ end
444
+
445
+ ##
446
+ # Utility function to translate code points of the form '#xN' into ruby unicode characters
447
+ def translate_codepoints(str)
448
+ str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
449
+ end
450
+
451
+ # Return the non-terminals for this rule.
452
+ #
453
+ # * `alt` => this is every non-terminal.
454
+ # * `diff` => this is every non-terminal.
455
+ # * `hex` => nil
456
+ # * `istr` => nil
457
+ # * `not` => this is the last expression, if any.
458
+ # * `opt` => this is the last expression, if any.
459
+ # * `plus` => this is the last expression, if any.
460
+ # * `range` => nil
461
+ # * `rept` => this is the last expression, if any.
462
+ # * `seq` => this is the first expression in the sequence, if any.
463
+ # * `star` => this is the last expression, if any.
299
464
  #
300
465
  # @param [Array<Rule>] ast
301
466
  # The set of rules, used to turn symbols into rules
467
+ # @param [Array<Symbol,String,Array>] expr (@expr)
468
+ # The expression to check, defaults to the rule expression.
469
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
302
470
  # @return [Array<Rule>]
303
- def non_terminals(ast)
304
- @non_terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
471
+ # @note this is used for LL(1) transformation, so rule types are limited
472
+ def non_terminals(ast, expr = @expr)
473
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
305
474
  case sym
306
475
  when Symbol
307
476
  r = ast.detect {|r| r.sym == sym}
308
477
  r if r && r.rule?
478
+ when Array
479
+ non_terminals(ast, sym)
309
480
  else
310
481
  nil
311
482
  end
312
- end.compact
483
+ end.flatten.compact.uniq
313
484
  end
314
485
 
315
- # Return the terminals for this rule. For seq, this is the first
316
- # terminals or strings in the seq. For alt, this is every non-terminal ni the alt.
486
+ # Return the terminals for this rule.
487
+ #
488
+ # * `alt` => this is every terminal.
489
+ # * `diff` => this is every terminal.
490
+ # * `hex` => nil
491
+ # * `istr` => nil
492
+ # * `not` => this is the last expression, if any.
493
+ # * `opt` => this is the last expression, if any.
494
+ # * `plus` => this is the last expression, if any.
495
+ # * `range` => nil
496
+ # * `rept` => this is the last expression, if any.
497
+ # * `seq` => this is the first expression in the sequence, if any.
498
+ # * `star` => this is the last expression, if any.
317
499
  #
318
500
  # @param [Array<Rule>] ast
319
501
  # The set of rules, used to turn symbols into rules
502
+ # @param [Array<Symbol,String,Array>] expr (@expr)
503
+ # The expression to check, defaults to the rule expression.
504
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
320
505
  # @return [Array<Rule>]
321
- def terminals(ast)
322
- @terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
506
+ # @note this is used for LL(1) transformation, so rule types are limited
507
+ def terminals(ast, expr = @expr)
508
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
323
509
  case sym
324
510
  when Symbol
325
511
  r = ast.detect {|r| r.sym == sym}
326
512
  r if r && r.terminal?
327
513
  when String
328
514
  sym
329
- else
330
- nil
515
+ when Array
516
+ terminals(ast, sym)
331
517
  end
332
- end.compact
518
+ end.flatten.compact.uniq
333
519
  end
334
520
 
335
- # Does this rule start with a sym? It does if expr is that sym,
521
+ # Return the symbols used in the rule.
522
+ #
523
+ # @param [Array<Symbol,String,Array>] expr (@expr)
524
+ # The expression to check, defaults to the rule expression.
525
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
526
+ # @return [Array<Rule>]
527
+ def symbols(expr = @expr)
528
+ expr[1..-1].map do |sym|
529
+ case sym
530
+ when Symbol
531
+ sym
532
+ when Array
533
+ symbols(sym)
534
+ end
535
+ end.flatten.compact.uniq
536
+ end
537
+
538
+ ##
539
+ # The following are used for LL(1) transformation.
540
+ ##
541
+
542
+ # Does this rule start with `sym`? It does if expr is that sym,
336
543
  # expr starts with alt and contains that sym,
337
544
  # or expr starts with seq and the next element is that sym.
338
545
  #
@@ -349,6 +556,92 @@ module EBNF
349
556
  end
350
557
  end
351
558
 
559
##
# Validate the rule, with respect to an AST.
#
# Checks that the operator is known, that it has the expected number of
# operands, and that each operand is well formed: `hex` and `range` operands
# are checked lexically, symbols must name a rule in `ast`, strings must be
# made of `CHAR`s, and embedded expressions are validated recursively.
#
# @param [Array<Rule>] ast
#   The set of rules, used to turn symbols into rules
# @param [Array<Symbol,String,Array>] expr (@expr)
#   The expression to check, defaults to the rule expression.
#   Typically, if the expression is recursive, the embedded expression is called recursively.
# @raise [SyntaxError] if the expression is not valid
def validate!(ast, expr = @expr)
  op = expr.first
  raise SyntaxError, "Unknown operator: #{op}" unless OP_ARGN.key?(op)
  # A nil entry in OP_ARGN means the operator takes a variable number of operands.
  raise SyntaxError, "Argument count missmatch on operator #{op}, had #{expr.length - 1} expected #{OP_ARGN[op]}" if
    OP_ARGN[op] && OP_ARGN[op] != expr.length - 1

  if op == :alt
    # alt has no fixed operand count, but needs at least one alternative
    raise SyntaxError, "alt operation must have at least one operand, had #{expr.length - 1}" unless expr.length > 1
  elsif op == :rept
    # rept is (rept min max expr): check min and max here, expr below
    raise SyntaxError, "rept operation must an non-negative integer minimum, was #{expr[1]}" unless
      expr[1].is_a?(Integer) && expr[1] >= 0
    raise SyntaxError, "rept operation must an non-negative integer maximum or '*', was #{expr[2]}" unless
      expr[2] == '*' || expr[2].is_a?(Integer) && expr[2] >= 0
  end

  case op
  when :hex
    # Report the offending operand, and anchor on the whole string so that
    # an operand with extra lines (e.g. "#x20\nfoo") is rejected.
    raise SyntaxError, "Hex operand must be of form '#xN+': #{expr.last}" unless expr.last.match?(/\A#x\h+\z/)
  when :range
    str = expr.last.dup
    str = str[1..-1] if str.start_with?('^')
    str = str[0..-2] if str.end_with?('-') # Allowed at end of range
    scanner = StringScanner.new(str)
    # hex/rchar record the kind of the previous component; in_range is set
    # after a '-' so that both ends of a range must be of the same kind.
    hex = rchar = in_range = false
    until scanner.eos?
      begin
        if scanner.scan(Terminals::HEX)
          raise SyntaxError if in_range && rchar
          rchar = in_range = false
          hex = true
        elsif scanner.scan(Terminals::R_CHAR)
          raise SyntaxError if in_range && hex
          hex = in_range = false
          rchar = true
        else
          raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
        end

        if scanner.scan(/\-/)
          raise SyntaxError if in_range
          in_range = true
        end
      rescue SyntaxError
        # Bare SyntaxErrors raised above are re-raised with position information.
        raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
      end
    end
  else
    # Validate every operand, not just the first: a dangling reference later
    # in a seq is just as invalid as one at the front. For rept, skip the
    # min/max operands (checked above) and validate only the repeated expression.
    operands = op == :rept ? expr.drop(3) : expr.drop(1)
    operands.each do |operand|
      case operand
      when Symbol
        raise SyntaxError, "No rule found for #{operand}" unless ast.detect { |rule| rule.sym == operand }
      when Array
        validate!(ast, operand)
      when String
        # \A and \z anchor the whole string; ^ and $ would only anchor a line.
        raise SyntaxError, "String must be of the form CHAR*" unless operand.match?(/\A#{Terminals::CHAR}*\z/)
      end
    end
  end
end
629
+
630
##
# Report whether the rule is valid, with respect to an AST.
#
# Runs `#validate!` and turns a raised `SyntaxError` into `false`.
#
# @param [Array<Rule>] ast
#   The set of rules, used to turn symbols into rules
# @return [Boolean]
def valid?(ast)
  begin
    validate!(ast)
  rescue SyntaxError
    return false
  end
  true
end
644
+
352
645
  # Do the firsts of this rule include the empty string?
353
646
  #
354
647
  # @return [Boolean]
@@ -381,79 +674,6 @@ module EBNF
381
674
  terminals.length
382
675
  end
383
676
 
384
- # Is this a terminal?
385
- #
386
- # @return [Boolean]
387
- def terminal?
388
- kind == :terminal
389
- end
390
-
391
- # Is this a pass?
392
- # @return [Boolean]
393
- def pass?
394
- kind == :pass
395
- end
396
-
397
- # Is this a rule?
398
- # @return [Boolean]
399
- def rule?
400
- kind == :rule
401
- end
402
-
403
- # Is this rule of the form (alt ...)?
404
- def alt?
405
- expr.is_a?(Array) && expr.first == :alt
406
- end
407
-
408
- # Is this rule of the form (seq ...)?
409
- def seq?
410
- expr.is_a?(Array) && expr.first == :seq
411
- end
412
-
413
- # Is this rule of the form (alt ...)?
414
- def alt?
415
- expr.is_a?(Array) && expr.first == :alt
416
- end
417
-
418
- def inspect
419
- "#<EBNF::Rule:#{object_id} " +
420
- {sym: sym, id: id, kind: kind, expr: expr}.inspect +
421
- ">"
422
- end
423
-
424
- # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
425
- #
426
- # @param [Rule] other
427
- # @return [Boolean]
428
- def ==(other)
429
- sym == other.sym &&
430
- kind == other.kind &&
431
- expr == other.expr
432
- end
433
-
434
- # Two rules are equivalent if they have the same {#expr}.
435
- #
436
- # @param [Rule] other
437
- # @return [Boolean]
438
- def equivalent?(other)
439
- expr == other.expr
440
- end
441
-
442
- # Rules compare using their ids
443
- def <=>(other)
444
- if id.to_i == other.id.to_i
445
- id.to_s <=> other.id.to_s
446
- else
447
- id.to_i <=> other.id.to_i
448
- end
449
- end
450
-
451
- ##
452
- # Utility function to translate code points of the form '#xN' into ruby unicode characters
453
- def translate_codepoints(str)
454
- str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
455
- end
456
-
457
677
  private
458
678
  def ttl_expr(expr, pfx, depth, is_obj = true)
459
679
  indent = ' ' * depth
@@ -469,17 +689,28 @@ module EBNF
469
689
 
470
690
  case op
471
691
  when :seq, :alt, :diff
692
+ # Multiple operands
472
693
  statements << %{#{indent}#{bra}#{pfx}:#{op} (}
473
694
  expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
474
695
  statements << %{#{indent} )#{ket}}
475
- when :opt, :plus, :star
696
+ when :opt, :plus, :star, :not
697
+ # Single operand
476
698
  statements << %{#{indent}#{bra}#{pfx}:#{op} }
477
699
  statements += ttl_expr(expr.first, pfx, depth + 1)
478
700
  statements << %{#{indent} #{ket}} unless ket.empty?
701
+ when :rept
702
+ # Three operands (min, max and expr)
703
+ statements << %{ #{indent}#{pfx}:min #{expr[0].inspect};}
704
+ statements << %{ #{indent}#{pfx}:max #{expr[1].inspect};}
705
+ statements << %{#{indent}#{bra}#{pfx}:#{op} }
706
+ statements += ttl_expr(expr.last, pfx, depth + 1)
707
+ statements << %{#{indent} #{ket}} unless ket.empty?
479
708
  when :_empty, :_eps
480
709
  statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
481
710
  when :"'"
482
711
  statements << %{#{indent}"#{esc(expr)}"}
712
+ when :istr
713
+ statements << %{#{indent}#{bra} re:matches #{expr.first.inspect} #{ket}}
483
714
  when :range
484
715
  statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
485
716
  when :hex
@@ -535,7 +766,7 @@ module EBNF
535
766
# Build a fresh symbol/identifier pair derived from this rule.
#
# Each call advances the per-rule sequence counter. The symbol has the form
# `_<sym>_<seq><variation>`; the identifier has the form
# `<id>.<seq><variation>`, or is nil when the rule has no id.
#
# @param [Object] variation
#   Optional suffix interpolated after the sequence number.
# @return [Array(Symbol, String)] the new symbol and identifier (identifier may be nil)
def make_sym_id(variation = nil)
  @id_seq = (@id_seq || 0) + 1
  suffix = "#{@id_seq}#{variation}"
  derived_sym = "_#{@sym}_#{suffix}".to_sym
  derived_id = @id ? "#{@id}.#{suffix}" : nil
  [derived_sym, derived_id]
end
540
771
  end
541
772
  end