ebnf 1.1.3 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +221 -198
  3. data/UNLICENSE +1 -1
  4. data/VERSION +1 -1
  5. data/bin/ebnf +40 -21
  6. data/etc/abnf-core.ebnf +52 -0
  7. data/etc/abnf.abnf +121 -0
  8. data/etc/abnf.ebnf +124 -0
  9. data/etc/abnf.sxp +45 -0
  10. data/etc/doap.ttl +23 -15
  11. data/etc/ebnf.ebnf +21 -33
  12. data/etc/ebnf.html +171 -160
  13. data/etc/{ebnf.rb → ebnf.ll1.rb} +30 -107
  14. data/etc/ebnf.ll1.sxp +182 -183
  15. data/etc/ebnf.peg.rb +90 -0
  16. data/etc/ebnf.peg.sxp +84 -0
  17. data/etc/ebnf.sxp +40 -41
  18. data/etc/iso-ebnf.ebnf +140 -0
  19. data/etc/iso-ebnf.isoebnf +138 -0
  20. data/etc/iso-ebnf.sxp +65 -0
  21. data/etc/sparql.ebnf +4 -4
  22. data/etc/sparql.html +1603 -1751
  23. data/etc/sparql.ll1.sxp +7372 -7372
  24. data/etc/sparql.peg.rb +532 -0
  25. data/etc/sparql.peg.sxp +597 -0
  26. data/etc/sparql.sxp +363 -362
  27. data/etc/turtle.ebnf +3 -3
  28. data/etc/turtle.html +465 -517
  29. data/etc/{turtle.rb → turtle.ll1.rb} +3 -4
  30. data/etc/turtle.ll1.sxp +425 -425
  31. data/etc/turtle.peg.rb +182 -0
  32. data/etc/turtle.peg.sxp +199 -0
  33. data/etc/turtle.sxp +103 -101
  34. data/lib/ebnf.rb +7 -2
  35. data/lib/ebnf/abnf.rb +301 -0
  36. data/lib/ebnf/abnf/core.rb +23 -0
  37. data/lib/ebnf/abnf/meta.rb +111 -0
  38. data/lib/ebnf/base.rb +113 -69
  39. data/lib/ebnf/bnf.rb +1 -26
  40. data/lib/ebnf/ebnf/meta.rb +90 -0
  41. data/lib/ebnf/isoebnf.rb +229 -0
  42. data/lib/ebnf/isoebnf/meta.rb +75 -0
  43. data/lib/ebnf/ll1.rb +138 -6
  44. data/lib/ebnf/ll1/lexer.rb +37 -32
  45. data/lib/ebnf/ll1/parser.rb +113 -73
  46. data/lib/ebnf/ll1/scanner.rb +83 -51
  47. data/lib/ebnf/native.rb +320 -0
  48. data/lib/ebnf/parser.rb +285 -302
  49. data/lib/ebnf/peg.rb +39 -0
  50. data/lib/ebnf/peg/parser.rb +561 -0
  51. data/lib/ebnf/peg/rule.rb +250 -0
  52. data/lib/ebnf/rule.rb +443 -148
  53. data/lib/ebnf/terminals.rb +21 -0
  54. data/lib/ebnf/writer.rb +565 -83
  55. metadata +107 -29
  56. data/etc/sparql.rb +0 -45773
@@ -0,0 +1,250 @@
1
+ module EBNF::PEG
2
+ # Behavior for parsing a PEG rule
3
+ module Rule
4
+ ##
5
+ # Initialized by parser when loading rules.
6
+ # Used for finding rules and invoking elements of the parse process.
7
+ #
8
+ # @return [EBNF::PEG::Parser] parser
9
+ attr_accessor :parser
10
+
11
+ ##
12
+ # Parse a rule or terminal, invoking callbacks, as appropriate
13
+
14
+ # If there are `start_production` and/or `production` callbacks,
15
+ # they are invoked with a `prod_data` stack, the input stream and offset.
16
+ # Otherwise, the results are added as an array value
17
+ # to a hash indexed by the rule name.
18
+ #
19
+ # If matched, the input position is updated and the results returned in a Hash.
20
+ #
21
+ # * `alt`: returns the value of the matched production or `:unmatched`.
22
+ # * `diff`: returns the value matched, or `:unmatched`.
23
+ # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
24
+ # * `opt`: returns the value matched, or `nil` if unmatched.
25
+ # * `plus`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
26
+ # * `range`: returns a string composed of the values matched, or `:unmatched`, if less than `min` are matched.
27
+ # * `rept`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
28
+ # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values. Via option in a `production` or definition, the result can be a single hash with values for each matched production; note that this is not always possible due to the possibility of repeated productions within the sequence.
29
+ # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string.
30
+ #
31
+ # @param [Scanner] input
32
+ # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
33
+ def parse(input)
34
+ # Save position and linenumber for backtracking
35
+ pos, lineno = input.pos, input.lineno
36
+
37
+ parser.packrat[sym] ||= {}
38
+ if parser.packrat[sym][pos]
39
+ parser.debug("#{sym}(:memo)", lineno: lineno) { "#{parser.packrat[sym][pos].inspect}(@#{pos})"}
40
+ input.pos, input.lineno = parser.packrat[sym][pos][:pos], parser.packrat[sym][pos][:lineno]
41
+ return parser.packrat[sym][pos][:result]
42
+ end
43
+
44
+ if terminal?
45
+ # If the terminal is defined with a regular expression,
46
+ # use that to match the input,
47
+ # otherwise,
48
+ if regexp = parser.find_terminal_regexp(sym)
49
+ matched = input.scan(regexp)
50
+ result = parser.onTerminal(sym, (matched ? matched : :unmatched))
51
+ # Update furthest failure for strings and terminals
52
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
53
+ parser.packrat[sym][pos] = {
54
+ pos: input.pos,
55
+ lineno: input.lineno,
56
+ result: result
57
+ }
58
+ return parser.packrat[sym][pos][:result]
59
+ end
60
+ else
61
+ eat_whitespace(input)
62
+ end
63
+ start_options = parser.onStart(sym)
64
+
65
+ result = case expr.first
66
+ when :alt
67
+ # Return the first expression to match.
68
+ # Result is either :unmatched, or the value of the matching rule
69
+ alt = :unmatched
70
+ expr[1..-1].each do |prod|
71
+ alt = case prod
72
+ when Symbol
73
+ rule = parser.find_rule(prod)
74
+ raise "No rule found for #{prod}" unless rule
75
+ rule.parse(input)
76
+ when String
77
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
78
+ end
79
+ if alt == :unmatched
80
+ # Update furthest failure for strings and terminals
81
+ parser.update_furthest_failure(input.pos, input.lineno, prod) if prod.is_a?(String) || rule.terminal?
82
+ else
83
+ break
84
+ end
85
+ end
86
+ alt
87
+ when :diff
88
+ # matches any string that matches A but does not match B.
89
+ # (Note, this is only used for Terminal rules, non-terminals will use :not)
90
+ raise "Diff used on non-terminal #{prod}" unless terminal?
91
+ re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
92
+ matched = input.scan(re1)
93
+ if !matched || re2.match?(matched)
94
+ # Update furthest failure for terminals
95
+ parser.update_furthest_failure(input.pos, input.lineno, sym)
96
+ :unmatched
97
+ else
98
+ matched
99
+ end
100
+ when :hex
101
+ # Matches the given hex character if expression matches the character whose number (code point) in ISO/IEC 10646 is N. The number of leading zeros in the #xN form is insignificant.
102
+ input.scan(to_regexp) || begin
103
+ # Update furthest failure for terminals
104
+ parser.update_furthest_failure(input.pos, input.lineno, expr.last)
105
+ :unmatched
106
+ end
107
+ when :not
108
+ # matches any string that does not match B.
109
+ res = case prod = expr[1]
110
+ when Symbol
111
+ rule = parser.find_rule(prod)
112
+ raise "No rule found for #{prod}" unless rule
113
+ rule.parse(input)
114
+ when String
115
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
116
+ end
117
+ if res != :unmatched
118
+ # Update furthest failure for terminals
119
+ parser.update_furthest_failure(input.pos, input.lineno, sym) if terminal?
120
+ :unmatched
121
+ else
122
+ nil
123
+ end
124
+ when :opt
125
+ # Result is the matched value or nil
126
+ opt = rept(input, 0, 1, expr[1])
127
+
128
+ # Update furthest failure for strings and terminals
129
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
130
+ opt.first
131
+ when :plus
132
+ # Result is an array of all expressions while they match,
133
+ # at least one must match
134
+ plus = rept(input, 1, '*', expr[1])
135
+
136
+ # Update furthest failure for strings and terminals
137
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
138
+ plus.is_a?(Array) && terminal? ? plus.join("") : plus
139
+ when :range, :istr
140
+ # Matches the specified character range
141
+ input.scan(to_regexp) || begin
142
+ # Update furthest failure for strings and terminals
143
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1])
144
+ :unmatched
145
+ end
146
+ when :rept
147
+ # Result is an array of all expressions while they match,
148
+ # an empty array if none match
149
+ rept = rept(input, expr[1], expr[2], expr[3])
150
+
151
+ # Update furthest failure for strings and terminals
152
+ parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
153
+ rept.is_a?(Array) && terminal? ? rept.join("") : rept
154
+ when :seq
155
+ # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
156
+ seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
157
+ eat_whitespace(input) unless accumulator.empty? || terminal?
158
+ res = case prod
159
+ when Symbol
160
+ rule = parser.find_rule(prod)
161
+ raise "No rule found for #{prod}" unless rule
162
+ rule.parse(input)
163
+ when String
164
+ input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
165
+ end
166
+ if res == :unmatched
167
+ # Update furthest failure for strings and terminals
168
+ parser.update_furthest_failure(input.pos, input.lineno, prod)
169
+ break :unmatched
170
+ end
171
+ accumulator << {prod.to_sym => res}
172
+ end
173
+ if seq == :unmatched
174
+ :unmatched
175
+ elsif terminal?
176
+ seq.map(&:values).compact.join("") # Concat values for terminal production
177
+ elsif start_options[:as_hash]
178
+ seq.inject {|memo, h| memo.merge(h)}
179
+ else
180
+ seq
181
+ end
182
+ when :star
183
+ # Result is an array of all expressions while they match,
184
+ # an empty array if none match
185
+ star = rept(input, 0, '*', expr[1])
186
+
187
+ # Update furthest failure for strings and terminals
188
+ parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
189
+ star.is_a?(Array) && terminal? ? star.join("") : star
190
+ else
191
+ raise "attempt to parse unknown rule type: #{expr.first}"
192
+ end
193
+
194
+ if result == :unmatched
195
+ input.pos, input.lineno = pos, lineno
196
+ end
197
+
198
+ result = parser.onFinish(result)
199
+ (parser.packrat[sym] ||= {})[pos] = {
200
+ pos: input.pos,
201
+ lineno: input.lineno,
202
+ result: result
203
+ }
204
+ return parser.packrat[sym][pos][:result]
205
+ end
206
+
207
+ ##
208
+ # Repetition, 0-1, 0-n, 1-n, ...
209
+ #
210
+ # Note, nil results are removed from the result, but count towards min/max calculations
211
+ #
212
+ # @param [Scanner] input
213
+ # @param [Integer] min
214
+ # @param [Integer] max
215
+ # If it is an integer, it stops matching after max entries.
216
+ # @param [Symbol, String] prod
217
+ # @return [:unmatched, Array]
218
+ def rept(input, min, max, prod)
219
+ result = []
220
+
221
+ case prod
222
+ when Symbol
223
+ rule = parser.find_rule(prod)
224
+ raise "No rule found for #{prod}" unless rule
225
+ while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched
226
+ eat_whitespace(input) unless terminal?
227
+ result << res
228
+ end
229
+ when String
230
+ while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
231
+ eat_whitespace(input) unless terminal?
232
+ result << res
233
+ end
234
+ end
235
+
236
+ result.length < min ? :unmatched : result.compact
237
+ end
238
+
239
+ ##
240
+ # Eat whitespace between non-terminal rules
241
+ def eat_whitespace(input)
242
+ if parser.whitespace.is_a?(Regexp)
243
+ # Eat whitespace before a non-terminal
244
+ input.skip(parser.whitespace)
245
+ elsif parser.whitespace.is_a?(Rule)
246
+ parser.whitespace.parse(input) # throw away result
247
+ end
248
+ end
249
+ end
250
+ end
@@ -1,15 +1,33 @@
1
+ require 'scanf'
2
+ require 'strscan'
3
+
1
4
  module EBNF
2
5
  # Represent individual parsed rules
3
6
  class Rule
4
- # Operations which are flattened to seprate rules in to_bnf
7
+ # Operations which are flattened to separate rules in to_bnf.
5
8
  BNF_OPS = %w{
6
- alt opt plus seq star
9
+ alt diff not opt plus rept seq star
7
10
  }.map(&:to_sym).freeze
8
11
 
9
12
  TERM_OPS = %w{
10
- diff hex range
13
+ hex istr range
11
14
  }.map(&:to_sym).freeze
12
15
 
16
+ # The number of arguments expected per operator. `nil` for unspecified
17
+ OP_ARGN = {
18
+ alt: nil,
19
+ diff: 2,
20
+ hex: 1,
21
+ istr: 1,
22
+ not: 1,
23
+ opt: 1,
24
+ plus: 1,
25
+ range: 1,
26
+ rept: 3,
27
+ seq: nil,
28
+ star: 1
29
+ }
30
+
13
31
  # Symbol of rule
14
32
  #
15
33
  # @return [Symbol]
@@ -26,7 +44,7 @@ module EBNF
26
44
 
27
45
  # Kind of rule
28
46
  #
29
- # @return [:rule, :terminal, or :pass]
47
+ # @return [:rule, :terminal, :terminals, or :pass]
30
48
  attr_accessor :kind
31
49
 
32
50
  # Rule expression
@@ -57,19 +75,38 @@ module EBNF
57
75
  # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
58
76
  attr_accessor :cleanup
59
77
 
60
- # @param [Integer] id
61
- # @param [Symbol] sym
78
+ # @param [Symbol, nil] sym
79
+ # `nil` is allowed only for @pass or @terminals
80
+ # @param [String, nil] id
62
81
  # @param [Array] expr
63
- # @param [Symbol] :kind
64
- # @param [String] :ebnf
65
- # @param [Array] :first
66
- # @param [Array] :follow
67
- # @param [Boolean] :start
68
- # @param [Rule] :top_rule
69
- # @param [Boolean] :cleanup
82
+ # The expression is an internal-representation of an S-Expression with one of the following operators:
83
+ #
84
+ # * `alt` A list of alternative rules, which are attempted in order. It terminates with the first matching rule, or is terminated as unmatched, if no such rule is found.
85
+ # * `diff` matches any string that matches `A` but does not match `B`.
86
+ # * `hex` A single character represented using the hexadecimal notation `#xnn`.
87
+ # * `istr` A string which matches in a case-insensitive manner, so that `(istr "fOo")` will match either of the strings `"foo"`, `"FOO"` or any other combination.
88
+ # * `opt` An optional rule or terminal. It either results in the matching rule or returns `nil`.
89
+ # * `plus` – A sequence of one or more of the matching rule. If there is no such rule, it is terminated as unmatched; otherwise, the result is an array containing all matched input.
90
+ # * `range` – A range of characters, possibly repeated, of the form `(range "a-z")`. May also use hexadecimal notation.
91
+ # * `rept m n` – A sequence of at least `m` and at most `n` of the matching rule. It will always return an array.
92
+ # * `seq` – A sequence of rules or terminals. If any (other than `opt` or `star`) do not parse, the rule is terminated as unmatched.
93
+ # * `star` – A sequence of zero or more of the matching rule. It will always return an array.
94
+ # @param [:rule, :terminal, :terminals, :pass] kind (nil)
95
+ # @param [String] ebnf (nil)
96
+ # When parsing, records the EBNF string used to create the rule.
97
+ # @param [Array] first (nil)
98
+ # Recorded set of terminals that can proceed this rule (LL(1))
99
+ # @param [Array] follow (nil)
100
+ # Recorded set of terminals that can follow this rule (LL(1))
101
+ # @param [Boolean] start (nil)
102
+ # Is this the starting rule for the grammar?
103
+ # @param [Rule] top_rule (nil)
104
+ # The top-most rule. All expressed rules are top-rules, derived rules have the original rule as their top-rule.
105
+ # @param [Boolean] cleanup (nil)
106
+ # Records information useful for cleaning up converted :plus, and :star expansions (LL(1)).
70
107
  def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
71
108
  @sym, @id = sym, id
72
- @expr = expr.is_a?(Array) ? expr : [:seq, expr]
109
+ @expr = expr.is_a?(Array) ? expr : [:seq, expr].compact
73
110
  @ebnf, @kind, @first, @follow, @start, @cleanup, @top_rule = ebnf, kind, first, follow, start, cleanup, top_rule
74
111
  @top_rule ||= self
75
112
  @kind ||= case
@@ -77,21 +114,53 @@ module EBNF
77
114
  when !BNF_OPS.include?(@expr.first) then :terminal
78
115
  else :rule
79
116
  end
117
+
118
+ # Allow @pass and @terminals to not be named
119
+ @sym ||= :_pass if @kind == :pass
120
+ @sym ||= :_terminals if @kind == :terminals
121
+
122
+ raise ArgumentError, "Rule sym must be a symbol, was #{@sym.inspect}" unless @sym.is_a?(Symbol)
123
+ raise ArgumentError, "Rule id must be a string or nil, was #{@id.inspect}" unless (@id || "").is_a?(String)
124
+ raise ArgumentError, "Rule kind must be one of :rule, :terminal, :terminals, or :pass, was #{@kind.inspect}" unless
125
+ @kind.is_a?(Symbol) && %w(rule terminal terminals pass).map(&:to_sym).include?(@kind)
126
+
127
+ case @expr.first
128
+ when :alt
129
+ raise ArgumentError, "#{@expr.first} operation must have at least one operand, had #{@expr.length - 1}" unless @expr.length > 1
130
+ when :diff
131
+ raise ArgumentError, "#{@expr.first} operation must have exactly two operands, had #{@expr.length - 1}" unless @expr.length == 3
132
+ when :hex, :istr, :not, :opt, :plus, :range, :star
133
+ raise ArgumentError, "#{@expr.first} operation must have exactly one operand, had #{@expr.length - 1}" unless @expr.length == 2
134
+ when :rept
135
+ raise ArgumentError, "#{@expr.first} operation must have exactly three, had #{@expr.length - 1}" unless @expr.length == 4
136
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer minimum, was #{@expr[1]}" unless
137
+ @expr[1].is_a?(Integer) && @expr[1] >= 0
138
+ raise ArgumentError, "#{@expr.first} operation must an non-negative integer maximum or '*', was #{@expr[2]}" unless
139
+ @expr[2] == '*' || @expr[2].is_a?(Integer) && @expr[2] >= 0
140
+ when :seq
141
+ # It's legal to have a zero-length sequence
142
+ else
143
+ raise ArgumentError, "Rule expression must be an array using a known operator, was #{@expr.first}"
144
+ end
80
145
  end
81
146
 
82
147
  ##
83
148
  # Return a rule from its SXP representation:
84
149
  #
85
150
  # @example inputs
86
- # (pass (plus (range "#x20\\t\\r\\n")))
151
+ # (pass _pass (plus (range "#x20\\t\\r\\n")))
87
152
  # (rule ebnf "1" (star (alt declaration rule)))
88
- # (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
153
+ # (terminal R_CHAR "19" (diff CHAR (alt "]" "-")))
89
154
  #
90
- # Also may have (first ...), (follow ...), or (start #t)
155
+ # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
91
156
  #
92
- # @param [Array] sxp
157
+ # @param [String, Array] sxp
93
158
  # @return [Rule]
94
159
  def self.from_sxp(sxp)
160
+ if sxp.is_a?(String)
161
+ require 'sxp' unless defined?(SXP)
162
+ sxp = SXP.parse(sxp)
163
+ end
95
164
  expr = sxp.detect {|e| e.is_a?(Array) && ![:first, :follow, :start].include?(e.first.to_sym)}
96
165
  first = sxp.detect {|e| e.is_a?(Array) && e.first.to_sym == :first}
97
166
  first = first[1..-1] if first
@@ -102,26 +171,28 @@ module EBNF
102
171
  start = sxp.any? {|e| e.is_a?(Array) && e.first.to_sym == :start}
103
172
  sym = sxp[1] if sxp[1].is_a?(Symbol)
104
173
  id = sxp[2] if sxp[2].is_a?(String)
105
- Rule.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
174
+ self.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
106
175
  end
107
176
 
108
177
  # Build a new rule creating a symbol and numbering from the current rule
109
- # Symbol and number creation is handled by the top-most rule in such a chain
178
+ # Symbol and number creation is handled by the top-most rule in such a chain.
110
179
  #
111
180
  # @param [Array] expr
181
+ # @param [Symbol] kind (nil)
182
+ # @param [Hash{Symbol => Symbol}] cleanup (nil)
112
183
  # @param [Hash{Symbol => Object}] options
113
- # @param [Symbol] :kind
114
184
  def build(expr, kind: nil, cleanup: nil, **options)
115
- new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
116
- Rule.new(new_sym, new_id, expr,
117
- kind: kind,
118
- ebnf: @ebnf,
119
- top_rule: (@top_rule || self),
120
- cleanup: cleanup,
121
- **options)
185
+ new_sym, new_id = @top_rule.send(:make_sym_id)
186
+ self.class.new(new_sym, new_id, expr,
187
+ kind: kind,
188
+ ebnf: @ebnf,
189
+ top_rule: @top_rule,
190
+ cleanup: cleanup,
191
+ **options)
122
192
  end
123
193
 
124
- # Return representation for building S-Expressions
194
+ # Return representation for building S-Expressions.
195
+ #
125
196
  # @return [Array]
126
197
  def for_sxp
127
198
  elements = [kind, sym]
@@ -137,40 +208,51 @@ module EBNF
137
208
  # Return SXP representation of this rule
138
209
  # @return [String]
139
210
  def to_sxp
211
+ require 'sxp' unless defined?(SXP)
140
212
  for_sxp.to_sxp
141
213
  end
142
214
 
143
215
  alias_method :to_s, :to_sxp
144
216
 
145
- # Serializes this rule to an Turtle
217
+ # Serializes this rule to Turtle.
218
+ #
146
219
  # @return [String]
147
220
  def to_ttl
148
221
  @ebnf.debug("to_ttl") {inspect} if @ebnf
149
- comment = orig.to_s.strip.
150
- gsub(/"""/, '\"\"\"').
151
- gsub("\\", "\\\\").
152
- sub(/^\"/, '\"').
153
- sub(/\"$/m, '\"')
154
- statements = [
155
- %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
156
- %{ rdfs:comment #{comment.inspect};},
157
- ]
222
+ statements = [%{:#{sym} rdfs:label "#{sym}";}]
223
+ if orig
224
+ comment = orig.to_s.strip.
225
+ gsub(/"""/, '\"\"\"').
226
+ gsub("\\", "\\\\").
227
+ sub(/^\"/, '\"').
228
+ sub(/\"$/m, '\"')
229
+ statements << %{ rdfs:comment #{comment.inspect};}
230
+ end
231
+ statements << %{ dc:identifier "#{id}";} if id
158
232
 
159
233
  statements += ttl_expr(expr, terminal? ? "re" : "g", 1, false)
160
234
  "\n" + statements.join("\n")
161
235
  end
162
236
 
237
+ # Return a Ruby representation of this rule
238
+ # @return [String]
239
+ def to_ruby
240
+ "EBNF::Rule.new(#{sym.inspect}, #{id.inspect}, #{expr.inspect}#{', kind: ' + kind.inspect unless kind == :rule})"
241
+ end
242
+
163
243
  ##
164
244
  # Transform EBNF rule to BNF rules:
165
245
  #
166
- # * Transform (a [n] rule (op1 (op2))) into two rules:
167
- # (a [n] rule (op1 _a_1))
168
- # (_a_1 [n.1] rule (op2))
169
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
170
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
171
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
246
+ # * Transform `(rule a "n" (op1 (op2)))` into two rules:
247
+ #
248
+ # (rule a "n" (op1 _a_1))
249
+ # (rule _a_1 "n.1" (op2))
250
+ # * Transform `(rule a (opt b))` into `(rule a (alt _empty b))`
251
+ # * Transform `(rule a (star b))` into `(rule a (alt _empty (seq b a)))`
252
+ # * Transform `(rule a (plus b))` into `(rule a (seq b (star b)))`
253
+ #
254
+ # Transformation includes information used to re-construct non-transformed.
172
255
  #
173
- # Transformation includes information used to re-construct non-transformed
174
256
  # AST representation
175
257
  # @return [Array<Rule>]
176
258
  def to_bnf
@@ -197,19 +279,19 @@ module EBNF
197
279
  new_rules = new_rules.map {|r| r.to_bnf}.flatten
198
280
  elsif expr.first == :opt
199
281
  this = dup
200
- # * Transform (a rule (opt b)) into (a rule (alt _empty b))
282
+ # * Transform (rule a (opt b)) into (rule a (alt _empty b))
201
283
  this.expr = [:alt, :_empty, expr.last]
202
284
  this.cleanup = :opt
203
285
  new_rules = this.to_bnf
204
286
  elsif expr.first == :star
205
- # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
287
+ # * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
206
288
  this = dup
207
289
  this.cleanup = :star
208
290
  new_rule = this.build([:seq, expr.last, this.sym], cleanup: :merge)
209
291
  this.expr = [:alt, :_empty, new_rule.sym]
210
292
  new_rules = [this] + new_rule.to_bnf
211
293
  elsif expr.first == :plus
212
- # * Transform (a rule (plus b)) into (a rule (seq b (star b)
294
+ # * Transform (rule a (plus b)) into (rule a (seq b (star b)))
213
295
  this = dup
214
296
  this.cleanup = :plus
215
297
  this.expr = [:seq, expr.last, [:star, expr.last]]
@@ -218,7 +300,7 @@ module EBNF
218
300
  # Otherwise, no further transformation necessary
219
301
  new_rules << self
220
302
  elsif [:diff, :hex, :range].include?(expr.first)
221
- # This rules are fine, the just need to be terminals
303
+ # These rules are fine, they just need to be terminals
222
304
  raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
223
305
  new_rules << self
224
306
  else
@@ -229,89 +311,73 @@ module EBNF
229
311
  return new_rules
230
312
  end
231
313
 
232
- # Return the non-terminals for this rule. For seq, this is the first
233
- # non-terminals in the seq. For alt, this is every non-terminal ni the alt
234
- # @param [Array<Rule>] ast
235
- # The set of rules, used to turn symbols into rules
314
+ ##
315
+ # Transform EBNF rule for PEG:
316
+ #
317
+ # * Transform `(rule a "n" (op1 ... (op2 y) ...z))` into two rules:
318
+ #
319
+ # (rule a "n" (op1 ... _a_1 ... z))
320
+ # (rule _a_1 "n.1" (op2 y))
321
+ # * Transform `(rule a "n" (diff op1 op2))` into two rules:
322
+ #
323
+ # (rule a "n" (seq _a_1 op1))
324
+ # (rule _a_1 "n.1" (not op2))
325
+ #
236
326
  # @return [Array<Rule>]
237
- def non_terminals(ast)
238
- @non_terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
239
- case sym
240
- when Symbol
241
- r = ast.detect {|r| r.sym == sym}
242
- r if r && r.rule?
243
- else
244
- nil
245
- end
246
- end.compact
247
- end
327
+ def to_peg
328
+ new_rules = []
248
329
 
249
- # Return the terminals for this rule. For seq, this is the first
250
- # terminals or strings in the seq. For alt, this is every non-terminal ni the alt
251
- # @param [Array<Rule>] ast
252
- # The set of rules, used to turn symbols into rules
253
- # @return [Array<Rule>]
254
- def terminals(ast)
255
- @terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
256
- case sym
257
- when Symbol
258
- r = ast.detect {|r| r.sym == sym}
259
- r if r && r.terminal?
260
- when String
261
- sym
262
- else
263
- nil
330
+ # Look for rules containing sub-sequences
331
+ if expr.any? {|e| e.is_a?(Array) && e.first.is_a?(Symbol)}
332
+ # duplicate ourselves for rewriting
333
+ this = dup
334
+ new_rules << this
335
+
336
+ expr.each_with_index do |e, index|
337
+ next unless e.is_a?(Array) && e.first.is_a?(Symbol)
338
+ new_rule = build(e)
339
+ this.expr[index] = new_rule.sym
340
+ new_rules << new_rule
264
341
  end
265
- end.compact
266
- end
267
342
 
268
- # Does this rule start with a sym? It does if expr is that sym,
269
- # expr starts with alt and contains that sym, or
270
- # expr starts with seq and the next element is that sym
271
- # @param [Symbol, class] sym
272
- # Symbol matching any start element, or if it is String, any start element which is a String
273
- # @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
274
- def starts_with?(sym)
275
- if seq? && sym === (v = expr.fetch(1, nil))
276
- [v]
277
- elsif alt? && expr.any? {|e| sym === e}
278
- expr.select {|e| sym === e}
343
+ # Return new rules after recursively applying #to_peg
344
+ new_rules = new_rules.map {|r| r.to_peg}.flatten
345
+ elsif expr.first == :diff && !terminal?
346
+ this = dup
347
+ new_rule = build([:not, expr[2]])
348
+ this.expr = [:seq, new_rule.sym, expr[1]]
349
+ new_rules << this
350
+ new_rules << new_rule
351
+ elsif [:hex, :istr, :range].include?(expr.first)
352
+ # These rules are fine, they just need to be terminals
353
+ raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
354
+ new_rules << self
279
355
  else
280
- nil
356
+ new_rules << self
281
357
  end
358
+
359
+ return new_rules.map {|r| r.extend(EBNF::PEG::Rule)}
282
360
  end
283
361
 
284
- # Do the firsts of this rule include the empty string?
285
- # @return [Boolean]
286
- def first_includes_eps?
287
- @first && @first.include?(:_eps)
288
- end
289
-
290
- # Add terminal as proceding this rule
291
- # @param [Array<Rule, Symbol, String>] terminals
292
- # @return [Integer] if number of terminals added
293
- def add_first(terminals)
294
- @first ||= []
295
- terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - @first
296
- @first += terminals
297
- terminals.length
298
- end
299
-
300
- # Add terminal as following this rule. Don't add _eps as a follow
362
+ ##
363
+ # For :hex, :istr, or :range, create a regular expression.
301
364
  #
302
- # @param [Array<Rule, Symbol, String>] terminals
303
- # @return [Integer] if number of terminals added
304
- def add_follow(terminals)
305
- # Remove terminals already in follows, and empty string
306
- terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - (@follow || []) - [:_eps]
307
- unless terminals.empty?
308
- @follow ||= []
309
- @follow += terminals
365
+ # @return [Regexp]
366
+ def to_regexp
367
+ case expr.first
368
+ when :hex
369
+ Regexp.new(translate_codepoints(expr[1]))
370
+ when :istr
371
+ /#{expr.last}/ui
372
+ when :range
373
+ Regexp.new("[#{translate_codepoints(expr[1])}]")
374
+ else
375
+ raise "Can't turn #{expr.inspect} into a regexp"
310
376
  end
311
- terminals.length
312
377
  end
313
378
 
314
379
  # Is this a terminal?
380
+ #
315
381
  # @return [Boolean]
316
382
  def terminal?
317
383
  kind == :terminal
@@ -339,18 +405,14 @@ module EBNF
339
405
  expr.is_a?(Array) && expr.first == :seq
340
406
  end
341
407
 
342
- # Is this rule of the form (alt ...)?
343
- def alt?
344
- expr.is_a?(Array) && expr.first == :alt
345
- end
346
-
347
408
  def inspect
348
409
  "#<EBNF::Rule:#{object_id} " +
349
410
  {sym: sym, id: id, kind: kind, expr: expr}.inspect +
350
411
  ">"
351
412
  end
352
413
 
353
- # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}
414
+ # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
415
+ #
354
416
  # @param [Rule] other
355
417
  # @return [Boolean]
356
418
  def ==(other)
@@ -359,37 +421,259 @@ module EBNF
359
421
  expr == other.expr
360
422
  end
361
423
 
362
- # Two rules are equivalent if they have the same {#expr}
424
+ # Two rules are equivalent if they have the same {#expr}.
425
+ #
363
426
  # @param [Rule] other
364
427
  # @return [Boolean]
365
- def equivalent?(other)
366
- expr == other.expr
428
+ def eql?(other)
429
+ expr == other.expr
367
430
  end
368
431
 
369
- # Rewrite the rule substituting src_rule for dst_rule wherever
370
- # it is used in the production (first level only).
371
- # @param [Rule] src_rule
372
- # @param [Rule] dst_rule
373
- # @return [Rule]
374
- def rewrite(src_rule, dst_rule)
375
- case @expr
376
- when Array
377
- @expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e}
432
+ # Rules compare using their ids, falling back to their symbols when either id is missing
433
+ def <=>(other)
434
+ if id && other.id
435
+ if id == other.id
436
+ id.to_s <=> other.id.to_s
437
+ else
438
+ id.to_f <=> other.id.to_f
439
+ end
378
440
  else
379
- @expr = dst_rule.sym if @expr == src_rule.sym
441
+ sym.to_s <=> other.sym.to_s
380
442
  end
381
- self
382
443
  end
383
444
 
384
- # Rules compare using their ids
385
- def <=>(other)
386
- if id.to_i == other.id.to_i
387
- id.to_s <=> other.id.to_s
445
+ ##
446
+ # Utility function to translate code points of the form '#xN' into ruby unicode characters
447
+ def translate_codepoints(str)
448
+ str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
449
+ end
450
+
451
+ # Return the non-terminals for this rule.
452
+ #
453
+ # * `alt` => this is every non-terminal.
454
+ # * `diff` => this is every non-terminal.
455
+ # * `hex` => nil
456
+ # * `istr` => nil
457
+ # * `not` => this is the last expression, if any.
458
+ # * `opt` => this is the last expression, if any.
459
+ # * `plus` => this is the last expression, if any.
460
+ # * `range` => nil
461
+ # * `rept` => this is the last expression, if any.
462
+ # * `seq` => this is the first expression in the sequence, if any.
463
+ # * `star` => this is the last expression, if any.
464
+ #
465
+ # @param [Array<Rule>] ast
466
+ # The set of rules, used to turn symbols into rules
467
+ # @param [Array<Symbol,String,Array>] expr (@expr)
468
+ # The expression to check, defaults to the rule expression.
469
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
470
+ # @return [Array<Rule>]
471
+ # @note this is used for LL(1) transformation, so rule types are limited
472
+ def non_terminals(ast, expr = @expr)
473
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
474
+ case sym
475
+ when Symbol
476
+ r = ast.detect {|r| r.sym == sym}
477
+ r if r && r.rule?
478
+ when Array
479
+ non_terminals(ast, sym)
480
+ else
481
+ nil
482
+ end
483
+ end.flatten.compact.uniq
484
+ end
485
+
486
+ # Return the terminals for this rule.
487
+ #
488
+ # * `alt` => this is every terminal.
489
+ # * `diff` => this is every terminal.
490
+ # * `hex` => nil
491
+ # * `istr` => nil
492
+ # * `not` => this is the last expression, if any.
493
+ # * `opt` => this is the last expression, if any.
494
+ # * `plus` => this is the last expression, if any.
495
+ # * `range` => nil
496
+ # * `rept` => this is the last expression, if any.
497
+ # * `seq` => this is the first expression in the sequence, if any.
498
+ # * `star` => this is the last expression, if any.
499
+ #
500
+ # @param [Array<Rule>] ast
501
+ # The set of rules, used to turn symbols into rules
502
+ # @param [Array<Symbol,String,Array>] expr (@expr)
503
+ # The expression to check, defaults to the rule expression.
504
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
505
+ # @return [Array<Rule, String>]
506
+ # @note this is used for LL(1) transformation, so rule types are limited
507
+ def terminals(ast, expr = @expr)
508
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
509
+ case sym
510
+ when Symbol
511
+ r = ast.detect {|r| r.sym == sym}
512
+ r if r && r.terminal?
513
+ when String
514
+ sym
515
+ when Array
516
+ terminals(ast, sym)
517
+ end
518
+ end.flatten.compact.uniq
519
+ end
520
+
521
+ # Return the symbols used in the rule.
522
+ #
523
+ # @param [Array<Symbol,String,Array>] expr (@expr)
524
+ # The expression to check, defaults to the rule expression.
525
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
526
+ # @return [Array<Symbol>]
527
+ def symbols(expr = @expr)
528
+ expr[1..-1].map do |sym|
529
+ case sym
530
+ when Symbol
531
+ sym
532
+ when Array
533
+ symbols(sym)
534
+ end
535
+ end.flatten.compact.uniq
536
+ end
537
+
538
+ ##
539
+ # The following are used for LL(1) transformation.
540
+ ##
541
+
542
+ # Does this rule start with `sym`? It does if expr is that sym,
543
+ # expr starts with alt and contains that sym,
544
+ # or expr starts with seq and the next element is that sym.
545
+ #
546
+ # @param [Symbol, class] sym
547
+ # Symbol matching any start element, or if it is String, any start element which is a String
548
+ # @return [Array<Symbol, String>] list containing the single matching start symbol, or the strings which are start symbols, or nil if there are none
549
+ def starts_with?(sym)
550
+ if seq? && sym === (v = expr.fetch(1, nil))
551
+ [v]
552
+ elsif alt? && expr.any? {|e| sym === e}
553
+ expr.select {|e| sym === e}
554
+ else
555
+ nil
556
+ end
557
+ end
558
+
559
+ ##
560
+ # Validate the rule, with respect to an AST.
561
+ #
562
+ # @param [Array<Rule>] ast
563
+ # The set of rules, used to turn symbols into rules
564
+ # @param [Array<Symbol,String,Array>] expr (@expr)
565
+ # The expression to check, defaults to the rule expression.
566
+ # Typically, if the expression is recursive, the embedded expression is called recursively.
567
+ # @raise [SyntaxError]
568
+ def validate!(ast, expr = @expr)
569
+ op = expr.first
570
+ raise SyntaxError, "Unknown operator: #{op}" unless OP_ARGN.key?(op)
571
+ raise SyntaxError, "Argument count missmatch on operator #{op}, had #{expr.length - 1} expected #{OP_ARGN[op]}" if
572
+ OP_ARGN[op] && OP_ARGN[op] != expr.length - 1
573
+
574
+ # alt needs at least one operand; rept needs a valid min and max
575
+ if op == :alt
576
+ raise SyntaxError, "alt operation must have at least one operand, had #{expr.length - 1}" unless expr.length > 1
577
+ elsif op == :rept
578
+ raise SyntaxError, "rept operation must an non-negative integer minimum, was #{expr[1]}" unless
579
+ expr[1].is_a?(Integer) && expr[1] >= 0
580
+ raise SyntaxError, "rept operation must an non-negative integer maximum or '*', was #{expr[2]}" unless
581
+ expr[2] == '*' || expr[2].is_a?(Integer) && expr[2] >= 0
582
+ end
583
+
584
+ case op
585
+ when :hex
586
+ raise SyntaxError, "Hex operand must be of form '#xN+': #{sym}" unless expr.last.match?(/^#x\h+$/)
587
+ when :range
588
+ str = expr.last.dup
589
+ str = str[1..-1] if str.start_with?('^')
590
+ str = str[0..-2] if str.end_with?('-') # Allowed at end of range
591
+ scanner = StringScanner.new(str)
592
+ hex = rchar = in_range = false
593
+ while !scanner.eos?
594
+ begin
595
+ if scanner.scan(Terminals::HEX)
596
+ raise SyntaxError if in_range && rchar
597
+ rchar = in_range = false
598
+ hex = true
599
+ elsif scanner.scan(Terminals::R_CHAR)
600
+ raise SyntaxError if in_range && hex
601
+ hex = in_range = false
602
+ rchar = true
603
+ else
604
+ raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
605
+ end
606
+
607
+ if scanner.scan(/\-/)
608
+ raise SyntaxError if in_range
609
+ in_range = true
610
+ end
611
+ rescue SyntaxError
612
+ raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
613
+ end
614
+ end
388
615
  else
389
- id.to_i <=> other.id.to_i
616
+ ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).each do |sym|
617
+ case sym
618
+ when Symbol
619
+ r = ast.detect {|r| r.sym == sym}
620
+ raise SyntaxError, "No rule found for #{sym}" unless r
621
+ when Array
622
+ validate!(ast, sym)
623
+ when String
624
+ raise SyntaxError, "String must be of the form CHAR*" unless sym.match?(/^#{Terminals::CHAR}*$/)
625
+ end
626
+ end
390
627
  end
391
628
  end
392
629
 
630
+ ##
631
+ # Validate the rule, with respect to an AST.
632
+ #
633
+ # Uses `#validate!` and catches `SyntaxError`
634
+ #
635
+ # @param [Array<Rule>] ast
636
+ # The set of rules, used to turn symbols into rules
637
+ # @return [Boolean]
638
+ def valid?(ast)
639
+ validate!(ast)
640
+ true
641
+ rescue SyntaxError
642
+ false
643
+ end
644
+
645
+ # Do the firsts of this rule include the empty string?
646
+ #
647
+ # @return [Boolean]
648
+ def first_includes_eps?
649
+ @first && @first.include?(:_eps)
650
+ end
651
+
652
+ # Add terminals to the first set of this rule.
653
+ #
654
+ # @param [Array<Rule, Symbol, String>] terminals
655
+ # @return [Integer] the number of terminals added
656
+ def add_first(terminals)
657
+ @first ||= []
658
+ terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - @first
659
+ @first += terminals
660
+ terminals.length
661
+ end
662
+
663
+ # Add terminals as following this rule. Don't add _eps as a follow.
664
+ #
665
+ # @param [Array<Rule, Symbol, String>] terminals
666
+ # @return [Integer] the number of terminals added
667
+ def add_follow(terminals)
668
+ # Remove terminals already in follows, and empty string
669
+ terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - (@follow || []) - [:_eps]
670
+ unless terminals.empty?
671
+ @follow ||= []
672
+ @follow += terminals
673
+ end
674
+ terminals.length
675
+ end
676
+
393
677
  private
394
678
  def ttl_expr(expr, pfx, depth, is_obj = true)
395
679
  indent = ' ' * depth
@@ -405,17 +689,28 @@ module EBNF
405
689
 
406
690
  case op
407
691
  when :seq, :alt, :diff
692
+ # Multiple operands
408
693
  statements << %{#{indent}#{bra}#{pfx}:#{op} (}
409
694
  expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
410
695
  statements << %{#{indent} )#{ket}}
411
- when :opt, :plus, :star
696
+ when :opt, :plus, :star, :not
697
+ # Single operand
412
698
  statements << %{#{indent}#{bra}#{pfx}:#{op} }
413
699
  statements += ttl_expr(expr.first, pfx, depth + 1)
414
700
  statements << %{#{indent} #{ket}} unless ket.empty?
415
- when :_empty, :_eps, :_empty
701
+ when :rept
702
+ # Three operands (min, max and expr)
703
+ statements << %{ #{indent}#{pfx}:min #{expr[0].inspect};}
704
+ statements << %{ #{indent}#{pfx}:max #{expr[1].inspect};}
705
+ statements << %{#{indent}#{bra}#{pfx}:#{op} }
706
+ statements += ttl_expr(expr.last, pfx, depth + 1)
707
+ statements << %{#{indent} #{ket}} unless ket.empty?
708
+ when :_empty, :_eps
416
709
  statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
417
710
  when :"'"
418
711
  statements << %{#{indent}"#{esc(expr)}"}
712
+ when :istr
713
+ statements << %{#{indent}#{bra} re:matches #{expr.first.inspect} #{ket}}
419
714
  when :range
420
715
  statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
421
716
  when :hex
@@ -471,7 +766,7 @@ module EBNF
471
766
  def make_sym_id(variation = nil)
472
767
  @id_seq ||= 0
473
768
  @id_seq += 1
474
- ["_#{@sym}_#{@id_seq}#{variation}".to_sym, "#{@id}.#{@id_seq}#{variation}"]
769
+ ["_#{@sym}_#{@id_seq}#{variation}".to_sym, ("#{@id}.#{@id_seq}#{variation}" if @id)]
475
770
  end
476
771
  end
477
772
  end