ebnf 2.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,322 +1,305 @@
1
+ require_relative 'ebnf/meta'
2
+ require 'logger'
3
+
1
4
  module EBNF
2
- module Parser
3
- ##
4
- # Iterate over rule strings.
5
- # a line that starts with '\[' or '@' starts a new rule
6
- #
7
- # @param [StringScanner] scanner
8
- # @yield rule_string
9
- # @yieldparam [String] rule_string
10
- def eachRule(scanner)
11
- cur_lineno = 1
12
- r = ''
13
- until scanner.eos?
14
- case
15
- when s = scanner.scan(%r(\s+)m)
16
- # Eat whitespace
17
- cur_lineno += s.count("\n")
18
- #debug("eachRule(ws)") { "[#{cur_lineno}] #{s.inspect}" }
19
- when s = scanner.scan(%r(/\*([^\*]|\*[^\/])*\*/)m)
20
- # Eat comments /* .. */
21
- cur_lineno += s.count("\n")
22
- debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" }
23
- when s = scanner.scan(%r(\(\*([^\*]|\*[^\)])*\*\))m)
24
- # Eat comments (* .. *)
25
- cur_lineno += s.count("\n")
26
- debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" }
27
- when s = scanner.scan(%r((#(?!x)|//).*$))
28
- # Eat comments // & #
29
- cur_lineno += s.count("\n")
30
- debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" }
31
- when s = scanner.scan(/\A["']/)
32
- # Found a quote, scan until end of matching quote
33
- s += scanner.scan_until(/#{scanner.matched}|$/)
34
- r += s
35
- when s = scanner.scan(%r(^@terminals))
36
- #debug("eachRule(@terminals)") { "[#{cur_lineno}] #{s.inspect}" }
37
- yield(r) unless r.empty?
38
- @lineno = cur_lineno
39
- yield(s)
40
- r = ''
41
- when s = scanner.scan(/@pass/)
42
- # Found rule start, if we've already collected a rule, yield it
43
- #debug("eachRule(@pass)") { "[#{cur_lineno}] #{s.inspect}" }
44
- yield r unless r.empty?
45
- @lineno = cur_lineno
46
- r = s
47
- when s = scanner.scan(/(?:\[[\w\.]+\])\s*[\w\.]+\s*::=/)
48
- # Found rule start, if we've already collected a rule, yield it
49
- yield r unless r.empty?
50
- #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" }
51
- @lineno = cur_lineno
52
- r = s
53
- else
54
- # Collect until end of line, or start of comment or quote
55
- s = scanner.scan_until(%r{(?:[/\(]\*)|#(?!x)|//|["']|$})
56
- if scanner.matched.length > 0
57
- # Back up scan head before ending match
58
- scanner.pos = scanner.pos - scanner.matched.length
5
+ class Parser
6
+ include EBNF::PEG::Parser
7
+ include EBNF::Terminals
59
8
 
60
- # Remove matched from end of string
61
- s = s[0..-(scanner.matched.length+1)]
62
- end
63
- cur_lineno += s.count("\n")
64
- #debug("eachRule(rest)") { "[#{cur_lineno}] #{s.inspect}" }
65
- r += s
66
- end
67
- end
68
- yield r unless r.empty?
9
+ # Abstract syntax tree from parse
10
+ #
11
+ # @return [Array<EBNF::Rule>]
12
+ attr_reader :ast
13
+
14
+ # ## Terminals
15
+ # Define rules for Terminals, placing results on the input stack, making them available to upstream non-Terminal rules.
16
+ #
17
+ # Terminals are defined with a symbol matching the associated rule name, and an optional (although strongly encouraged) regular expression used to match the head of the input stream.
18
+ #
19
+ # The result of the terminal block is the semantic value of that terminal, which is often a string, but may be any instance which reflects the semantic interpretation of that terminal.
20
+ #
21
+ # The `value` parameter is the value matched by the regexp, if defined, or by the sub-terminal rules otherwise.
22
+ #
23
+ # The `prod` parameter is the name of the parent rule for which this terminal is matched, which may have a bearing in some circumstances, although not used in this example.
24
+ #
25
+ # If no block is provided, then the value which would have been passed to the block is used as the result directly.
26
+
27
+ # Match the Left hand side of a rule or terminal
28
+ #
29
+ # [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? SYMBOL ' '* '::='
30
+ terminal(:LHS, LHS) do |value, prod|
31
+ value.to_s.scan(/(?:\[([^\]]+)\])?\s*(\w+)\s*::=/).first
32
+ end
33
+
34
+ # Match `SYMBOL` terminal
35
+ #
36
+ # [12] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+
37
+ terminal(:SYMBOL, SYMBOL) do |value|
38
+ value.to_sym
69
39
  end
70
-
71
- ##
72
- # Parse a rule into an optional rule number, a symbol and an expression
73
- #
74
- # @param [String] rule
75
- # @return [Rule]
76
- def ruleParts(rule)
77
- num_sym, expr = rule.split('::=', 2).map(&:strip)
78
- num, sym = num_sym.split(']', 2).map(&:strip)
79
- num, sym = "", num if sym.nil?
80
- num = num[1..-1]
81
- r = Rule.new(sym && sym.to_sym, num, expression(expr).first, ebnf: self)
82
- debug("ruleParts") { r.inspect }
83
- r
40
+
41
+ # Match `HEX` terminal
42
+ #
43
+ # [13] HEX ::= '#x' ([a-f] | [A-F] | [0-9])+
44
+ terminal(:HEX, HEX) do |value|
45
+ [:hex, value]
84
46
  end
85
47
 
86
- ##
87
- # Parse a string into an expression tree and a remaining string
88
- #
89
- # @example
90
- # >>> expression("a b c")
91
- # ((seq a b c) '')
92
- #
93
- # >>> expression("a? b+ c*")
94
- # ((seq (opt a) (plus b) (star c)) '')
95
- #
96
- # >>> expression(" | x xlist")
97
- # ((alt (seq) (seq x xlist)) '')
98
- #
99
- # >>> expression("a | (b - c)")
100
- # ((alt a (diff b c)) '')
101
- #
102
- # >>> expression("a b | c d")
103
- # ((alt (seq a b) (seq c d)) '')
104
- #
105
- # >>> expression("a | b | c")
106
- # ((alt a b c) '')
107
- #
108
- # >>> expression("a) b c")
109
- # (a ' b c')
110
- #
111
- # >>> expression("BaseDecl? PrefixDecl*")
112
- # ((seq (opt BaseDecl) (star PrefixDecl)) '')
113
- #
114
- # >>> expression("NCCHAR1 | diff | [0-9] | #x00B7 | [#x0300-#x036F] | \[#x203F-#x2040\]")
115
- # ((alt NCCHAR1 diff
116
- # (range '0-9')
117
- # (hex '#x00B7')
118
- # (range '#x0300-#x036F')
119
- # (range, '#x203F-#x2040')) '')
120
- #
121
- # @param [String] s
122
- # @return [Array]
123
- def expression(s)
124
- debug("expression") {"(#{s.inspect})"}
125
- e, s = depth {alt(s)}
126
- debug {"=> alt returned #{[e, s].inspect}"}
127
- unless s.to_s.empty?
128
- t, ss = depth {terminal(s)}
129
- debug {"=> terminal returned #{[t, ss].inspect}"}
130
- return [e, ss] if t.is_a?(Array) && t.first == :")"
131
- end
132
- [e, s]
48
+ # Terminal for `RANGE` is matched as part of a `primary` rule.
49
+ #
50
+ # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS
51
+ terminal(:RANGE, RANGE) do |value|
52
+ [:range, value[1..-2]]
133
53
  end
134
-
135
- ##
136
- # Parse alt
137
- # >>> alt("a | b | c")
138
- # ((alt a b c) '')
139
- # @param [String] s
140
- # @return [Array]
141
- def alt(s)
142
- debug("alt") {"(#{s.inspect})"}
143
- args = []
144
- while !s.to_s.empty?
145
- e, s = depth {seq(s)}
146
- debug {"=> seq returned #{[e, s].inspect}"}
147
- if e.to_s.empty?
148
- break unless args.empty?
149
- e = [:seq, []] # empty sequence
150
- end
151
- args << e
152
- unless s.to_s.empty?
153
- t, ss = depth {terminal(s)}
154
- break unless t[0] == :alt
155
- s = ss
156
- end
157
- end
158
- args.length > 1 ? [args.unshift(:alt), s] : [e, s]
54
+
55
+ # Terminal for `O_RANGE` is matched as part of a `primary` rule.
56
+ #
57
+ # [15] O_RANGE ::= '[^' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']'
58
+ terminal(:O_RANGE, O_RANGE) do |value|
59
+ [:range, value[1..-2]]
159
60
  end
160
-
161
- ##
162
- # parse seq
163
- #
164
- # >>> seq("a b c")
165
- # ((seq a b c) '')
166
- #
167
- # >>> seq("a b? c")
168
- # ((seq a (opt b) c) '')
169
- def seq(s)
170
- debug("seq") {"(#{s.inspect})"}
171
- args = []
172
- while !s.to_s.empty?
173
- e, ss = depth {diff(s)}
174
- debug {"=> diff returned #{[e, ss].inspect}"}
175
- unless e.to_s.empty?
176
- args << e
177
- s = ss
178
- else
179
- break;
180
- end
181
- end
182
- if args.length > 1
183
- [args.unshift(:seq), s]
184
- elsif args.length == 1
185
- args + [s]
61
+
62
+ # Match double quote string
63
+ #
64
+ # [16] STRING1 ::= '"' (CHAR - '"')* '"'
65
+ terminal(:STRING1, STRING1) do |value|
66
+ value[1..-2]
67
+ end
68
+
69
+ # Match single quote string
70
+ #
71
+ # [17] STRING2 ::= "'" (CHAR - "'")* "'"
72
+ terminal(:STRING2, STRING2) do |value|
73
+ value[1..-2]
74
+ end
75
+
76
+ # The `CHAR` and `R_CHAR` productions are not used explicitly
77
+
78
+ # Match `POSTFIX` terminal
79
+ #
80
+ # [20] POSTFIX ::= [?*+]
81
+ terminal(:POSTFIX, POSTFIX)
82
+
83
+ # The `PASS` production is not used explicitly
84
+
85
+ # ## Non-terminal productions
86
+ # Define productions for non-Terminals. This can include `start_production` as well as `production` to hook into rule start and end. In some cases, we need to use sub-productions as generated when turning EBNF into PEG.
87
+ #
88
+ # Productions are defined with a symbol matching the associated rule name.
89
+ #
90
+ # The result of the productions is typically the abstract syntax tree matched by the rule, so far, but could be a specific semantic value, or could be ignored with the result being returned via the `callback`.
91
+ #
92
+ # The `value` parameter is the result returned from child productions
93
+ #
94
+ # The `data` parameter is other data which may be returned by child productions placing information onto their input (unused in this example).
95
+ #
96
+ # The `callback` parameter provides access to a callback defined in the call to `parse`.
97
+
98
+ # Production for end of `declaration` non-terminal.
99
+ #
100
+ # Look for `@terminals` to change parser state to parsing terminals.
101
+ #
102
+ # Clears the packrat parser when called.
103
+ #
104
+ # `@pass` is ignored here.
105
+ #
106
+ # [2] declaration ::= '@terminals' | pass
107
+ production(:declaration, clear_packrat: true) do |value, data, callback|
108
+ # value contains a declaration.
109
+ # Invoke callback
110
+ callback.call(:terminals) if value == '@terminals'
111
+ nil
112
+ end
113
+
114
+ # Production for end of `rule` non-terminal.
115
+ #
116
+ # By setting `as_hash: true` in the `start_production`, the `value` parameter will be in the form `{LHS: "v", expression: "v"}`. Otherwise, it would be expressed using an array of hashes of the form `[{LHS: "v"}, {expression: "v"}]`.
117
+ #
118
+ # Clears the packrat parser when called.
119
+ #
120
+ # Create rule from expression value and pass to callback
121
+ #
122
+ # [3] rule ::= LHS expression
123
+ start_production(:rule, as_hash: true)
124
+ production(:rule, clear_packrat: true) do |value, data, callback|
125
+ # value contains an expression.
126
+ # Invoke callback
127
+ id, sym = value[:LHS]
128
+ expression = value[:expression]
129
+ callback.call(:rule, EBNF::Rule.new(sym.to_sym, id, expression))
130
+ nil
131
+ end
132
+
133
+ # Production for end of `expression` non-terminal.
134
+ # Passes through the optimized value of the alt production as follows:
135
+ #
136
+ # The `value` parameter, is of the form `[{alt: "v"}]`.
137
+ #
138
+ # [:alt foo] => foo
139
+ # [:alt foo bar] => [:alt foo bar]
140
+ #
141
+ # [4] expression ::= alt
142
+ production(:expression) do |value|
143
+ value.first[:alt]
144
+ end
145
+
146
+ # Production for end of `alt` non-terminal.
147
+ # Passes through the optimized value of the seq production as follows:
148
+ #
149
+ # The `value` parameter, is of the form `{seq: "v", _alt_1: "v"}`.
150
+ #
151
+ # [:seq foo] => foo
152
+ # [:seq foo bar] => [:seq foo bar]
153
+ #
154
+ # Note that this also may just pass through from `_alt_1`
155
+ #
156
+ # [5] alt ::= seq ('|' seq)*
157
+ start_production(:alt, as_hash: true)
158
+ production(:alt) do |value|
159
+ if value[:_alt_1].length > 0
160
+ [:alt, value[:seq]] + value[:_alt_1]
186
161
  else
187
- ["", s]
162
+ value[:seq]
188
163
  end
189
164
  end
190
-
191
- ##
192
- # parse diff
193
- #
194
- # >>> diff("a - b")
195
- # ((diff a b) '')
196
- def diff(s)
197
- debug("diff") {"(#{s.inspect})"}
198
- e1, s = depth {postfix(s)}
199
- debug {"=> postfix returned #{[e1, s].inspect}"}
200
- unless e1.to_s.empty?
201
- unless s.to_s.empty?
202
- t, ss = depth {terminal(s)}
203
- debug {"diff #{[t, ss].inspect}"}
204
- if t.is_a?(Array) && t.first == :diff
205
- s = ss
206
- e2, s = primary(s)
207
- unless e2.to_s.empty?
208
- return [[:diff, e1, e2], s]
209
- else
210
- error("diff", "Syntax Error")
211
- raise "Syntax Error"
212
- end
213
- end
214
- end
215
- end
216
- [e1, s]
165
+
166
+ # Production for end of `_alt_1` non-terminal.
167
+ # Used to collect the `('|' seq)*` portion of the `alt` non-terminal:
168
+ #
169
+ # The `value` parameter, is of the form `[{seq: ["v"]}]`.
170
+ #
171
+ # [5] _alt_1 ::= ('|' seq)*
172
+ production(:_alt_1) do |value|
173
+ value.map {|a1| a1.last[:seq]}.compact # Get rid of '|'
217
174
  end
218
-
219
- ##
220
- # parse postfix
221
- #
222
- # >>> postfix("a b c")
223
- # (a ' b c')
224
- #
225
- # >>> postfix("a? b c")
226
- # ((opt, a) ' b c')
227
- def postfix(s)
228
- debug("postfix") {"(#{s.inspect})"}
229
- e, s = depth {primary(s)}
230
- debug {"=> primary returned #{[e, s].inspect}"}
231
- return ["", s] if e.to_s.empty?
232
- if !s.to_s.empty?
233
- t, ss = depth {terminal(s)}
234
- debug {"=> #{[t, ss].inspect}"}
235
- if t.is_a?(Array) && [:opt, :star, :plus].include?(t.first)
236
- return [[t.first, e], ss]
237
- end
238
- end
239
- [e, s]
175
+
176
+ # Production for end of `seq` non-terminal.
177
+ # Passes through the optimized value of the `diff` production as follows:
178
+ #
179
+ # The `value` parameter, is an array of values, which cannot be empty.
180
+ #
181
+ # [:diff foo] => foo
182
+ # [:diff foo bar] => [:diff foo bar]
183
+ #
184
+ # Note that this also may just pass through from `_seq_1`
185
+ #
186
+ # [6] seq ::= diff+
187
+ production(:seq) do |value|
188
+ value.length == 1 ? value.first : ([:seq] + value)
240
189
  end
241
190
 
242
- ##
243
- # parse primary
244
- #
245
- # >>> primary("a b c")
246
- # (a ' b c')
247
- def primary(s)
248
- debug("primary") {"(#{s.inspect})"}
249
- t, s = depth {terminal(s)}
250
- debug {"=> terminal returned #{[t, s].inspect}"}
251
- if t.is_a?(Symbol) || t.is_a?(String)
252
- [t, s]
253
- elsif %w(range hex).map(&:to_sym).include?(t.first)
254
- [t, s]
255
- elsif t.first == :"("
256
- e, s = depth {expression(s)}
257
- debug {"=> expression returned #{[e, s].inspect}"}
258
- [e, s]
191
+ # `Diff` production returns concatenated postfix values
192
+ #
193
+ # The `value` parameter, is of the form `{postfix: "v", _diff_1: "v"}`.
194
+ #
195
+ # [7] diff ::= postfix ('-' postfix)?
196
+ start_production(:diff, as_hash: true)
197
+ production(:diff) do |value|
198
+ if value[:_diff_1]
199
+ [:diff, value[:postfix], value[:_diff_1]]
259
200
  else
260
- ["", s]
201
+ value[:postfix]
261
202
  end
262
203
  end
263
-
264
- ##
265
- # parse one terminal; return the terminal and the remaining string
266
- #
267
- # A terminal is represented as a tuple whose 1st item gives the type;
268
- # some types have additional info in the tuple.
269
- #
270
- # @example
271
- # >>> terminal("'abc' def")
272
- # ('abc' ' def')
273
- #
274
- # >>> terminal("[0-9]")
275
- # ((range '0-9') '')
276
- # >>> terminal("#x00B7")
277
- # ((hex '#x00B7') '')
278
- # >>> terminal ("\[#x0300-#x036F\]")
279
- # ((range '#x0300-#x036F') '')
280
- # >>> terminal("\[^<>'{}|^`\]-\[#x00-#x20\]")
281
- # ((range "^<>'{}|^`") '-\[#x00-#x20\]')
282
- def terminal(s)
283
- s = s.strip
284
- #STDERR.puts s.inspect
285
- case m = s[0,1]
286
- when '"', "'" # STRING1 or STRING2
287
- l, s = s[1..-1].split(m.rstrip, 2)
288
- [LL1::Lexer.unescape_string(l), s]
289
- when '[' # RANGE, O_RANGE
290
- l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
291
- [[:range, LL1::Lexer.unescape_string(l)], s]
292
- when '#' # HEX
293
- s.match(/(#x\h+)(.*)$/)
294
- l, s = $1, $2
295
- [[:hex, l], s]
296
- when /[\w\.]/ # SYMBOL
297
- s.match(/([\w\.]+)(.*)$/)
298
- l, s = $1, $2
299
- [l.to_sym, s]
300
- when '@' # @pass or @terminals
301
- s.match(/@(#\w+)(.*)$/)
302
- l, s = $1, $2
303
- [[:"@", l], s]
304
- when '-'
305
- [[:diff], s[1..-1]]
306
- when '?'
307
- [[:opt], s[1..-1]]
308
- when '|'
309
- [[:alt], s[1..-1]]
310
- when '+'
311
- [[:plus], s[1..-1]]
312
- when '*'
313
- [[:star], s[1..-1]]
314
- when /[\(\)]/ # '(' or ')'
315
- [[m.to_sym], s[1..-1]]
316
- else
317
- error("terminal", "unrecognized terminal: #{s.inspect}")
318
- raise "Syntax Error, unrecognized terminal: #{s.inspect}"
204
+
205
+ production(:_diff_1) do |value|
206
+ value.last[:postfix] if value
207
+ end
208
+
209
+ # Production for end of `postfix` non-terminal.
210
+ # Either returns the `primary` production value, or as modified by the `postfix`.
211
+ #
212
+ # The `value` parameter, is of the form `{primary: "v", _postfix_1: "v"}`.
213
+ #
214
+ # [:primary] => [:primary]
215
+ # [:primary, '*'] => [:star, :primary]
216
+ # [:primary, '+'] => [:plus, :primary]
217
+ # [:primary, '?'] => [:opt, :primary]
218
+ #
219
+ # [8] postfix ::= primary POSTFIX?
220
+ start_production(:postfix, as_hash: true)
221
+ production(:postfix) do |value|
222
+ # Push result onto input stack, as the `diff` production can have some number of `postfix` values that are applied recursively
223
+ case value[:_postfix_1]
224
+ when "*" then [:star, value[:primary]]
225
+ when "+" then [:plus, value[:primary]]
226
+ when "?" then [:opt, value[:primary]]
227
+ else value[:primary]
228
+ end
229
+ end
230
+
231
+ # Production for end of `primary` non-terminal.
232
+ # Places `:primary` on the stack
233
+ #
234
+ # The `value` parameter, is either a string (for a terminal) or an array of hashes of the form `[{'(': '('}, {expression: "v"}, {')': ')'}]`.
235
+ #
236
+ # This may either be a terminal, or the result of an `expression`.
237
+ #
238
+ # [9] primary ::= HEX
239
+ # | SYMBOL
240
+ # | RANGE
241
+ # | O_RANGE
242
+ # | STRING1
243
+ # | STRING2
244
+ # | '(' expression ')'
245
+ production(:primary) do |value|
246
+ Array(value).length > 2 ? value[1][:expression] : value
247
+ end
248
+
249
+ # Production for end of pass non-terminal.
250
+ #
251
+ # [10] pass ::= '@pass' expression
252
+ production(:pass) do |value, data, callback|
253
+ # Invoke callback
254
+ callback.call(:pass, value.last[:expression])
255
+ end
256
+
257
+ # ## Parser invocation.
258
+ # On start, yield ourselves if a block is given, otherwise, return this parser instance
259
+ #
260
+ # @param [#read, #to_s] input
261
+ # @param [Hash{Symbol => Object}] options
262
+ # @option options [Integer] :level
263
+ # Trace level. 0(debug), 1(info), 2(warn), 3(error).
264
+ # @return [EBNF::Parser]
265
+ def initialize(input, **options, &block)
266
+ # If the `level` option is set, instantiate a logger for collecting trace information.
267
+ if options.has_key?(:level)
268
+ options[:logger] = Logger.new(STDERR)
269
+ options[:logger].level = options[:level]
270
+ options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}
271
+ end
272
+
273
+ # Read input, if necessary, which will be used in a Scanner.
274
+ @input = input.respond_to?(:read) ? input.read : input.to_s
275
+
276
+ parsing_terminals = false
277
+ @ast = []
278
+ parse(@input, :ebnf, EBNFMeta::RULES,
279
+ # Use an optimized Regexp for whitespace
280
+ whitespace: EBNF::Terminals::PASS,
281
+ **options
282
+ ) do |context, *data|
283
+ rule = case context
284
+ when :terminals
285
+ # After parsing `@terminals`
286
+ # This changes the state of the parser to treat subsequent rules as terminals.
287
+ parsing_terminals = true
288
+ rule = EBNF::Rule.new(nil, nil, data.first, kind: :terminals)
289
+ when :pass
290
+ # After parsing `@pass`
291
+ # This defines a specific rule for whitespace.
292
+ rule = EBNF::Rule.new(nil, nil, data.first, kind: :pass)
293
+ when :rule
294
+ # A rule which has already been turned into a `Rule` object.
295
+ rule = data.first
296
+ rule.kind = :terminal if parsing_terminals
297
+ rule
298
+ end
299
+ @ast << rule if rule
319
300
  end
301
+ rescue EBNF::PEG::Parser::Error => e
302
+ raise SyntaxError, e.message
320
303
  end
321
304
  end
322
305
  end