ebnf 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,39 @@
1
module EBNF
  # Mixin that rewrites a rule set for PEG parsing and emits the resulting
  # rules as Ruby source. It relies on the including object to provide `ast`
  # (also read as `@ast`), `progress` and `debug`.
  module PEG
    autoload :Parser, 'ebnf/peg/parser'
    autoload :Rule, 'ebnf/peg/rule'

    ##
    # Transform EBNF Rule set for PEG parsing:
    #
    #   * Transform each rule into a set of sub-rules extracting unnamed sequences into new rules, using {Rule#to_peg}.
    # @return [EBNF] self
    def make_peg
      progress("make_peg") {"Start: #{@ast.length} rules"}
      new_ast = []

      ast.each do |rule|
        debug("make_peg") {"expand from: #{rule.inspect}"}
        new_rules = rule.to_peg
        debug(" => ") {new_rules.map(&:sym).join(', ')}
        # Append in place: `new_ast += new_rules` copied the whole accumulated
        # array again for every rule.
        new_ast.concat(new_rules)
      end

      # Replace the rule set only after every rule has been expanded.
      @ast = new_ast
      progress("make_peg") {"End: #{@ast.length} rules"}
      self
    end

    ##
    # Output Ruby parser files for PEG parsing
    #
    # Writes a `RULES` array literal with one line per rule in `ast`, each
    # rule's `to_ruby` text followed by `.extend(EBNF::PEG::Rule),`.
    #
    # @param [IO, StringIO] output
    # @param [Hash] options accepted for interface compatibility; not read here
    def to_ruby_peg(output, **options)
      output.puts " RULES = ["
      ast.each do |rule|
        output.puts " " + rule.to_ruby + '.extend(EBNF::PEG::Rule),'
      end
      output.puts " ]"
    end
  end
end
@@ -0,0 +1,535 @@
1
+ module EBNF::PEG
2
+ ##
3
+ # A Generic PEG parser using the parsed rules modified for PEG parsing.
4
+ module Parser
5
+ ##
6
+ # @return [Regexp, Rule] how to remove inter-rule whitespace
7
+ attr_reader :whitespace
8
+
9
+ ##
10
+ # @return [Scanner] used for scanning input.
11
+ attr_reader :scanner
12
+
13
+ ##
14
+ # A Hash structure used for memoizing rule results for a given input location.
15
+ #
16
+ # @example Partial structure for memoizing results for a particular rule
17
+ #
18
+ # {
19
+ # rule: {
20
+ # 86: {
21
+ # pos:
22
+ # result: [<EBNF::Rule:80 {
23
+ # sym: :ebnf,
24
+ # id: "1",
25
+ # kind: :rule,
26
+ # expr: [:star, [:alt, :declaration, :rule]]}>],
27
+ # }
28
+ # 131: [<EBNF::Rule:80 {sym: :ebnf,
29
+ # id: "1",
30
+ # kind: :rule,
31
+ # expr: [:star, [:alt, :declaration, :rule]]}>,
32
+ # <EBNF::Rule:100 {
33
+ # sym: :declaration,
34
+ # id: "2",
35
+ # kind: :rule,
36
+ # expr: [:alt, "@terminals", :pass]}>]
37
+ # },
38
+ # POSTFIX: {
39
+ # 80: "*",
40
+ # 368: "*",
41
+ # 399: "+"
42
+ # }
43
+ # }
44
+ # @return [Hash{Integer => Hash{Symbol => Object}}]
45
+ attr_reader :packrat
46
+
47
def self.included(base)
  # Hook run when this module is included: give the including class the
  # handler-registration DSL (terminal, start_production, production, ...)
  # as class-level methods.
  base.extend ClassMethods
end
50
+
51
+ # DSL for creating terminals and productions
52
module ClassMethods
  # Handler registries. Each is created lazily on first access and stored on
  # the object that was extended with this module.

  # @return [Hash{Symbol => Proc}] blocks registered with {#start_production}
  def start_handlers; (@start_handlers ||= {}); end
  # @return [Hash{Symbol => Array}] `[block, clear_packrat]` pairs registered with {#production}
  def production_handlers; (@production_handlers ||= {}); end
  # @return [Hash{Symbol => Proc}] blocks registered with {#terminal}
  def terminal_handlers; (@terminal_handlers ||= {}); end
  # @return [Hash{Symbol => Regexp}] patterns registered with {#terminal}
  def terminal_regexps; (@terminal_regexps ||= {}); end

  ##
  # Defines the pattern for a terminal node and a block to be invoked
  # when the terminal is encountered. If the block is missing, the
  # value of the terminal will be placed on the input hash to be returned
  # to a previous production. Block is called in an evaluation block from
  # the enclosing parser.
  #
  # If no block is provided, then the value which would have been passed to the block is used as the result directly.
  #
  # @param [Symbol] term
  #   The terminal name.
  # @param [Regexp] regexp (nil)
  #   Pattern used to scan for this terminal,
  #   defaults to the expression defined in the associated rule.
  #   If unset, the terminal rule is used for matching.
  # @param [Hash] options
  #   NOTE: accepted, but not read by this method.
  # @option options [Hash{String => String}] :map ({})
  #   A mapping from terminals, in lower-case form, to
  #   their canonical value
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [value, prod]
  # @yieldparam [String] value
  #   The scanned terminal value.
  # @yieldparam [Symbol] prod
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to calling parser.
  #   Should conform to the yield specs for #initialize
  def terminal(term, regexp = nil, **options, &block)
    # Only record what was actually supplied, so an earlier registration is
    # not overwritten with nil.
    terminal_regexps[term] = regexp if regexp
    terminal_handlers[term] = block if block_given?
  end

  ##
  # Defines a production called at the beginning of a particular production
  # with data from previous production along with data defined for the
  # current production. Block is called in an evaluation block from
  # the enclosing parser.
  #
  # @param [Symbol] term
  #   The rule name
  # @yield [data, block]
  # @yieldparam [Hash] data
  #   A Hash defined for the current production, during :start
  #   may be initialized with data to pass to further productions,
  #   during :finish, it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to calling parser.
  #   Should conform to the yield specs for #initialize
  # Yield to generate a triple
  def start_production(term, &block)
    start_handlers[term] = block
  end

  ##
  # Defines a production called when production of associated
  # non-terminals has completed
  # with data from previous production along with data defined for the
  # current production. Block is called in an evaluation block from
  # the enclosing parser.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @param [Boolean] clear_packrat (false)
  #   Clears the packrat state on completion to reduce memory requirements of parser. Use only on a top-level rule when it is determined that no further backtracking is necessary.
  # @yield [result, data, block]
  # @yieldparam [Object] result
  #   The result from successfully parsing the production.
  # @yieldparam [Hash] data
  #   A Hash defined for the current production, during :start
  #   may be initialized with data to pass to further productions,
  #   during :finish, it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to calling parser.
  #   Should conform to the yield specs for #initialize
  # @yieldreturn [Object] the result of this production.
  # Yield to generate a triple
  def production(term, clear_packrat: false, &block)
    # The flag is stored next to the block as a two-element array.
    production_handlers[term] = [block, clear_packrat]
  end

  # Evaluate a handler, delegating to the specified object.
  # This is necessary so that handlers can operate within the
  # binding context of the parser in which they're invoked.
  #
  # The object is remembered as the delegate used by #method_missing and
  # stays set after the block returns.
  #
  # @param [Object] object
  # @return [Object] the value of the given block
  def eval_with_binding(object)
    @delegate = object
    object.instance_eval {yield}
  end

  private

  # Forwards calls this object does not understand to the current delegate
  # (the object last given to #eval_with_binding). Without a delegate the
  # call falls through to the default behaviour.
  def method_missing(method, *args, &block)
    # `||= nil` reads the variable without tripping "not initialized" warnings.
    if @delegate ||= nil
      # special handling when last arg is **options
      params = @delegate.method(method).parameters
      if params.any? {|t, _| t == :keyrest} && args.last.is_a?(Hash)
        # Re-splat a trailing Hash as keywords for targets declaring **opts.
        opts = args.pop
        @delegate.send(method, *args, **opts, &block)
      else
        @delegate.send(method, *args, &block)
      end
    else
      super
    end
  end

  # Companion to #method_missing so that `respond_to?` agrees with what is
  # actually forwarded. Private delegate methods count too, because
  # forwarding goes through `send`.
  def respond_to_missing?(method, include_private = false)
    delegate = (@delegate ||= nil)
    (delegate && delegate.respond_to?(method, true)) || super
  end
end
167
+
168
+ ##
169
+ # Initializes a new parser instance.
170
+ #
171
+ # @param [String, #to_s] input
172
+ # @param [Symbol, #to_s] start
173
+ # The starting production for the parser. It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
174
+ # @param [Array<EBNF::PEG::Rule>] rules
175
+ # The parsed rules, which control parsing sequence.
176
+ # Identify the symbol of the starting rule with `start`.
177
+ # @param [Hash{Symbol => Object}] options
178
+ # @option options[Integer] :high_water passed to lexer
179
+ # @option options [Logger] :logger for errors/progress/debug.
180
+ # @option options[Integer] :low_water passed to lexer
181
+ # @option options [Symbol, Regexp] :whitespace
182
+ # Symbol of whitespace rule (defaults to `@pass`), or a regular expression
183
+ # for eating whitespace between non-terminal rules (strongly encouraged).
184
+ # @yield [context, *data]
185
+ # Yields to return data to parser
186
+ # @yieldparam [:statement, :trace] context
187
+ # Context for block
188
+ # @yieldparam [Symbol] *data
189
+ # Data specific to the call
190
+ # @return [Object] AST resulting from parse
191
+ # @raise [Exception] Raises exceptions for parsing errors
192
+ # or errors raised during processing callbacks. Internal
193
+ # errors are raised using {Error}.
194
def parse(input = nil, start = nil, rules = nil, **options, &block)
  # Positional arguments win; otherwise fall back to the same-named options.
  start ||= options[:start]
  rules ||= options[:rules] || []
  # Index rules by symbol; a later rule with the same symbol replaces an earlier one.
  @rules = rules.each_with_object({}) {|rule, memo| memo[rule.sym] = rule}
  @packrat = {}

  # Add parser reference to each rule
  @rules.each_value {|rule| rule.parser = self}

  # Take whitespace from options (a Regexp, or the Symbol of a rule),
  # otherwise the first rule answering `pass?`, otherwise a default pattern.
  @whitespace = case options[:whitespace]
  when Regexp then options[:whitespace]
  when Symbol then @rules[options[:whitespace]]
  end ||
    @rules.values.detect(&:pass?) ||
    /(?:\s|(?:#[^x][^\n\r]*))+/m.freeze

  @options = options.dup
  @productions = []
  @parse_callback = block
  @error_log = []
  @prod_data = []
  # Reset per-parse state. Left over from an earlier run, `@recovering`
  # would stop #error from logging anything and `@furthest_failure` would
  # describe a position in the previous input.
  @furthest_failure = nil
  @recovering = false

  @scanner = EBNF::LL1::Scanner.new(input)
  unless start.is_a?(Symbol)
    raise Error, "Starting production not defined" if start.nil?
    # Accept anything responding to #to_s; keep only the part after the last '#'.
    start = start.to_s.split('#').last.to_s.to_sym
  end
  start_rule = @rules[start]
  raise Error, "Starting production #{start.inspect} not defined" unless start_rule

  # Reports the furthest failure recorded so far; when none was recorded,
  # falls back to the scanner's current position instead of failing on nil.
  report_failure = lambda do
    failure = @furthest_failure
    pos = failure ? failure.pos : scanner.pos
    error("--top--", failure ? failure.to_s : "syntax error",
      pos: pos,
      lineno: failure ? failure.lineno : scanner.lineno,
      rest: scanner.string[pos, 20])
  end

  result = start_rule.parse(scanner)
  # Start rule wasn't matched, which is about the only error condition
  report_failure.call if result == :unmatched

  # Eat any remaining whitespace; anything left after that is unparsed input.
  start_rule.eat_whitespace(scanner)
  report_failure.call unless scanner.eos?

  # When all is said and done, raise the error log
  raise Error, @error_log.join("\n") unless @error_log.empty?

  result
end
247
+
248
+ # Depth of parsing, for log output.
249
def depth
  # Number of productions currently open; used to indent log output.
  # Tolerates @productions not having been assigned yet.
  (@productions || []).size
end

# Current ProdData element: the top of the @prod_data stack, or a new empty
# Hash when the stack is empty.
def prod_data
  @prod_data.last || {}
end

# Clear out packrat memoizer. This is appropriate when completing a top-level
# rule when there is no possibility of backtracking.
def clear_packrat
  @packrat.clear
end
256
+
257
+ ##
258
+ # Error information, used as level `3` logger messages.
259
+ # Messages may be logged and are saved for reporting at end of parsing.
260
+ #
261
+ # @param [String] node Relevant location associated with message
262
+ # @param [String] message Error string
263
+ # @param [Hash{Symbol => Object}] options
264
+ # @option options [URI, #to_s] :production
265
+ # @option options [Token] :token
266
+ # @see #debug
267
def error(node, message, **options)
  # Prefer an explicit :lineno, otherwise ask the scanner when there is one.
  line = options[:lineno] || (scanner.lineno if scanner)
  line_tag = line ? "[line: #{line}] " : ""
  found = options[:rest] ? " (found #{options[:rest].inspect})" : ""
  origin = options[:production] ? ", production = #{options[:production].inspect}" : ""
  text = "ERROR " + line_tag + message + found + origin

  # Only the first error is recorded until something clears @recovering.
  @error_log << text unless @recovering
  @recovering = true
  debug(node, text, level: 3, **options)

  # Fail immediately when asked to by the caller or by the :validate option.
  return unless options[:raise] || @options[:validate]
  raise Error.new(text, lineno: line, rest: options[:rest], production: options[:production])
end
281
+
282
+ ##
283
+ # Warning information, used as level `2` logger messages.
284
+ # Messages are passed to the logger, when one is configured; unlike errors, they are not saved for reporting at end of parsing.
285
+ #
286
+ # @param [String] node Relevant location associated with message
287
+ # @param [String] message Warning string
288
+ # @param [Hash] options
289
+ # @option options [URI, #to_s] :production
290
+ # @option options [Token] :token
291
+ # @see #debug
292
def warn(node, message, **options)
  # Prefer an explicit :lineno, otherwise ask the scanner when there is one.
  line = options[:lineno] || (scanner.lineno if scanner)
  line_tag = line ? "[line: #{line}] " : ""
  found = options[:rest] ? " (found #{options[:rest].inspect})" : ""
  origin = options[:production] ? ", production = #{options[:production].inspect}" : ""
  # The warning is only handed to #debug at level 2; nothing is stored here.
  debug(node, "WARNING " + line_tag + message + found + origin, level: 2, **options)
end
301
+
302
+ ##
303
+ # Progress logged when parsing. Passed as level `1` logger messages.
304
+ #
305
+ # The call is ignored, unless `@options[:logger]` is set.
306
+ #
307
+ # @overload progress(node, message, **options, &block)
308
+ # @param [String] node Relevant location associated with message
309
+ # @param [String] message ("")
310
+ # @param [Hash] options
311
+ # @option options [Integer] :depth
312
+ # Recursion depth for indenting output
313
+ # @see #debug
314
def progress(node, *args, &block)
  return unless @options[:logger]

  # Treat a trailing Hash as the options; default the level to 1 only when
  # the caller gave none (an explicit 0 is kept).
  opts = args.last.is_a?(Hash) ? args.pop : {}
  opts[:level] ||= 1
  debug(node, *args, opts, &block)
end
320
+
321
+ ##
322
+ # Debug logging.
323
+ #
324
+ # The call is ignored, unless `@options[:logger]` is set.
325
+ #
326
+ # @overload debug(node, message, **options)
327
+ # @param [Array<String>] args Relevant location associated with message
328
+ # @param [Hash] options
329
+ # @option options [Integer] :depth
330
+ # Recursion depth for indenting output
331
+ # @yieldreturn [String] additional string appended to `message`.
332
def debug(*args)
  return unless @options[:logger]

  # A trailing Hash carries the options; everything else is message text.
  opts = args.last.is_a?(Hash) ? args.pop : {}
  line = opts[:lineno] || (scanner.lineno if scanner)
  severity = opts.fetch(:level, 0)
  indent = opts[:depth] || depth

  # The block is only evaluated once we know a logger is present.
  args << yield if block_given?
  @options[:logger].add(severity, "[#{line}]" + (" " * indent) + args.join(" "))
end
342
+
343
+ # Start for production
344
+ # Adds data available during the processing of the production
345
def onStart(prod)
  start_handler = self.class.start_handlers[prod]
  @productions << prod
  debug("#{prod}(:start)", "",
    lineno: (scanner.lineno if scanner),
    pos: (scanner.pos if scanner),
    depth: (depth + 1)) do
    "#{prod}, pos: #{scanner ? scanner.pos : '?'}, rest: #{scanner ? scanner.rest[0..20].inspect : '?'}"
  end

  unless start_handler
    # Make sure we push as many as we pop, even if there is no explicit
    # start handler: a production handler still gets its own data element.
    return (@prod_data << {} if self.class.production_handlers[prod])
  end

  # Create a new production data element, letting the handler customize it
  # before it is pushed on the @prod_data stack.
  data = {}
  begin
    self.class.eval_with_binding(self) do
      start_handler.call(data, @parse_callback)
    end
  rescue ArgumentError, Error => e
    error("start", "#{e.class}: #{e.message}", production: prod)
    # A failing handler is reported but does not leave the parser recovering.
    @recovering = false
  end
  @prod_data << data
end
371
+
372
+ # Finish of production
373
+ #
374
+ # @param [Object] result parse result
375
+ # @return [Object] parse result, or the value returned from the handler
376
def onFinish(result)
  prod = @productions.last
  finish_handler, wipe_packrat = self.class.production_handlers[prod]
  # Pop the data element pushed by #onStart; one exists whenever the
  # production has a start handler or a production handler.
  data = (@prod_data.pop if finish_handler || self.class.start_handlers[prod])

  if finish_handler && !@recovering && result != :unmatched
    begin
      # The handler's return value replaces the parse result.
      result = self.class.eval_with_binding(self) do
        finish_handler.call(result, data, @parse_callback)
      end
    rescue ArgumentError, Error => e
      error("finish", "#{e.class}: #{e.message}", production: prod)
      # After a failed handler the result is false and recovery is cleared.
      result = @recovering = false
    end
  end

  progress("#{prod}(:finish)", "",
    depth: (depth + 1),
    lineno: (scanner.lineno if scanner),
    level: result == :unmatched ? 0 : 1) do
    "#{result.inspect}@(#{scanner ? scanner.pos : '?'}), rest: #{scanner ? scanner.rest[0..20].inspect : '?'}"
  end

  # Done whether or not the production matched.
  clear_packrat if wipe_packrat
  @productions.pop
  result
end
402
+
403
+ # A terminal with a defined handler
404
+ #
405
+ # @param [Symbol] prod from the symbol of the associated rule
406
+ # @param [String] value the scanned string
407
+ # @return [String, Object] either the result from the handler, or the token
408
def onTerminal(prod, value)
  # The terminal handler is told which production is currently open.
  parent = @productions.last
  term_handler = self.class.terminal_handlers[prod]

  if term_handler && value != :unmatched
    begin
      # The handler's return value replaces the scanned value.
      value = self.class.eval_with_binding(self) do
        term_handler.call(value, parent, @parse_callback)
      end
    rescue ArgumentError, Error => e
      error("terminal", "#{e.class}: #{e.message}", value: value, production: prod)
      # After a failed handler the value is false and recovery is cleared.
      value = @recovering = false
    end
  end

  progress("#{prod}(:terminal)", "",
    depth: (depth + 2),
    lineno: (scanner.lineno if scanner),
    level: value == :unmatched ? 0 : 1) do
    "#{value.inspect}@(#{scanner ? scanner.pos : '?'})"
  end
  value
end
429
+
430
+ ##
431
+ # Find a rule for a symbol
432
+ #
433
+ # @param [Symbol] sym
434
+ # @return [Rule]
435
def find_rule(sym)
  # @rules is the symbol-indexed table of rules.
  rule_table = @rules
  rule_table[sym]
end

##
# Find a regular expression defined for a terminal
#
# @param [Symbol] sym
# @return [Regexp] the pattern registered on the parser class, or nil if none
def find_terminal_regexp(sym)
  patterns = self.class.terminal_regexps
  patterns[sym]
end
447
+
448
+ ##
449
+ # Record furthest failure.
450
+ #
451
+ # @param [Integer] pos
452
+ # The position in the input stream where the failure occurred.
453
+ # @param [Integer] lineno
454
+ # Line where the failure occurred.
455
+ # @param [Symbol, String] token
456
+ # The terminal token or string which attempted to match.
457
+ # @see https://arxiv.org/pdf/1405.6646.pdf
458
def update_furthest_failure(pos, lineno, token)
  # Skip generated productions (symbols whose name starts with '_').
  return if token.is_a?(Symbol) && token.to_s.start_with?('_')

  current = @furthest_failure
  if current.nil? || pos > current.pos
    # First failure, or one further into the input: start a new record.
    @furthest_failure = Unmatched.new(pos, lineno, [token])
  elsif pos == current.pos
    # Same position: remember each expected token only once.
    expected = current[:expecting]
    expected << token unless expected.include?(token)
  end
end
467
+
468
+ public
469
+
470
+ ##
471
+ # @!parse
472
+ # # Record details about an unmatched rule, including the following:
473
+ # #
474
+ # # * Input location and line number at time of failure.
475
+ # # * The rule at which this was found (non-terminal, and not starting with '_').
476
+ # class Unmatched
477
+ # # @return [Integer] The position within the scanner which did not match.
478
+ # attr_reader :pos
479
+ # # @return [Integer] The line number which did not match.
480
+ # attr_reader :lineno
481
+ # # @return [Array<Symbol,String>]
482
+ # # Strings or production rules that attempted to match at this position.
483
+ # attr_reader :expecting
484
+ # end
485
class Unmatched < Struct.new(:pos, :lineno, :expecting)
  # Renders the failure as a syntax error message listing, in inspected
  # form, everything that was expected at this position.
  #
  # @return [String]
  def to_s
    expected = expecting.map(&:inspect)
    "syntax error, expecting " + expected.join(', ')
  end
end
490
+
491
+ ##
492
+ # Raised for errors during parsing.
493
+ #
494
+ # @example Raising a parser error
495
+ # raise Error.new(
496
+ # "invalid token '%' on line 10",
497
+ # rest: '%', lineno: 9, production: :turtleDoc)
498
+ #
499
+ # @see https://ruby-doc.org/core/classes/StandardError.html
500
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The read head when scanning failed
  #
  # @return [String]
  attr_reader :rest

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param [String, #to_s] message
  # @param [Hash{Symbol => Object}] options
  # @option options [Symbol] :production (nil)
  # @option options [String] :rest (nil)
  # @option options [Integer] :lineno (nil)
  def initialize(message, **options)
    # Options that were not supplied are left as nil.
    @production, @rest, @lineno = options.values_at(:production, :rest, :lineno)
    super(message.to_s)
  end
end # class Error
534
+ end # class Parser
535
+ end # module EBNF::LL1