ebnf 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ module EBNF
2
+ module PEG
3
+ autoload :Parser, 'ebnf/peg/parser'
4
+ autoload :Rule, 'ebnf/peg/rule'
5
+
6
+ ##
7
+ # Transform EBNF Rule set for PEG parsing:
8
+ #
9
+ # * Transform each rule into a set of sub-rules extracting unnamed sequences into new rules, using {Rule#to_peg}.
10
+ # @return [EBNF] self
11
+ def make_peg
12
+ progress("make_peg") {"Start: #{@ast.length} rules"}
13
+ new_ast = []
14
+
15
+ ast.each do |rule|
16
+ debug("make_peg") {"expand from: #{rule.inspect}"}
17
+ new_rules = rule.to_peg
18
+ debug(" => ") {new_rules.map(&:sym).join(', ')}
19
+ new_ast += new_rules
20
+ end
21
+
22
+ @ast = new_ast
23
+ progress("make_peg") {"End: #{@ast.length} rules"}
24
+ self
25
+ end
26
+
27
+ ##
28
+ # Output Ruby parser files for PEG parsing
29
+ #
30
+ # @param [IO, StringIO] output
31
+ def to_ruby_peg(output, **options)
32
+ output.puts " RULES = ["
33
+ ast.each do |rule|
34
+ output.puts " " + rule.to_ruby + '.extend(EBNF::PEG::Rule),'
35
+ end
36
+ output.puts " ]"
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,535 @@
1
+ module EBNF::PEG
2
+ ##
3
+ # A Generic PEG parser using the parsed rules modified for PEG parsing.
4
+ module Parser
5
+ ##
6
+ # @return [Regexp, Rule] how to remove inter-rule whitespace
7
+ attr_reader :whitespace
8
+
9
+ ##
10
+ # @return [Scanner] used for scanning input.
11
+ attr_reader :scanner
12
+
13
+ ##
14
+ # A Hash structure used for memoizing rule results for a given input location.
15
+ #
16
+ # @example Partial structure for memoizing results for a particular rule
17
+ #
18
+ # {
19
+ # rule: {
20
+ # 86: {
21
+ # pos:
22
+ # result: [<EBNF::Rule:80 {
23
+ # sym: :ebnf,
24
+ # id: "1",
25
+ # kind: :rule,
26
+ # expr: [:star, [:alt, :declaration, :rule]]}>],
27
+ # }
28
+ # 131: [<EBNF::Rule:80 {sym: :ebnf,
29
+ # id: "1",
30
+ # kind: :rule,
31
+ # expr: [:star, [:alt, :declaration, :rule]]}>,
32
+ # <EBNF::Rule:100 {
33
+ # sym: :declaration,
34
+ # id: "2",
35
+ # kind: :rule,
36
+ # expr: [:alt, "@terminals", :pass]}>]
37
+ # },
38
+ # POSTFIX: {
39
+ # 80: "*",
40
+ # 368: "*",
41
+ # 399: "+"
42
+ # }
43
+ # }
44
+ # @return [Hash{Integer => Hash{Symbol => Object}}]
45
+ attr_reader :packrat
46
+
47
+ def self.included(base)
48
+ base.extend(ClassMethods)
49
+ end
50
+
51
+ # DSL for creating terminals and productions
52
+ module ClassMethods
53
+ def start_handlers; (@start_handlers ||= {}); end
54
+ def production_handlers; (@production_handlers ||= {}); end
55
+ def terminal_handlers; (@terminal_handlers ||= {}); end
56
+ def terminal_regexps; (@terminal_regexps ||= {}); end
57
+
58
+ ##
59
+ # Defines the pattern for a terminal node and a block to be invoked
60
+ # when the terminal is encountered. If the block is missing, the
61
+ # value of the terminal will be placed on the input hash to be returned
62
+ # to a previous production. Block is called in an evaluation block from
63
+ # the enclosing parser.
64
+ #
65
+ # If no block is provided, then the value which would have been passed to the block is used as the result directly.
66
+ #
67
+ # @param [Symbol] term
68
+ # The terminal name.
69
+ # @param [Regexp] regexp (nil)
70
+ # Pattern used to scan for this terminal,
71
+ # defaults to the expression defined in the associated rule.
72
+ # If unset, the terminal rule is used for matching.
73
+ # @param [Hash] options
74
+ # @option options [Hash{String => String}] :map ({})
75
+ # A mapping from terminals, in lower-case form, to
76
+ # their canonical value
77
+ # @option options [Boolean] :unescape
78
+ # Cause strings and codepoints to be unescaped.
79
+ # @yield [value, prod]
80
+ # @yieldparam [String] value
81
+ # The scanned terminal value.
82
+ # @yieldparam [Symbol] prod
83
+ # A symbol indicating the production which referenced this terminal
84
+ # @yieldparam [Proc] block
85
+ # Block passed to initialization for yielding to calling parser.
86
+ # Should conform to the yield specs for #initialize
87
+ def terminal(term, regexp = nil, **options, &block)
88
+ terminal_regexps[term] = regexp if regexp
89
+ terminal_handlers[term] = block if block_given?
90
+ end
91
+
92
+ ##
93
+ # Defines a production called at the beginning of a particular production
94
+ # with data from previous production along with data defined for the
95
+ # current production. Block is called in an evaluation block from
96
+ # the enclosing parser.
97
+ #
98
+ # @param [Symbol] term
99
+ # The rule name
100
+ # @yield [data, block]
101
+ # @yieldparam [Hash] data
102
+ # A Hash defined for the current production, during :start
103
+ # may be initialized with data to pass to further productions,
104
+ # during :finish, it contains data placed by earlier productions
105
+ # @yieldparam [Proc] block
106
+ # Block passed to initialization for yielding to calling parser.
107
+ # Should conform to the yield specs for #initialize
108
+ # Yield to generate a triple
109
+ def start_production(term, &block)
110
+ start_handlers[term] = block
111
+ end
112
+
113
+ ##
114
+ # Defines a production called when production of associated
115
+ # non-terminals has completed
116
+ # with data from previous production along with data defined for the
117
+ # current production. Block is called in an evaluation block from
118
+ # the enclosing parser.
119
+ #
120
+ # @param [Symbol] term
121
+ # Term which is a key in the branch table
122
+ # @param [Boolean] clear_packrat (false)
123
+ # Clears the packrat state on completion to reduce memory requirements of parser. Use only on a top-level rule when it is determined that no further backtracking is necessary.
124
+ # @yield [result, data, block]
125
+ # @yieldparam [Object] result
126
+ # The result from successfully parsing the production.
127
+ # @yieldparam [Hash] data
128
+ # A Hash defined for the current production, during :start
129
+ # may be initialized with data to pass to further productions,
130
+ # during :finish, it contains data placed by earlier productions
131
+ # @yieldparam [Proc] block
132
+ # Block passed to initialization for yielding to calling parser.
133
+ # Should conform to the yield specs for #initialize
134
+ # @yieldreturn [Object] the result of this production.
135
+ # Yield to generate a triple
136
+ def production(term, clear_packrat: false, &block)
137
+ production_handlers[term] = [block, clear_packrat]
138
+ end
139
+
140
+ # Evaluate a handler, delegating to the specified object.
141
+ # This is necessary so that handlers can operate within the
142
+ # binding context of the parser in which they're invoked.
143
+ # @param [Object] object
144
+ # @return [Object]
145
+ def eval_with_binding(object)
146
+ @delegate = object
147
+ object.instance_eval {yield}
148
+ end
149
+
150
+ private
151
+
152
+ def method_missing(method, *args, &block)
153
+ if @delegate ||= nil
154
+ # special handling when last arg is **options
155
+ params = @delegate.method(method).parameters
156
+ if params.any? {|t, _| t == :keyrest} && args.last.is_a?(Hash)
157
+ opts = args.pop
158
+ @delegate.send(method, *args, **opts, &block)
159
+ else
160
+ @delegate.send(method, *args, &block)
161
+ end
162
+ else
163
+ super
164
+ end
165
+ end
166
+ end
167
+
168
+ ##
169
+ # Initializes a new parser instance.
170
+ #
171
+ # @param [String, #to_s] input
172
+ # @param [Symbol, #to_s] start
173
+ # The starting production for the parser. It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
174
+ # @param [Array<EBNF::PEG::Rule>] rules
175
+ # The parsed rules, which control parsing sequence.
176
+ # Identify the symbol of the starting rule with `start`.
177
+ # @param [Hash{Symbol => Object}] options
178
+ # @option options [Integer] :high_water passed to lexer
179
+ # @option options [Logger] :logger for errors/progress/debug.
180
+ # @option options [Integer] :low_water passed to lexer
181
+ # @option options [Symbol, Regexp] :whitespace
182
+ # Symbol of whitespace rule (defaults to `@pass`), or a regular expression
183
+ # for eating whitespace between non-terminal rules (strongly encouraged).
184
+ # @yield [context, *data]
185
+ # Yields to return data to parser
186
+ # @yieldparam [:statement, :trace] context
187
+ # Context for block
188
+ # @yieldparam [Symbol] *data
189
+ # Data specific to the call
190
+ # @return [Object] AST resulting from parse
191
+ # @raise [Exception] Raises exceptions for parsing errors
192
+ # or errors raised during processing callbacks. Internal
193
+ # errors are raised using {Error}.
194
+ def parse(input = nil, start = nil, rules = nil, **options, &block)
195
+ start ||= options[:start]
196
+ rules ||= options[:rules] || []
197
+ @rules = rules.inject({}) {|memo, rule| memo.merge(rule.sym => rule)}
198
+ @packrat = {}
199
+
200
+ # Add parser reference to each rule
201
+ @rules.each_value {|rule| rule.parser = self}
202
+
203
+ # Take whitespace from options (a regular expression, or the symbol of a named rule), the first `pass` rule, or a default pattern
204
+ @whitespace = case options[:whitespace]
205
+ when Regexp then options[:whitespace]
206
+ when Symbol then @rules[options[:whitespace]]
207
+ end ||
208
+ @rules.values.detect(&:pass?) ||
209
+ /(?:\s|(?:#[^x][^\n\r]*))+/m.freeze
210
+
211
+ @options = options.dup
212
+ @productions = []
213
+ @parse_callback = block
214
+ @error_log = []
215
+ @prod_data = []
216
+
217
+ @scanner = EBNF::LL1::Scanner.new(input)
218
+ start = start.split('#').last.to_sym unless start.is_a?(Symbol)
219
+ start_rule = @rules[start]
220
+ raise Error, "Starting production #{start.inspect} not defined" unless start_rule
221
+
222
+ result = start_rule.parse(scanner)
223
+ if result == :unmatched
224
+ # Start rule wasn't matched, which is about the only error condition
225
+ error("--top--", @furthest_failure.to_s,
226
+ pos: @furthest_failure.pos,
227
+ lineno: @furthest_failure.lineno,
228
+ rest: scanner.string[@furthest_failure.pos, 20])
229
+ end
230
+
231
+ # Eat any remaining whitespace
232
+ start_rule.eat_whitespace(scanner)
233
+ if !scanner.eos?
234
+ error("--top--", @furthest_failure.to_s,
235
+ pos: @furthest_failure.pos,
236
+ lineno: @furthest_failure.lineno,
237
+ rest: scanner.string[@furthest_failure.pos, 20])
238
+ end
239
+
240
+ # When all is said and done, raise the error log
241
+ unless @error_log.empty?
242
+ raise Error, @error_log.join("\n")
243
+ end
244
+
245
+ result
246
+ end
247
+
248
+ # Depth of parsing, for log output.
249
+ def depth; (@productions || []).length; end
250
+
251
+ # Current ProdData element
252
+ def prod_data; @prod_data.last || {}; end
253
+
254
+ # Clear out packrat memoizer. This is appropriate when completing a top-level rule when there is no possibility of backtracking.
255
+ def clear_packrat; @packrat.clear; end
256
+
257
+ ##
258
+ # Error information, used as level `3` logger messages.
259
+ # Messages may be logged and are saved for reporting at end of parsing.
260
+ #
261
+ # @param [String] node Relevant location associated with message
262
+ # @param [String] message Error string
263
+ # @param [Hash{Symbol => Object}] options
264
+ # @option options [URI, #to_s] :production
265
+ # @option options [Token] :token
266
+ # @see #debug
267
+ def error(node, message, **options)
268
+ lineno = options[:lineno] || (scanner.lineno if scanner)
269
+ m = "ERROR "
270
+ m += "[line: #{lineno}] " if lineno
271
+ m += message
272
+ m += " (found #{options[:rest].inspect})" if options[:rest]
273
+ m += ", production = #{options[:production].inspect}" if options[:production]
274
+ @error_log << m unless @recovering
275
+ @recovering = true
276
+ debug(node, m, level: 3, **options)
277
+ if options[:raise] || @options[:validate]
278
+ raise Error.new(m, lineno: lineno, rest: options[:rest], production: options[:production])
279
+ end
280
+ end
281
+
282
+ ##
283
+ # Warning information, used as level `2` logger messages.
284
+ # Messages may be logged and are saved for reporting at end of parsing.
285
+ #
286
+ # @param [String] node Relevant location associated with message
287
+ # @param [String] message Warning string
288
+ # @param [Hash] options
289
+ # @option options [URI, #to_s] :production
290
+ # @option options [Token] :token
291
+ # @see #debug
292
+ def warn(node, message, **options)
293
+ lineno = options[:lineno] || (scanner.lineno if scanner)
294
+ m = "WARNING "
295
+ m += "[line: #{lineno}] " if lineno
296
+ m += message
297
+ m += " (found #{options[:rest].inspect})" if options[:rest]
298
+ m += ", production = #{options[:production].inspect}" if options[:production]
299
+ debug(node, m, level: 2, **options)
300
+ end
301
+
302
+ ##
303
+ # Progress logged when parsing. Passed as level `1` logger messages.
304
+ #
305
+ # The call is ignored, unless `@options[:logger]` is set.
306
+ #
307
+ # @overload progress(node, message, **options, &block)
308
+ # @param [String] node Relevant location associated with message
309
+ # @param [String] message ("")
310
+ # @param [Hash] options
311
+ # @option options [Integer] :depth
312
+ # Recursion depth for indenting output
313
+ # @see #debug
314
+ def progress(node, *args, &block)
315
+ return unless @options[:logger]
316
+ args << {} unless args.last.is_a?(Hash)
317
+ args.last[:level] ||= 1
318
+ debug(node, *args, &block)
319
+ end
320
+
321
+ ##
322
+ # Debug logging.
323
+ #
324
+ # The call is ignored, unless `@options[:logger]` is set.
325
+ #
326
+ # @overload debug(node, message, **options)
327
+ # @param [Array<String>] args Relevant location associated with message
328
+ # @param [Hash] options
329
+ # @option options [Integer] :depth
330
+ # Recursion depth for indenting output
331
+ # @yieldreturn [String] additional string appended to `message`.
332
+ def debug(*args)
333
+ return unless @options[:logger]
334
+ options = args.last.is_a?(Hash) ? args.pop : {}
335
+ lineno = options[:lineno] || (scanner.lineno if scanner)
336
+ level = options.fetch(:level, 0)
337
+
338
+ depth = options[:depth] || self.depth
339
+ args << yield if block_given?
340
+ @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
341
+ end
342
+
343
+ # Start for production
344
+ # Adds data available during the processing of the production
345
+ def onStart(prod)
346
+ handler = self.class.start_handlers[prod]
347
+ @productions << prod
348
+ debug("#{prod}(:start)", "",
349
+ lineno: (scanner.lineno if scanner),
350
+ pos: (scanner.pos if scanner),
351
+ depth: (depth + 1)) {"#{prod}, pos: #{scanner ? scanner.pos : '?'}, rest: #{scanner ? scanner.rest[0..20].inspect : '?'}"}
352
+ if handler
353
+ # Create a new production data element, potentially allowing handler
354
+ # to customize before pushing on the @prod_data stack
355
+ data = {}
356
+ begin
357
+ self.class.eval_with_binding(self) {
358
+ handler.call(data, @parse_callback)
359
+ }
360
+ rescue ArgumentError, Error => e
361
+ error("start", "#{e.class}: #{e.message}", production: prod)
362
+ @recovering = false
363
+ end
364
+ @prod_data << data
365
+ elsif self.class.production_handlers[prod]
366
+ # Make sure we push as many as we pop, even if there is no
367
+ # explicit start handler
368
+ @prod_data << {}
369
+ end
370
+ end
371
+
372
+ # Finish of production
373
+ #
374
+ # @param [Object] result parse result
375
+ # @return [Object] parse result, or the value returned from the handler
376
+ def onFinish(result)
377
+ #puts "prod_data(f): " + @prod_data.inspect
378
+ prod = @productions.last
379
+ handler, clear_packrat = self.class.production_handlers[prod]
380
+ data = @prod_data.pop if handler || self.class.start_handlers[prod]
381
+ if handler && !@recovering && result != :unmatched
382
+ # Pop production data element from stack, potentially allowing handler to use it
383
+ result = begin
384
+ self.class.eval_with_binding(self) {
385
+ handler.call(result, data, @parse_callback)
386
+ }
387
+ rescue ArgumentError, Error => e
388
+ error("finish", "#{e.class}: #{e.message}", production: prod)
389
+ @recovering = false
390
+ end
391
+ end
392
+ progress("#{prod}(:finish)", "",
393
+ depth: (depth + 1),
394
+ lineno: (scanner.lineno if scanner),
395
+ level: result == :unmatched ? 0 : 1) do
396
+ "#{result.inspect}@(#{scanner ? scanner.pos : '?'}), rest: #{scanner ? scanner.rest[0..20].inspect : '?'}"
397
+ end
398
+ self.clear_packrat if clear_packrat
399
+ @productions.pop
400
+ result
401
+ end
402
+
403
+ # A terminal with a defined handler
404
+ #
405
+ # @param [Symbol] prod from the symbol of the associated rule
406
+ # @param [String] value the scanned string
407
+ # @return [String, Object] either the result from the handler, or the token
408
+ def onTerminal(prod, value)
409
+ parentProd = @productions.last
410
+ handler = self.class.terminal_handlers[prod]
411
+ if handler && value != :unmatched
412
+ value = begin
413
+ self.class.eval_with_binding(self) {
414
+ handler.call(value, parentProd, @parse_callback)
415
+ }
416
+ rescue ArgumentError, Error => e
417
+ error("terminal", "#{e.class}: #{e.message}", value: value, production: prod)
418
+ @recovering = false
419
+ end
420
+ end
421
+ progress("#{prod}(:terminal)", "",
422
+ depth: (depth + 2),
423
+ lineno: (scanner.lineno if scanner),
424
+ level: value == :unmatched ? 0 : 1) do
425
+ "#{value.inspect}@(#{scanner ? scanner.pos : '?'})"
426
+ end
427
+ value
428
+ end
429
+
430
+ ##
431
+ # Find a rule for a symbol
432
+ #
433
+ # @param [Symbol] sym
434
+ # @return [Rule]
435
+ def find_rule(sym)
436
+ @rules[sym]
437
+ end
438
+
439
+ ##
440
+ # Find a regular expression defined for a terminal
441
+ #
442
+ # @param [Symbol] sym
443
+ # @return [Regexp]
444
+ def find_terminal_regexp(sym)
445
+ self.class.terminal_regexps[sym]
446
+ end
447
+
448
+ ##
449
+ # Record furthest failure.
450
+ #
451
+ # @param [Integer] pos
452
+ # The position in the input stream where the failure occurred.
453
+ # @param [Integer] lineno
454
+ # Line where the failure occurred.
455
+ # @param [Symbol, String] token
456
+ # The terminal token or string which attempted to match.
457
+ # @see https://arxiv.org/pdf/1405.6646.pdf
458
+ def update_furthest_failure(pos, lineno, token)
459
+ # Skip generated productions
460
+ return if token.is_a?(Symbol) && token.to_s.start_with?('_')
461
+ if @furthest_failure.nil? || pos > @furthest_failure.pos
462
+ @furthest_failure = Unmatched.new(pos, lineno, [token])
463
+ elsif pos == @furthest_failure.pos && !@furthest_failure[:expecting].include?(token)
464
+ @furthest_failure[:expecting] << token
465
+ end
466
+ end
467
+
468
+ public
469
+
470
+ ##
471
+ # @!parse
472
+ # # Record details about an unmatched rule, including the following:
473
+ # #
474
+ # # * Input location and line number at time of failure.
475
+ # # * The rule at which this was found (non-terminal, and not starting with '_').
476
+ # class Unmatched
477
+ # # @return [Integer] The position within the scanner which did not match.
478
+ # attr_reader :pos
479
+ # # @return [Integer] The line number which did not match.
480
+ # attr_reader :lineno
481
+ # # @return [Array<Symbol,String>]
482
+ # # Strings or production rules that attempted to match at this position.
483
+ # attr_reader :expecting
484
+ # end
485
+ class Unmatched < Struct.new(:pos, :lineno, :expecting)
486
+ def to_s
487
+ "syntax error, expecting #{expecting.map(&:inspect).join(', ')}"
488
+ end
489
+ end
490
+
491
+ ##
492
+ # Raised for errors during parsing.
493
+ #
494
+ # @example Raising a parser error
495
+ # raise Error.new(
496
+ # "invalid token '%' on line 10",
497
+ # rest: '%', lineno: 9, production: :turtleDoc)
498
+ #
499
+ # @see https://ruby-doc.org/core/classes/StandardError.html
500
+ class Error < StandardError
501
+ ##
502
+ # The current production.
503
+ #
504
+ # @return [Symbol]
505
+ attr_reader :production
506
+
507
+ ##
508
+ # The read head when scanning failed
509
+ #
510
+ # @return [String]
511
+ attr_reader :rest
512
+
513
+ ##
514
+ # The line number where the error occurred.
515
+ #
516
+ # @return [Integer]
517
+ attr_reader :lineno
518
+
519
+ ##
520
+ # Initializes a new parser error instance.
521
+ #
522
+ # @param [String, #to_s] message
523
+ # @param [Hash{Symbol => Object}] options
524
+ # @option options [Symbol] :production (nil)
525
+ # @option options [String] :rest (nil)
526
+ # @option options [Integer] :lineno (nil)
527
+ def initialize(message, **options)
528
+ @production = options[:production]
529
+ @rest = options[:rest]
530
+ @lineno = options[:lineno]
531
+ super(message.to_s)
532
+ end
533
+ end # class Error
534
+ end # module Parser
535
+ end # module EBNF::PEG