ebnf 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,541 @@
1
+ require 'rdf'
2
+ require 'rdf/ll1/lexer'
3
+
4
+ module RDF::LL1
5
+ ##
6
+ # A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
7
+ module Parser
8
+ ##
9
+ # @private
10
+ # level above which debug messages are suppressed
11
+ DEBUG_LEVEL = 10
12
+
13
+ ##
14
+ # @!attribute [r] lineno
15
+ # @return [Integer] line number of current token
16
+ attr_reader :lineno
17
+
18
def self.included(base)
  # Mixing Parser into a class also gives that class the
  # production/terminal DSL defined in ClassMethods.
  base.extend ClassMethods
end
21
+
22
+ # DSL for creating terminals and productions
23
module ClassMethods
  # Registries are kept in instance variables of the extending class rather
  # than in @@class variables. A @@variable here would belong to
  # ClassMethods itself, so every parser class would share one table, and
  # reading it before the first definition raises NameError.
  #
  # A subclass starts from a copy of its superclass's registry, taken the
  # first time the subclass's registry is read.

  # @return [Hash{Symbol,String => Proc}] production handlers keyed by term
  def production_handlers; @production_handlers ||= inherited_registry(:production_handlers, {}); end

  # @return [Hash{Symbol,String => Proc}] terminal handlers keyed by term
  def terminal_handlers; @terminal_handlers ||= inherited_registry(:terminal_handlers, {}); end

  # @return [Array<Array(Symbol, Regexp)>] [term, regexp] pairs in definition order
  def patterns; @patterns ||= inherited_registry(:patterns, []); end

  # @return [Array<Symbol,String>] terms defined with :unescape
  def unescape_terms; @unescape_terms ||= inherited_registry(:unescape_terms, []); end

  ##
  # Defines a production called during different phases of parsing
  # with data from previous production along with data defined for the
  # current production
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start, or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production, during :start
  #   may be initialized with data to pass to further productions,
  #   during :finish, it contains data placed by earlier productions
  # @yieldparam [Prod] block
  #   Block passed to initialization for yielding to calling reader.
  #   Should conform to the yield specs for #initialize
  # Yield to generate a triple
  def production(term, &block)
    production_handlers[term] = block
  end

  ##
  # Defines the pattern for a terminal node and a block to be invoked
  # when the terminal is encountered. If the block is missing, the
  # value of the terminal will be placed on the input hash to be returned
  # to a previous production.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears as within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Prod] block
  #   Block passed to initialization for yielding to calling reader.
  #   Should conform to the yield specs for #initialize
  def terminal(term, regexp, options = {}, &block)
    patterns << [term, regexp] # Passed in order to define evaluation sequence
    terminal_handlers[term] = block if block_given?
    unescape_terms << term if options[:unescape]
  end

  private

  # Initial value for a registry: a shallow copy of the superclass's
  # registry when there is one, otherwise +empty+.
  def inherited_registry(name, empty)
    parent = respond_to?(:superclass) ? superclass : nil
    parent.respond_to?(name) ? parent.public_send(name).dup : empty
  end
end
90
+
91
+ ##
92
+ # Initializes a new parser instance.
93
+ #
94
+ # Attempts to recover from errors.
95
+ #
96
+ # @example
97
+ # require 'rdf/ll1/parser'
98
+ #
99
+ # class Reader < RDF::Reader
100
+ # include RDF::LL1::Parser
101
+ #
102
+ # branch RDF::Turtle::Reader::BRANCH
103
+ #
104
+ # ##
105
+ # # Defines a production called during different phases of parsing
106
+ # # with data from previous production along with data defined for the
107
+ # # current production
108
+ # #
109
+ # # Yield to generate a triple
110
+ # production :object do |reader, phase, input, current|
111
+ # object = current[:resource]
112
+ # yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
113
+ # end
114
+ #
115
+ # ##
116
+ # # Defines the pattern for a terminal node
117
+ # terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
118
+ # input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
119
+ # end
120
+ #
121
+ # ##
122
+ # # Iterates the given block for each RDF statement in the input.
123
+ # #
124
+ # # @yield [statement]
125
+ # # @yieldparam [RDF::Statement] statement
126
+ # # @return [void]
127
+ # def each_statement(&block)
128
+ # @callback = block
129
+ #
130
+ # parse(START.to_sym) do |context, *data|
131
+ # case context
132
+ # when :statement
133
+ # yield *data
134
+ # end
135
+ # end
136
+ # end
137
+ #
138
+ # end
139
+ #
140
+ # @param [String, #to_s] input
141
+ # @param [Symbol, #to_s] prod The starting production for the parser.
142
+ # It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
143
+ # @param [Hash{Symbol => Object}] options
144
+ # @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
145
+ # LL1 branch table.
146
+ # @option options [Hash{Symbol,String => Array<Symbol,String>}] :first ({})
147
+ # Lists valid terminals that can precede each production (for error recovery).
148
+ # @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
149
+ # Lists valid terminals that can follow each production (for error recovery).
150
+ # @option options [Boolean] :validate (false)
151
+ # whether to validate the parsed statements and values. If not validating,
152
+ # the parser will attempt to recover from errors.
153
+ # @option options [Boolean] :progress
154
+ # Show progress of parser productions
155
+ # @option options [Boolean] :debug
156
+ # Detailed debug output
157
+ # @yield [context, *data]
158
+ # Yields to return data to the reader
159
+ # @yieldparam [:statement, :trace] context
160
+ # Context for block
161
+ # @yieldparam [Symbol] *data
162
+ # Data specific to the call
163
+ # @return [RDF::LL1::Parser]
164
+ # @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
165
def parse(input = nil, prod = nil, options = {}, &block)
  # Table-driven LL(1) parse of +input+ starting at production +prod+.
  # Drives an explicit stack of {:prod, :terms} entries instead of
  # recursing, calls onStart/onToken/onFinish as productions open, consume
  # tokens and close, and collects recoverable errors in @error_log.
  # Raises Error at the end if anything was logged; otherwise falls off
  # the end of the method.

  # Keep a private copy of the options; the caller's hash is only read.
  @options = options.dup
  @branch = options[:branch]
  @first = options[:first] || {}
  @follow = options[:follow] || {}
  @lexer = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  @productions = []
  @parse_callback = block
  @recovering = false
  @error_log = []
  terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
  todo_stack = [{:prod => prod, :terms => nil}]

  while !todo_stack.empty?
    pushed = false
    # :terms is nil only for an entry that has just been pushed, i.e. a
    # production that has not been opened yet.
    if todo_stack.last[:terms].nil?
      todo_stack.last[:terms] = []
      cur_prod = todo_stack.last[:prod]

      # Get this first valid token appropriate for the stacked productions,
      # skipping invalid tokens until either a valid token is found (from @first),
      # or a token appearing in @follow appears.
      token = skip_until_valid(todo_stack)

      # At this point, token is either nil, in the first set of the production,
      # or in the follow set of this production or any previous production
      debug("parse(production)") do
        "token #{token ? token.representation.inspect : 'nil'}, " +
        "prod #{cur_prod.inspect}, " +
        "depth #{depth}"
      end

      # Got an opened production
      onStart(cur_prod)
      break if token.nil?

      if prod_branch = @branch[cur_prod]
        @recovering = false
        sequence = prod_branch[token.representation]
        # The message argument is passed explicitly so the options hash
        # lands in the options parameter of #debug.
        debug("parse(production)", "", :level => 2) do
          "token #{token.representation.inspect} " +
          "prod #{cur_prod.inspect}, " +
          "prod_branch #{prod_branch.keys.inspect}, " +
          "sequence #{sequence.inspect}"
        end

        if sequence.nil? && prod_branch.has_key?(:"ebnf:empty")
          debug("parse(production)", "", :level => 2) {"empty sequence for ebnf:empty"}
        end
        # If there is no sequence and no ebnf:empty branch, we're
        # in error recovery, and _token_ has been advanced to
        # the point where it can reasonably follow this production

        # += builds a new array, so the later #shift calls never touch the
        # sequence stored in the branch table.
        todo_stack.last[:terms] += sequence if sequence
      else
        # Is this a fatal error?
        error("parse(fatal?)", "No branches found for #{cur_prod.inspect}",
          :production => cur_prod, :token => token)
      end
    end

    debug("parse(terms)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}"}
    while !todo_stack.last[:terms].to_a.empty?
      # Get the next term in this sequence
      term = todo_stack.last[:terms].shift
      debug("parse(token)") {"accept #{term.inspect}"}
      if token = accept(term)
        @recovering = false
        debug("parse(token)") {"token #{token.inspect}, term #{term.inspect}"}
        onToken(term, token)
      elsif terminals.include?(term)
        # If term is a terminal, then it is an error if token does not
        # match it
        skip_until_valid(todo_stack)
      else
        # If it's not a string (a symbol), it is a non-terminal and we push the new state
        todo_stack << {:prod => term, :terms => nil}
        debug("parse(push)", "", :level => 2) {"term #{term.inspect}, depth #{depth}"}
        pushed = true
        break
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): stack entries in this method are only ever built with
    # :prod and :terms, so todo_stack.last[:term] is nil here and the lookup
    # is @follow[nil]. :prod may have been intended; behaviour is left
    # unchanged pending confirmation.
    while !pushed &&
          !todo_stack.empty? &&
          ( todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}"}
      prod = todo_stack.last[:prod]
      @recovering = false if @follow[prod] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  while !todo_stack.empty?
    debug("parse(eof)", "", :level => 2) {"stack #{todo_stack.last.inspect}, depth #{depth}"}
    if todo_stack.last[:terms].length > 0
      error("parse(eof)",
        "End of input before end of production: stack #{todo_stack.last.inspect}, depth #{depth}"
      )
    end
    todo_stack.pop
    onFinish
  end

  # When all is said and done, raise the error log
  raise Error, @error_log.join("\n\t") unless @error_log.empty?
end
293
+
294
# Number of productions currently open; 0 before a parse has started.
def depth
  @productions ? @productions.length : 0
end
295
+
296
+ private
297
+ # Start for production
298
def onStart(prod)
  # Look the handler up first, then record the production as open.
  callback = self.class.production_handlers[prod]
  @productions.push(prod)

  # No handler: only report progress, no data frame is created.
  return progress("#{prod}(:start)") { get_token.inspect } unless callback

  # With a handler: give it a fresh hash to fill in before that hash is
  # pushed as this production's frame on @prod_data.
  progress("#{prod}(:start):#{@prod_data.length}") { @prod_data.last }
  current = {}
  callback.call(self, :start, @prod_data.last, current, @parse_callback)
  @prod_data.push(current)
end
313
+
314
+ # Finish of production
315
def onFinish
  # The production being closed is the most recently opened one.
  closing = @productions.last
  callback = self.class.production_handlers[closing]

  if callback
    # Take this production's frame off @prod_data and hand it to the
    # handler together with the frame below it.
    frame = @prod_data.pop
    callback.call(self, :finish, @prod_data.last, frame, @parse_callback)
    progress("#{closing}(:finish):#{@prod_data.length}") { @prod_data.last }
  else
    progress("#{closing}(:finish)", '')
  end

  @productions.pop
end
328
+
329
+ # A token
330
def onToken(prod, token)
  # Dispatches an accepted token to the terminal handler registered for
  # +prod+, passing the innermost open production and its data frame.
  #
  # With no open production there is nothing to attach the token to. The
  # error is labelled with +prod+ because no parent production exists at
  # this point.
  if @productions.empty?
    return error("#{prod}(:token)", "Token has no parent production", :production => prod)
  end

  parent_prod = @productions.last
  handler = self.class.terminal_handlers[prod]
  # Allows catch-all for simple string terminals
  handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String)

  if handler
    handler.call(self, parent_prod, token, @prod_data.last)
    progress("#{prod}(:token)", "", :depth => (depth + 1)) {"#{token}: #{@prod_data.last}"}
  else
    progress("#{prod}(:token)", "", :depth => (depth + 1)) {token.to_s}
  end
end
346
+
347
+ # Skip through the input stream until something is found that
348
+ # is either valid based on the content of the production stack,
349
+ # or can follow a production in the stack.
350
+ #
351
+ # @return [Token]
352
def skip_until_valid(todo_stack)
  top = todo_stack.last
  cur_prod = top[:prod]
  token = get_token
  first = @first[cur_prod] || []

  # Outside of recovery the token is accepted as-is when the production
  # may be empty (branch table has ebnf:empty) or the token is in its
  # first set.
  unless @recovering
    branch = @branch[cur_prod]
    may_be_empty = branch && branch.has_key?(:"ebnf:empty")
    return token if may_be_empty || first.any? {|t| token === t}
  end

  # Otherwise this is an error: report it, then skip input until a token
  # shows up that either starts this production or can follow one of the
  # productions on the stack.
  expected = first.map(&:inspect).join(", ")
  error("skip_until_valid", "expected one of #{expected}",
    :production => cur_prod, :token => token)

  debug("recovery", "stack follows:")
  todo_stack.reverse_each do |todo|
    debug("recovery") {" #{todo[:prod]}: #{@follow[todo[:prod]].inspect}"}
  end

  # Union of the follow sets of every production on the stack
  follows = todo_stack.each_with_object([]) do |todo, acc|
    acc.concat(@follow[todo[:prod]] || [])
  end.uniq
  debug("recovery") {"follows: #{follows.inspect}"}

  # Discard tokens until one is in first or follows (or input runs out)
  acceptable = first + follows
  while (token = get_token) && acceptable.none? {|t| token === t}
    skipped = @lexer.shift
    progress("recovery") {"skip #{skipped.inspect}"}
  end
  debug("recovery") {"found #{token.inspect}"}

  # A token from first can start the production. Anything else came from
  # a follow set, so the rest of this production's terms are dropped.
  in_first = first.any? {|t| token == t}
  unless in_first || top[:terms].empty?
    debug("recovery") {"token in follows, skip past #{top[:terms].inspect}"}
    top[:terms] = []
  end
  token
end
397
+
398
+ ##
399
+ # @param [String] node Relevant location associated with message
400
+ # @param [String] message Error string
401
+ # @param [Hash] options
402
+ # @option options [URI, #to_s] :production
403
+ # @option options [Token] :token
404
def error(node, message, options = {})
  # Assemble the full text from the base message plus optional suffixes.
  token = options[:token]
  production = options[:production]

  parts = [message]
  parts << ", found #{token.representation.inspect}" if token
  parts << " at line #{@lineno}" if @lineno
  parts << ", production = #{production.inspect}" if production && @options[:debug]
  message = parts.join

  # Only the first error of a recovery episode is logged; later ones are
  # still traced through #debug.
  @error_log.push(message) unless @recovering
  @recovering = true
  debug(node, message, options.merge(:level => 0))
end
412
+
413
+ ##
414
+ # Return the next token, entering error recovery if the token is invalid
415
+ #
416
+ # @return [Token]
417
def get_token
  # Peeks at the next token (@lexer.first does not consume it). If the
  # lexer raises, the error is reported and @lexer.recover supplies the
  # token to continue from. Updates @lineno from the returned token.
  token = begin
    @lexer.first
  rescue RDF::LL1::Lexer::Error => e
    # Recover from lexer error
    @lineno = e.lineno
    error("get_token", "With input '#{e.input}': #{e.message}",
      :production => @productions.last)

    # Retrieve next valid token
    recovered = @lexer.recover
    # The message argument is passed explicitly so the options hash lands
    # in the options parameter of #debug, not in its message parameter.
    debug("get_token", "", :level => 2) {"skipped to #{recovered.inspect}"}
    recovered
  end
  @lineno = token.lineno if token
  token
end
435
+
436
+ ##
437
+ # Progress output when parsing
438
+ # @param [String] node Relevant location associated with message
439
+ # @param [String] message ("")
440
+ # @param [Hash] options
441
+ # @option options [Integer] :depth
442
+ # Recursion depth for indenting output
443
+ # @yieldreturn [String] added to message
444
def progress(node, *args)
  # Emits a progress line when either :progress or :debug is enabled.
  # Positional args are joined with "," to form the message; a trailing
  # Hash is taken as options (:depth), and the block's value, if a block
  # is given, is appended via #to_s.
  return unless @options[:progress] || @options[:debug]
  options = args.last.is_a?(Hash) ? args.pop : {}
  message = args.join(",")
  message += yield.to_s if block_given?

  # In debug mode the line is routed through #debug at level 0
  return debug(node, message, {:level => 0}.merge(options)) if @options[:debug]

  indent = options[:depth] || depth
  $stderr.puts("[#{@lineno}]#{' ' * indent}#{node}: #{message}")
end
457
+
458
+ ##
459
+ # Progress output when debugging
460
+ # @param [String] node Relevant location associated with message
461
+ # @param [String] message ("")
462
+ # @param [Hash] options
463
+ # @option options [Integer] :depth
464
+ # Recursion depth for indenting output
465
+ # @yieldreturn [String] added to message
466
def debug(node, message = "", options = {})
  # Emits a debug line when @options[:debug] is set. The sink depends on
  # its value: an Array collects lines, true prints to $stderr, and :yield
  # forwards to the parse callback as a :trace event.
  return unless @options[:debug]

  # Callers often write debug("node", :level => 2) { ... }. Ruby binds that
  # hash to the first optional parameter, so it arrives here as +message+.
  # Treat it as the options it was meant to be.
  if message.is_a?(Hash)
    options = message.merge(options)
    message = ""
  end

  debug_level = options.fetch(:level, 1)
  return unless debug_level <= DEBUG_LEVEL
  indent = options[:depth] || depth
  # Interpolation coerces the block result with #to_s, as #progress does
  message = "#{message}#{yield}" if block_given?
  str = "[#{@lineno}](#{debug_level})#{' ' * indent}#{node}: #{message}"
  case @options[:debug]
  when Array
    @options[:debug] << str
  when TrueClass
    $stderr.puts str
  when :yield
    # parse may have been called without a block
    @parse_callback.call(:trace, node, message, options) if @parse_callback
  end
end
482
+
483
+ ##
484
+ # Accept the first token in the input stream if it matches
485
+ # _type\_or\_value_. Return nil otherwise.
486
+ #
487
+ # @param [Symbol, String] type_or_value
488
+ # @return [Token]
489
def accept(type_or_value)
  # Peek at the next token; leave the input untouched unless it matches.
  candidate = get_token
  return nil unless candidate && candidate === type_or_value

  debug("accept") {"#{candidate.inspect} === #{type_or_value.inspect}"}
  # Consume the matched token and return what the lexer hands back
  @lexer.shift
end
495
+ public
496
+
497
+ ##
498
+ # Raised for errors during parsing.
499
+ #
500
+ # @example Raising a parser error
501
+ # raise Error.new(
502
+ # "invalid token '%' on line 10",
503
+ # :token => '%', :lineno => 9, :production => :turtleDoc)
504
+ #
505
+ # @see http://ruby-doc.org/core/classes/StandardError.html
506
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param [String, #to_s] message
  #   Converted with #to_s and used as the exception message
  # @param [Hash{Symbol => Object}] options
  # @option options [Symbol] :production (nil)
  # @option options [String] :token (nil)
  # @option options [Integer] :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options[:production], options[:token], options[:lineno]
    super(message.to_s)
  end
end # class Error
540
+ end # class Reader
541
+ end # module RDF::Turtle