rdf-turtle 1.0.0 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,541 +0,0 @@
1
- require 'rdf'
2
- require 'rdf/ll1/lexer'
3
-
4
- module RDF::LL1
5
- ##
6
- # A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
7
- module Parser
8
- ##
9
- # @private
10
- # level above which debug messages are suppressed
11
- DEBUG_LEVEL = 10
12
-
13
- ##
14
- # @!attribute [r] lineno
15
- # @return [Integer] line number of current token
16
- attr_reader :lineno
17
-
18
# Module hook run when the parser is mixed into +base+: extends +base+
# with the ClassMethods DSL so it gains the terminal/production macros.
#
# @param [Module] base the class or module including this parser
# @return [Object] the result of extending +base+
def self.included(base)
  base.send(:extend, ClassMethods)
end
21
-
22
# DSL for creating terminals and productions.
#
# The registries live in class variables of this module, so they are shared
# by every class that extends it.
module ClassMethods
  # Each reader initializes its registry on first use. Reading an
  # uninitialized class variable raises NameError, so a plain
  # `@@var || default` cannot provide the default.

  # @return [Hash{Symbol => Proc}] production handlers keyed by term
  def production_handlers; @@production_handlers ||= {}; end

  # @return [Hash{Symbol, String, nil => Proc}] terminal handlers keyed by term
  def terminal_handlers; @@terminal_handlers ||= {}; end

  # @return [Array<Array(Object, Regexp)>] [term, regexp] pairs in definition order
  def patterns; @@patterns ||= []; end

  # @return [Array] terms registered with the :unescape option
  def unescape_terms; @@unescape_terms ||= []; end

  ##
  # Defines a production called during different phases of parsing
  # with data from the previous production along with data defined for the
  # current production.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current, block]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production; during :start it
  #   may be initialized with data to pass to further productions,
  #   during :finish it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to #parse, for yielding to the calling reader
  #   (e.g. to generate a triple)
  # @return [Proc] the registered handler
  def production(term, &block)
    production_handlers[term] = block
  end

  ##
  # Defines the pattern for a terminal node and, optionally, a block to be
  # invoked when the terminal is encountered. Without a block only the
  # pattern is registered; no handler is stored for the term.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  def terminal(term, regexp, options = {}, &block)
    patterns << [term, regexp] # Kept in definition order, which defines the evaluation sequence
    terminal_handlers[term] = block if block_given?
    unescape_terms << term if options[:unescape]
  end
end
90
-
91
- ##
92
- # Initializes a new parser instance.
93
- #
94
- # Attempts to recover from errors.
95
- #
96
- # @example
97
- # require 'rdf/ll1/parser'
98
- #
99
- # class Reader < RDF::Reader
100
- # include RDF::LL1::Parser
101
- #
102
- # branch RDF::Turtle::Reader::BRANCH
103
- #
104
- # ##
105
- # # Defines a production called during different phases of parsing
106
- # # with data from previous production along with data defined for the
107
- # # current production
108
- # #
109
- # # Yield to generate a triple
110
- # production :object do |reader, phase, input, current|
111
- # object = current[:resource]
112
- # yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
113
- # end
114
- #
115
- # ##
116
- # # Defines the pattern for a terminal node
117
- # terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
118
- # input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
119
- # end
120
- #
121
- # ##
122
- # # Iterates the given block for each RDF statement in the input.
123
- # #
124
- # # @yield [statement]
125
- # # @yieldparam [RDF::Statement] statement
126
- # # @return [void]
127
- # def each_statement(&block)
128
- # @callback = block
129
- #
130
- # parse(START.to_sym) do |context, *data|
131
- # case context
132
- # when :statement
133
- # yield *data
134
- # end
135
- # end
136
- # end
137
- #
138
- # end
139
- #
140
- # @param [String, #to_s] input
141
- # @param [Symbol, #to_s] prod The starting production for the parser.
142
- # It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
143
- # @param [Hash{Symbol => Object}] options
144
- # @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
145
- # LL1 branch table.
146
- # @option options [Hash{Symbol,String => Array<Symbol,String>}] :first ({})
147
- # Lists valid terminals that can precede each production (for error recovery).
148
- # @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
149
- # Lists valid terminals that can follow each production (for error recovery).
150
- # @option options [Boolean] :validate (false)
151
- # whether to validate the parsed statements and values. If not validating,
152
- # the parser will attempt to recover from errors.
153
- # @option options [Boolean] :progress
154
- # Show progress of parser productions
155
- # @option options [Boolean] :debug
156
- # Detailed debug output
157
- # @yield [context, *data]
158
- # Yields to return data to the reader
159
- # @yieldparam [:statement, :trace] context
160
- # Context for block
161
- # @yieldparam [Symbol] *data
162
- # Data specific to the call
163
- # @return [RDF::LL1::Parser]
164
- # @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
165
def parse(input = nil, prod = nil, options = {}, &block)
  # Table-driven parse loop: productions are expanded through the branch
  # table on an explicit stack (todo_stack). Errors are collected in
  # @error_log and raised together as a single Error at the end.
  @options = options.dup
  @branch = options[:branch]
  # Default the recovery tables without writing into the caller's hash
  # (which may be frozen or reused).
  @first = options[:first] || {}
  @follow = options[:follow] || {}
  @lexer = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  @productions = []
  @parse_callback = block
  @recovering = false
  @error_log = []
  terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
  todo_stack = [{:prod => prod, :terms => nil}]

  until todo_stack.empty?
    pushed = false
    if todo_stack.last[:terms].nil?
      # A nil term list marks a production that has not been opened yet
      todo_stack.last[:terms] = []
      cur_prod = todo_stack.last[:prod]

      # Get the first valid token appropriate for the stacked productions,
      # skipping invalid tokens until either a valid token is found (from @first),
      # or a token appearing in @follow appears.
      token = skip_until_valid(todo_stack)

      # At this point, token is either nil, in the first set of the production,
      # or in the follow set of this production or any previous production
      debug("parse(production)") do
        "token #{token ? token.representation.inspect : 'nil'}, " +
        "prod #{cur_prod.inspect}, " +
        "depth #{depth}"
      end

      # Got an opened production
      onStart(cur_prod)
      break if token.nil?

      if (prod_branch = @branch[cur_prod])
        @recovering = false
        sequence = prod_branch[token.representation]
        # The message is passed explicitly: a trailing options hash given as
        # the second argument would bind to debug's message parameter.
        debug("parse(production)", "", :level => 2) do
          "token #{token.representation.inspect} " +
          "prod #{cur_prod.inspect}, " +
          "prod_branch #{prod_branch.keys.inspect}, " +
          "sequence #{sequence.inspect}"
        end

        if sequence
          todo_stack.last[:terms] += sequence
        elsif prod_branch.has_key?(:"ebnf:empty")
          debug("parse(production)", "", :level => 2) {"empty sequence for ebnf:empty"}
        end
        # Otherwise there is no sequence for this production: we're
        # in error recovery, and _token_ has been advanced to
        # the point where it can reasonably follow this production
      else
        # Is this a fatal error?
        error("parse(fatal?)", "No branches found for #{cur_prod.inspect}",
          :production => cur_prod, :token => token)
      end
    end

    debug("parse(terms)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}"}
    until todo_stack.last[:terms].to_a.empty?
      # Get the next term in this sequence
      term = todo_stack.last[:terms].shift
      debug("parse(token)") {"accept #{term.inspect}"}
      if (token = accept(term))
        @recovering = false
        debug("parse(token)") {"token #{token.inspect}, term #{term.inspect}"}
        onToken(term, token)
      elsif terminals.include?(term)
        # If term is a terminal, then it is an error if token does not
        # match it
        skip_until_valid(todo_stack)
      else
        # Otherwise it is a non-terminal and we push the new state
        todo_stack << {:prod => term, :terms => nil}
        debug("parse(push)", "", :level => 2) {"term #{term.inspect}, depth #{depth}"}
        pushed = true
        break
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): stack entries only carry :prod and :terms, so the
    # `[:term]` lookup below always yields nil and, unless @follow has a
    # default, the recovery clause is true for every entry. This looks like
    # a typo for `[:prod]`; it is left unchanged because fixing it alters
    # recovery behaviour -- confirm against the test suite before changing.
    while !pushed &&
          !todo_stack.empty? &&
          ( todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}"}
      finished = todo_stack.last[:prod]
      @recovering = false if @follow[finished] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  until todo_stack.empty?
    debug("parse(eof)", "", :level => 2) {"stack #{todo_stack.last.inspect}, depth #{depth}"}
    if todo_stack.last[:terms].length > 0
      error("parse(eof)",
        "End of input before end of production: stack #{todo_stack.last.inspect}, depth #{depth}"
      )
    end
    todo_stack.pop
    onFinish
  end

  # When all is said and done, raise the error log
  raise Error, @error_log.join("\n\t") unless @error_log.empty?
end
293
-
294
# Number of productions currently open on the production stack;
# 0 while @productions is unset.
#
# @return [Integer]
def depth
  @productions ? @productions.length : 0
end
295
-
296
- private
297
# Opens production +prod+: records it on the production stack and, when a
# handler is registered for it, runs the handler's :start phase with a new
# data hash that is then pushed onto @prod_data.
#
# @param [Symbol] prod production being opened
def onStart(prod)
  callback = self.class.production_handlers[prod]
  @productions.push(prod)
  return progress("#{prod}(:start)") { get_token.inspect } unless callback

  # Create a new production data element, letting the handler customize it
  # before it goes on the @prod_data stack
  progress("#{prod}(:start):#{@prod_data.length}") { @prod_data.last }
  current = {}
  callback.call(self, :start, @prod_data.last, current, @parse_callback)
  @prod_data.push(current)
end
313
-
314
# Closes the production on top of the production stack. When a handler is
# registered for it, the production's data hash is popped from @prod_data
# and passed to the handler's :finish phase along with the parent's data.
#
# @return [Object] the production removed from the stack
def onFinish
  finished = @productions.last
  if (callback = self.class.production_handlers[finished])
    # Pop production data element from stack, letting the handler use it
    current = @prod_data.pop
    callback.call(self, :finish, @prod_data.last, current, @parse_callback)
    progress("#{finished}(:finish):#{@prod_data.length}") { @prod_data.last }
  else
    progress("#{finished}(:finish)", '')
  end
  @productions.pop
end
328
-
329
# Dispatches an accepted token to its terminal handler, if any.
#
# The handler registered for +prod+ is used; String terminals fall back to
# the handler registered under nil (a catch-all for simple string
# terminals). The handler receives the reader, the enclosing production,
# the token and the current production data.
#
# @param [Symbol, String] prod terminal that matched
# @param [Object] token the token accepted from the lexer
def onToken(prod, token)
  if @productions.empty?
    # Label the error with the terminal itself: there is no parent
    # production to name in this branch.
    return error("#{prod}(:token)", "Token has no parent production", :production => prod)
  end

  parent_prod = @productions.last
  handler = self.class.terminal_handlers[prod]
  # Allows catch-all for simple string terminals
  handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String)
  if handler
    handler.call(self, parent_prod, token, @prod_data.last)
    progress("#{prod}(:token)", "", :depth => (depth + 1)) {"#{token}: #{@prod_data.last}"}
  else
    progress("#{prod}(:token)", "", :depth => (depth + 1)) {token.to_s}
  end
end
346
-
347
# Skip through the input stream until something is found that
# is either valid based on the content of the production stack,
# or can follow a production in the stack.
#
# When not already recovering, the current token is returned immediately
# if it is in the first set of the top production or that production's
# branch table allows empty. Otherwise an error is recorded and tokens are
# shifted off the lexer until one appears in the first set or in the follow
# set of any stacked production.
#
# @param [Array<Hash>] todo_stack stack of {:prod, :terms} entries
# @return [Token] the token found, or nil at end of input
def skip_until_valid(todo_stack)
  cur_prod = todo_stack.last[:prod]
  token = get_token
  first = @first[cur_prod] || []

  # If this token can be used by the top production, return it
  # Otherwise, if the branch table allows empty, also return the token
  unless @recovering
    branch = @branch[cur_prod]
    return token if (branch && branch.has_key?(:"ebnf:empty")) || first.any? {|t| token === t}
  end

  # Otherwise, it's an error condition, and skip either until
  # we find a valid token for this production, or until we find
  # something that can follow this production
  expected = first.map(&:inspect).join(", ")
  error("skip_until_valid", "expected one of #{expected}",
    :production => cur_prod, :token => token)

  debug("recovery", "stack follows:")
  todo_stack.reverse_each do |todo|
    debug("recovery") {" #{todo[:prod]}: #{@follow[todo[:prod]].inspect}"}
  end

  # Find all follows to the top of the stack
  follows = todo_stack.flat_map {|todo| @follow[todo[:prod]] || []}.uniq
  debug("recovery") {"follows: #{follows.inspect}"}

  # Skip tokens until one is found in first or follows. The stop set does
  # not change while skipping, so it is built once.
  stop_set = first + follows
  while (token = get_token) && stop_set.none? {|t| token === t}
    skipped = @lexer.shift
    progress("recovery") {"skip #{skipped.inspect}"}
  end
  debug("recovery") {"found #{token.inspect}"}

  # If the token is a first, just return it. Otherwise, it is a follow
  # and we need to skip to the end of the production
  unless first.any? {|t| token == t} || todo_stack.last[:terms].empty?
    debug("recovery") {"token in follows, skip past #{todo_stack.last[:terms].inspect}"}
    todo_stack.last[:terms] = []
  end
  token
end
397
-
398
##
# Records a parse error and enters recovery mode.
#
# The message is extended with the offending token, the current line and
# (only when the :debug option is set) the production. It is appended to
# the error log unless the parser is already recovering, and is always
# forwarded to #debug at level 0.
#
# @param [String] node Relevant location associated with message
# @param [String] message Error string
# @param [Hash] options
# @option options [URI, #to_s] :production
# @option options [Token] :token
def error(node, message, options = {})
  found = options[:token]
  production = options[:production]

  text = message
  text += ", found #{found.representation.inspect}" if found
  text += " at line #{@lineno}" if @lineno
  text += ", production = #{production.inspect}" if production && @options[:debug]

  # Only the first error of a recovery episode is logged
  @error_log.push(text) unless @recovering
  @recovering = true
  debug(node, text, options.merge(:level => 0))
end
412
-
413
##
# Return the next token without consuming it, entering error recovery if
# the lexer reports an invalid token. Updates @lineno from the lexer error
# and from the returned token.
#
# @return [Token] the next token, or nil when the lexer has none
def get_token
  token = begin
    @lexer.first
  rescue RDF::LL1::Lexer::Error => e
    # Recover from lexer error
    @lineno = e.lineno
    error("get_token", "With input '#{e.input}': #{e.message}",
      :production => @productions.last)

    # Retrieve next valid token
    recovered = @lexer.recover
    # The message is passed explicitly: a trailing options hash given as the
    # second argument would bind to debug's message parameter.
    debug("get_token", "", :level => 2) {"skipped to #{recovered.inspect}"}
    recovered
  end
  @lineno = token.lineno if token
  token
end
435
-
436
##
# Progress output when parsing. Does nothing unless the :progress or
# :debug option is set. With :debug set the message is routed through
# #debug at level 0; otherwise it is written to $stderr.
#
# @param [String] node Relevant location associated with message
# @param [Array] args message parts (joined with ","), optionally followed
#   by an options Hash
# @option args [Integer] :depth
#   Recursion depth for indenting output
# @yieldreturn [#to_s] added to message
def progress(node, *args)
  return unless @options[:progress] || @options[:debug]
  options = args.last.is_a?(Hash) ? args.pop : {}
  message = args.join(",")
  message += yield.to_s if block_given?
  return debug(node, message, {:level => 0}.merge(options)) if @options[:debug]

  depth = options[:depth] || self.depth
  $stderr.puts("[#{@lineno}]#{' ' * depth}#{node}: #{message}")
end
457
-
458
##
# Progress output when debugging. Does nothing unless the :debug option is
# set, or when the message level is above DEBUG_LEVEL.
#
# The :debug option selects the sink: an Array collects formatted lines,
# true writes them to $stderr, and :yield passes the raw message to the
# parse callback as a :trace.
#
# @param [String] node Relevant location associated with message
# @param [String, Hash] message ("") message text; a Hash given here is
#   treated as the options (see below)
# @param [Hash] options
# @option options [Integer] :depth
#   Recursion depth for indenting output
# @option options [Integer] :level (1)
#   Verbosity level of this message
# @yieldreturn [#to_s] added to message
def debug(node, message = "", options = {})
  return unless @options[:debug]

  # `debug(node, :level => 2) { ... }` binds the trailing hash to +message+,
  # not +options+. Treat it as options so such calls neither lose their
  # :level nor fail when the block result is appended.
  if message.is_a?(Hash)
    options = message.merge(options)
    message = ""
  end

  debug_level = options.fetch(:level, 1)
  return unless debug_level <= DEBUG_LEVEL
  depth = options[:depth] || self.depth
  message += yield.to_s if block_given?
  str = "[#{@lineno}](#{debug_level})#{' ' * depth}#{node}: #{message}"
  case @options[:debug]
  when Array
    @options[:debug] << str
  when TrueClass
    $stderr.puts str
  when :yield
    @parse_callback.call(:trace, node, message, options)
  end
end
482
-
483
##
# Consume the next token from the lexer when it matches
# _type\_or\_value_ (compared with Token#===); otherwise leave the input
# untouched.
#
# @param [Symbol, String] type_or_value
# @return [Token] the consumed token, or nil when there is no match
def accept(type_or_value)
  candidate = get_token
  return nil unless candidate && candidate === type_or_value

  debug("accept") { "#{candidate.inspect} === #{type_or_value.inspect}" }
  @lexer.shift
end
495
- public
496
-
497
##
# Raised for errors during parsing.
#
# @example Raising a parser error
#   raise Error.new(
#     "invalid token '%' on line 10",
#     :token => '%', :lineno => 9, :production => :turtleDoc)
#
# @see http://ruby-doc.org/core/classes/StandardError.html
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param  [String, #to_s]          message
  # @param  [Hash{Symbol => Object}] options
  # @option options [Symbol]         :production (nil)
  # @option options [String]         :token (nil)
  # @option options [Integer]        :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options.values_at(:production, :token, :lineno)
    super(message.to_s)
  end
end # class Error
540
- end # module Parser
541
- end # module RDF::LL1