rdf-turtle 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,541 +0,0 @@
1
- require 'rdf'
2
- require 'rdf/ll1/lexer'
3
-
4
- module RDF::LL1
5
- ##
6
- # A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
7
- module Parser
8
- ##
9
- # @private
10
- # level above which debug messages are suppressed
11
- DEBUG_LEVEL = 10
12
-
13
- ##
14
- # @!attribute [r] lineno
15
- # @return [Integer] line number of current token
16
- attr_reader :lineno
17
-
18
# Mixin hook: whenever this module is included, the including class or
# module is extended with the DSL methods from ClassMethods.
#
# @param [Module] base the class or module performing the include
# @return [Module] base
def self.included(base)
  base.send(:extend, ClassMethods)
end
21
-
22
# DSL for creating terminals and productions.
#
# The registries are class variables of this module, so every class that
# extends ClassMethods reads and writes the same tables.
module ClassMethods
  # Each reader first checks that its class variable exists: touching an
  # uninitialised class variable raises NameError, so a bare `@@x || {}`
  # can never reach its fallback.

  # @return [Hash{Symbol => Proc}] registered production handlers
  #   (empty until {#production} has been called)
  def production_handlers
    (defined?(@@production_handlers) && @@production_handlers) || {}
  end

  # @return [Hash{Symbol, String, nil => Proc}] registered terminal handlers
  #   (empty until {#terminal} has been called with a block)
  def terminal_handlers
    (defined?(@@terminal_handlers) && @@terminal_handlers) || {}
  end

  # @return [Array<Array(Symbol, Regexp)>] `[term, regexp]` pairs in the
  #   order they were declared
  def patterns
    (defined?(@@patterns) && @@patterns) || []
  end

  # @return [Array<Symbol, String>] terminals declared with `:unescape`
  def unescape_terms
    (defined?(@@unescape_terms) && @@unescape_terms) || []
  end

  ##
  # Defines a production called during different phases of parsing
  # with data from the previous production along with data defined for the
  # current production.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current, block]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production; during :start it
  #   may be initialized with data to pass to further productions,
  #   during :finish it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to the parser for yielding to the calling reader,
  #   e.g. to generate a triple
  # @return [Proc] the registered handler
  def production(term, &block)
    @@production_handlers ||= {}
    @@production_handlers[term] = block
  end

  ##
  # Defines the pattern for a terminal node and a block to be invoked
  # when the terminal is encountered.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  def terminal(term, regexp, options = {}, &block)
    @@patterns ||= []
    @@patterns << [term, regexp] # Declaration order defines the evaluation sequence
    @@terminal_handlers ||= {}
    @@terminal_handlers[term] = block if block_given?
    @@unescape_terms ||= []
    @@unescape_terms << term if options[:unescape]
  end
end
90
-
91
##
# Runs the table-driven LL(1) parse over the input.
#
# Productions are expanded from the `:branch` table using the next token.
# Registered production handlers are invoked when a production is opened
# and closed, and terminal handlers when a terminal is accepted. Errors are
# logged while the parser tries to resynchronise, and are raised together
# as a single {Error} once processing ends.
#
# @example
#   require 'rdf/ll1/parser'
#
#   class Reader < RDF::Reader
#     include RDF::LL1::Parser
#
#     ##
#     # Called at :start and :finish of the production; the fifth argument
#     # is the block given to #parse.
#     production :object do |reader, phase, input, current, callback|
#       if phase == :finish
#         object = current[:resource]
#         callback.call(:statement, RDF::Statement.new(input[:subject], input[:predicate], object))
#       end
#     end
#
#     ##
#     # Defines the pattern for a terminal node
#     terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
#       input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
#     end
#
#     ##
#     # Iterates the given block for each RDF statement in the input.
#     #
#     # @yield  [statement]
#     # @yieldparam [RDF::Statement] statement
#     # @return [void]
#     def each_statement(&block)
#       parse(@input, START.to_sym, @options.merge(:branch => BRANCH)) do |context, *data|
#         case context
#         when :statement
#           block.call(*data)
#         end
#       end
#     end
#   end
#
# @param  [String, Lexer, #to_s] input
#   Text to tokenize, or an existing Lexer which is then used directly.
# @param [Symbol, #to_s] prod The starting production for the parser.
#   It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
# @param  [Hash{Symbol => Object}] options
#   The hash is copied; the caller's object is not modified. The copy is also
#   handed to the Lexer when one is created here.
# @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
#   LL1 branch table (required).
# @option options [Hash{Symbol,String => Array<Symbol,String>}] :first ({})
#   Lists valid terminals that can precede each production (for error recovery).
# @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
#   Lists valid terminals that can follow each production (for error recovery).
# @option options [Boolean] :validate (false)
#   Not read by this method itself; forwarded with the other options.
# @option options [Boolean] :progress
#   Show progress of parser productions
# @option options [Boolean, Array, :yield] :debug
#   Detailed debug output
# @yield [context, *data]
#   Yields to return data to the reader
# @yieldparam [:statement, :trace] context
#   Context for block
# @yieldparam [Symbol] *data
#   Data specific to the call
# @return [nil]
# @raise [Error] when the branch table or start production is missing, or
#   when any error was logged during the parse
# @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
def parse(input = nil, prod = nil, options = {}, &block)
  @options = options.dup
  @branch = options[:branch]
  # Plain `||` (not `||=`) so the caller's hash is never written to; a
  # frozen options hash is therefore acceptable.
  @first = options[:first] || {}
  @follow = options[:follow] || {}
  @lexer = if input.is_a?(Lexer)
    input
  else
    Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  end
  @productions = []
  @parse_callback = block
  @recovering = false
  @error_log = []
  # Names of the declared terminals; any other term in a sequence is
  # treated as a non-terminal to be pushed.
  terminals = self.class.patterns.map(&:first)

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
  todo_stack = [{:prod => prod, :terms => nil}]

  until todo_stack.empty?
    pushed = false
    if todo_stack.last[:terms].nil?
      # First visit of this production: choose its sequence from the branch table
      todo_stack.last[:terms] = []
      cur_prod = todo_stack.last[:prod]

      # Get the first valid token appropriate for the stacked productions,
      # skipping invalid tokens until either a valid token is found (from @first),
      # or a token appearing in @follow appears.
      token = skip_until_valid(todo_stack)

      # At this point, token is either nil, in the first set of the production,
      # or in the follow set of this production or any previous production
      debug("parse(production)") do
        "token #{token ? token.representation.inspect : 'nil'}, " +
        "prod #{cur_prod.inspect}, " +
        "depth #{depth}"
      end

      # Got an opened production
      onStart(cur_prod)
      break if token.nil?

      prod_branch = @branch[cur_prod]
      if prod_branch
        @recovering = false
        sequence = prod_branch[token.representation]
        # The empty message is passed explicitly so the options hash is not
        # bound to debug's `message` parameter.
        debug("parse(production)", "", :level => 2) do
          "token #{token.representation.inspect} " +
          "prod #{cur_prod.inspect}, " +
          "prod_branch #{prod_branch.keys.inspect}, " +
          "sequence #{sequence.inspect}"
        end

        if sequence
          todo_stack.last[:terms] += sequence
        elsif prod_branch.has_key?(:"ebnf:empty")
          debug("parse(production)", "", :level => 2) {"empty sequence for ebnf:empty"}
        end
        # Otherwise there is no sequence for this production: we're in
        # error recovery, and the token has been advanced to the point
        # where it can reasonably follow this production.
      else
        # Is this a fatal error?
        error("parse(fatal?)", "No branches found for #{cur_prod.inspect}",
          :production => cur_prod, :token => token)
      end
    end

    debug("parse(terms)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}"}
    until todo_stack.last[:terms].to_a.empty?
      # Get the next term in this sequence
      term = todo_stack.last[:terms].shift
      debug("parse(token)") {"accept #{term.inspect}"}
      if token = accept(term)
        @recovering = false
        debug("parse(token)") {"token #{token.inspect}, term #{term.inspect}"}
        onToken(term, token)
      elsif terminals.include?(term)
        # If term is a terminal, then it is an error if the token does not
        # match it
        skip_until_valid(todo_stack)
      else
        # Not a declared terminal, so it is a non-terminal: push the new state
        todo_stack << {:prod => term, :terms => nil}
        debug("parse(push)", "", :level => 2) {"term #{term.inspect}, depth #{depth}"}
        pushed = true
        break
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): stack entries are only ever built with :prod and :terms,
    # so `todo_stack.last[:term]` is nil here. Unless @follow has a nil key,
    # the second clause reduces to `@recovering`. Presumably :prod was
    # intended; left unchanged because correcting it alters recovery behaviour.
    while !pushed &&
          !todo_stack.empty? &&
          (todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "", :level => 2) {"todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}"}
      popped = todo_stack.last[:prod]
      @recovering = false if @follow[popped] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  until todo_stack.empty?
    debug("parse(eof)", "", :level => 2) {"stack #{todo_stack.last.inspect}, depth #{depth}"}
    if todo_stack.last[:terms].length > 0
      error("parse(eof)",
        "End of input before end of production: stack #{todo_stack.last.inspect}, depth #{depth}"
      )
    end
    todo_stack.pop
    onFinish
  end

  # When all is said and done, raise the error log
  raise Error, @error_log.join("\n\t") unless @error_log.empty?
end
293
-
294
# @return [Integer] number of productions currently open; 0 while
#   @productions has not been set
def depth
  @productions ? @productions.length : 0
end
295
-
296
- private
297
# Opens production +prod+.
#
# The production is always recorded on @productions. When a handler is
# registered for it, the handler's :start phase runs with the parent's
# data hash and a fresh hash for this production, which is then pushed
# onto @prod_data.
#
# @param [Symbol] prod production being opened
def onStart(prod)
  callback = self.class.production_handlers[prod]
  @productions << prod

  # No handler: only report progress (the token is looked up lazily, i.e.
  # only if progress output is enabled).
  return progress("#{prod}(:start)") { get_token.inspect } unless callback

  progress("#{prod}(:start):#{@prod_data.length}") { @prod_data.last }
  # The handler may customise the new data element before it is pushed
  fresh = {}
  callback.call(self, :start, @prod_data.last, fresh, @parse_callback)
  @prod_data << fresh
end
313
-
314
# Closes the innermost open production.
#
# When a handler is registered for it, this production's data hash is
# popped from @prod_data and passed to the handler's :finish phase together
# with the parent's data hash. The production is then removed from
# @productions.
#
# @return [Symbol, nil] the production that was closed
def onFinish
  closing = @productions.last
  callback = self.class.production_handlers[closing]
  if callback
    # Pop production data element from stack, potentially allowing handler to use it
    finished = @prod_data.pop
    callback.call(self, :finish, @prod_data.last, finished, @parse_callback)
    progress("#{closing}(:finish):#{@prod_data.length}") { @prod_data.last }
  else
    progress("#{closing}(:finish)", '')
  end
  @productions.pop
end
328
-
329
# Dispatches an accepted terminal to its handler.
#
# The handler registered for +prod+ is used; for String terminals with no
# handler of their own, the handler registered under +nil+ acts as a
# catch-all. The handler receives the reader, the innermost open
# production, the token and that production's data hash.
#
# @param [Symbol, String] prod the terminal that was matched
# @param [Token] token the token accepted for it
def onToken(prod, token)
  if @productions.empty?
    # Label the error with the terminal itself: there is no parent
    # production to name on this path.
    error("#{prod}(:token)", "Token has no parent production", :production => prod)
  else
    parent = @productions.last
    handler = self.class.terminal_handlers[prod]
    # Allows catch-all for simple string terminals
    handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String)
    if handler
      handler.call(self, parent, token, @prod_data.last)
      progress("#{prod}(:token)", "", :depth => (depth + 1)) {"#{token}: #{@prod_data.last}"}
    else
      progress("#{prod}(:token)", "", :depth => (depth + 1)) {token.to_s}
    end
  end
end
346
-
347
# Error-recovery scan: finds the next token the parser can continue with.
#
# Outside recovery mode the current token is returned at once when the top
# production may be empty (`ebnf:empty` in its branch entry) or the token
# is in the production's first set. Otherwise an error is reported and
# tokens are shifted off the lexer until one matches the top production's
# first set or the follow set of any stacked production. When the token
# found is not in the first set, the remaining terms of the top production
# are dropped.
#
# @param [Array<Hash>] todo_stack stack of `{:prod, :terms}` entries
# @return [Token, nil] the token to resume with, nil when input is exhausted
def skip_until_valid(todo_stack)
  top = todo_stack.last
  cur = top[:prod]
  tok = get_token
  firsts = @first[cur] || []

  unless @recovering
    entry = @branch[cur]
    usable = (entry && entry.has_key?(:"ebnf:empty")) || firsts.any? {|t| tok === t}
    return tok if usable
  end

  # Error condition: report what was expected, then resynchronise
  wanted = firsts.map(&:inspect).join(", ")
  error("skip_until_valid", "expected one of #{wanted}",
    :production => cur, :token => tok)

  debug("recovery", "stack follows:")
  todo_stack.reverse_each do |todo|
    debug("recovery") {" #{todo[:prod]}: #{@follow[todo[:prod]].inspect}"}
  end

  # Union of the follow sets of everything on the stack
  follows = todo_stack.reduce([]) {|acc, todo| acc + (@follow[todo[:prod]] || [])}.uniq
  debug("recovery") {"follows: #{follows.inspect}"}

  # Discard tokens until one is found in first or follows
  candidates = firsts + follows
  while (tok = get_token) && candidates.none? {|t| tok === t}
    dropped = @lexer.shift
    progress("recovery") {"skip #{dropped.inspect}"}
  end
  debug("recovery") {"found #{tok.inspect}"}

  # A first-set token is simply returned. Otherwise it is a follow token
  # and the rest of this production is abandoned.
  if !firsts.any? {|t| tok == t} && !top[:terms].empty?
    debug("recovery") {"token in follows, skip past #{top[:terms].inspect}"}
    top[:terms] = []
  end
  tok
end
397
-
398
##
# Records a parse error and switches the parser into recovery mode.
#
# The message is extended with the offending token, the current line
# number and (only when the :debug option is set) the production. It is
# appended to the error log unless the parser is already recovering, so a
# run of consecutive errors is logged once. The message is always passed
# on to #debug at level 0.
#
# @param [String] node        Relevant location associated with message
# @param [String] message     Error string
# @param [Hash] options
# @option options [URI, #to_s] :production
# @option options [Token] :token
def error(node, message, options = {})
  tok = options[:token]
  prod = options[:production]

  details = []
  details << ", found #{tok.representation.inspect}" if tok
  details << " at line #{@lineno}" if @lineno
  details << ", production = #{prod.inspect}" if prod && @options[:debug]
  message += details.join unless details.empty?

  @error_log << message unless @recovering
  @recovering = true
  debug(node, message, options.merge(:level => 0))
end
412
-
413
##
# Returns the lexer's current first token; nothing is shifted off here.
#
# When the lexer raises RDF::LL1::Lexer::Error, the error is reported
# through #error (entering error recovery) and the lexer is asked to
# recover to its next token. @lineno is updated from the returned token.
#
# @return [Token, nil]
def get_token
  token = begin
    @lexer.first
  rescue RDF::LL1::Lexer::Error => e
    # Recover from lexer error
    @lineno = e.lineno
    error("get_token", "With input '#{e.input}': #{e.message}",
      :production => @productions.last)

    # Retrieve next valid token
    recovered = @lexer.recover
    # The empty message is passed explicitly so the options hash is not
    # bound to debug's `message` parameter.
    debug("get_token", "", :level => 2) {"skipped to #{recovered.inspect}"}
    recovered
  end
  @lineno = token.lineno if token
  token
end
435
-
436
##
# Progress output when parsing.
#
# Does nothing unless the :progress or :debug option is set. With :debug
# the message is routed through #debug (level 0 unless overridden);
# otherwise it is written to $stderr, indented by the current depth.
#
# @param [String] node        Relevant location associated with message
# @param [Array<String>] args Message parts, joined with ","; a trailing
#   Hash is taken as options
# @option args [Integer] :depth
#   Recursion depth for indenting output
# @yieldreturn [#to_s] added to message
def progress(node, *args)
  return unless @options[:progress] || @options[:debug]
  options = args.last.is_a?(Hash) ? args.pop : {}
  message = args.join(",")
  message += yield.to_s if block_given?
  return debug(node, message, {:level => 0}.merge(options)) if @options[:debug]

  indent = ' ' * (options[:depth] || depth)
  $stderr.puts("[#{@lineno}]#{indent}#{node}: #{message}")
end
457
-
458
##
# Progress output when debugging.
#
# Does nothing unless the :debug option is set. The destination depends on
# that option's value: an Array collects the formatted lines, +true+
# writes them to $stderr, and :yield calls the parse callback with
# `:trace, node, message, options`.
#
# Both `debug(node, message, options)` and `debug(node, options)` are
# accepted; the latter is how the level-2 call sites in this file invoke it.
#
# @param [String] node          Relevant location associated with message
# @param [String, Hash] message ("") Message text, or the options hash when
#   no message is given
# @param [Hash] options
# @option options [Integer] :level (1)
#   Messages with a level above DEBUG_LEVEL are suppressed
# @option options [Integer] :depth
#   Recursion depth for indenting output
# @yieldreturn [#to_s] added to message
def debug(node, message = "", options = {})
  return unless @options[:debug]

  # A call such as debug("x", :level => 2) { ... } binds the hash to
  # +message+; treat it as the options it was meant to be.
  if message.is_a?(Hash)
    options = message.merge(options)
    message = ""
  end

  debug_level = options.fetch(:level, 1)
  return unless debug_level <= DEBUG_LEVEL
  indent = ' ' * (options[:depth] || depth)
  message += yield.to_s if block_given?
  str = "[#{@lineno}](#{debug_level})#{indent}#{node}: #{message}"
  case @options[:debug]
  when Array
    @options[:debug] << str
  when TrueClass
    $stderr.puts str
  when :yield
    @parse_callback.call(:trace, node, message, options)
  end
end
482
-
483
##
# Consumes the lexer's first token when it matches _type\_or\_value_
# (compared with Token#===); leaves the input alone otherwise.
#
# @param [Symbol, String] type_or_value
# @return [Token, nil] the token shifted off the lexer, or nil on no match
def accept(type_or_value)
  candidate = get_token
  return nil unless candidate && candidate === type_or_value

  debug("accept") {"#{candidate.inspect} === #{type_or_value.inspect}"}
  @lexer.shift
end
495
- public
496
-
497
##
# Raised for errors during parsing.
#
# @example Raising a parser error
#   raise Error.new(
#     "invalid token '%' on line 10",
#     :token => '%', :lineno => 9, :production => :turtleDoc)
#
# @see http://ruby-doc.org/core/classes/StandardError.html
class Error < StandardError
  # @!attribute [r] production
  #   @return [Symbol] the current production
  # @!attribute [r] token
  #   @return [String] the invalid token which triggered the error
  # @!attribute [r] lineno
  #   @return [Integer] the line number where the error occurred
  attr_reader :production, :token, :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param  [String, #to_s]          message
  # @param  [Hash{Symbol => Object}] options
  # @option options [Symbol]         :production (nil)
  # @option options [String]         :token (nil)
  # @option options [Integer]        :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options.values_at(:production, :token, :lineno)
    super(message.to_s)
  end
end # class Error
540
- end # class Reader
541
- end # module RDF::Turtle