rdf-turtle 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,462 @@
1
+ require 'rdf'
2
+ require 'rdf/ll1/lexer'
3
+
4
+ module RDF::LL1
5
+ ##
6
+ # A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
7
+ module Parser
8
+ ##
9
+ # @attr [Integer] lineno
10
+ attr_reader :lineno
11
+
12
##
# Mixin hook: invoked by Ruby when this module is included somewhere.
# Extends the including class with the ClassMethods DSL.
#
# @param [Module] base the class or module performing the include
# @return [Module] base
def self.included(base)
  base.extend ClassMethods
end
15
+
16
##
# Class-level DSL made available to every class that includes the parser.
# Registrations are kept in instance variables of the class object itself.
module ClassMethods
  # @return [Hash] handlers registered through {#production}, keyed by term
  #   (an empty Hash when nothing has been registered)
  def production_handlers
    @production_handlers || {}
  end

  # @return [Hash] handlers registered through {#terminal}, keyed by term
  #   (an empty Hash when nothing has been registered)
  def terminal_handlers
    @terminal_handlers || {}
  end

  # @return [Array<Array>] `[term, regexp]` pairs in the order they were declared
  def patterns
    @patterns || []
  end

  # @return [Array] terms declared with the `:unescape` option
  def unescape_terms
    @unescape_terms || []
  end

  ##
  # Registers the handler for a production. The handler is invoked during
  # the different phases of parsing with data from the previous production
  # along with data defined for the current production.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current, block]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start, or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production; during :start it
  #   may be initialized with data to pass to further productions,
  #   during :finish it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block given to the parser, used to yield results (such as a triple)
  #   back to the calling reader
  # @return [Proc] the registered handler
  def production(term, &block)
    (@production_handlers ||= {})[term] = block
  end

  ##
  # Declares the pattern for a terminal and, optionally, a block to invoke
  # when the terminal is encountered.
  #
  # @param [Symbol, String] term
  #   Name of the terminal, as it appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  def terminal(term, regexp, options = {}, &block)
    # Declaration order is preserved: it defines the evaluation sequence.
    (@patterns ||= []) << [term, regexp]
    (@terminal_handlers ||= {})[term] = block
    @unescape_terms ||= []
    @unescape_terms << term if options[:unescape]
  end
end
83
+
84
+ ##
85
+ # Initializes a new parser instance.
86
+ #
87
+ # Attempts to recover from errors.
88
+ #
89
+ # @example
90
+ # require 'rdf/ll1/parser'
91
+ #
92
+ # class Reader < RDF::Reader
93
+ # include RDF::LL1::Parser
94
+ #
95
+ # branch RDF::Turtle::Reader::BRANCH
96
+ #
97
+ # ##
98
+ # # Defines a production called during different phases of parsing
99
+ # # with data from previous production along with data defined for the
100
+ # # current production
101
+ # #
102
+ # # Yield to generate a triple
103
+ # production :object do |reader, phase, input, current|
104
+ # object = current[:resource]
105
+ # yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
106
+ # end
107
+ #
108
+ # ##
109
+ # # Defines the pattern for a terminal node
110
+ # terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
111
+ # input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
112
+ # end
113
+ #
114
+ # ##
115
+ # # Iterates the given block for each RDF statement in the input.
116
+ # #
117
+ # # @yield [statement]
118
+ # # @yieldparam [RDF::Statement] statement
119
+ # # @return [void]
120
+ # def each_statement(&block)
121
+ # @callback = block
122
+ #
123
+ # parse(START.to_sym) do |context, *data|
124
+ # case context
125
+ # when :statement
126
+ # yield *data
127
+ # end
128
+ # end
129
+ # end
130
+ #
131
+ # end
132
+ #
133
+ # @param [String, #to_s] input
134
+ # @param [Symbol, #to_s] prod The starting production for the parser.
135
+ # It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
136
+ # @param [Hash{Symbol => Object}] options
137
+ # @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
138
+ # LL1 branch table.
139
+ # @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
140
+ # Lists valid terminals that can follow each production (for error recovery).
141
+ # @option options [Boolean] :validate (false)
142
+ # whether to validate the parsed statements and values. If not validating,
143
+ # the parser will attempt to recover from errors.
144
+ # @option options [Boolean] :progress
145
+ # Show progress of parser productions
146
+ # @option options [Boolean] :debug
147
+ # Detailed debug output
148
+ # @yield [context, *data]
149
+ # Yields to return data to the reader
150
+ # @yieldparam [:statement, :trace] context
151
+ # Context for block
152
+ # @yieldparam [Symbol] *data
153
+ # Data specific to the call
154
+ # @return [RDF::LL1::Parser]
155
+ # @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
156
def parse(input = nil, prod = nil, options = {}, &block)
  @options = options.dup
  @branch = options[:branch]
  # Read the follow table without writing a default back into the caller's hash.
  @follow = options[:follow] || {}
  @lexer = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  @productions = []
  @parse_callback = block
  @recovering = false
  terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)

  # Each entry is an open production; :terms is nil until the production has
  # been expanded through the branch table, then holds the terms still to match.
  todo_stack = [{:prod => prod, :terms => nil}]

  while !todo_stack.empty?
    pushed = false
    if todo_stack.last[:terms].nil?
      todo_stack.last[:terms] = []
      begin
        token = @lexer.first
      rescue RDF::LL1::Lexer::Error => e
        # Recover from lexer error
        @lineno = e.lineno
        error("parse(production)", "With input '#{e.input}': #{e.message}",
              :production => @productions.last)

        # Retrieve next valid token
        token = @lexer.recover
      end
      @lineno = token.lineno if token
      debug("parse(production)",
            "#{token ? token.representation.inspect : 'nil'}, " +
            "prod #{todo_stack.last[:prod].inspect}, " +
            "depth #{depth}")

      # Got an opened production
      cur_prod = todo_stack.last[:prod]
      onStart(cur_prod)
      # No more input: leave the main loop; open productions are closed below.
      break if token.nil?

      if prod_branch = @branch[cur_prod]
        # Select the sequence to expand using the look-ahead token
        sequence = prod_branch[token.representation]
        debug("parse(production)",
              "#{token.representation.inspect} " +
              "prod #{cur_prod.inspect}, " +
              "prod_branch #{prod_branch.keys.inspect}, " +
              "sequence #{sequence.inspect}")
        if sequence.nil?
          if prod_branch.has_key?(:"ebnf:empty")
            debug("parse(production)", "empty sequence for ebnf:empty")
          else
            expected = prod_branch.keys.map {|v| v.inspect}.join(", ")
            error("parse", "expected one of #{expected}",
                  :production => cur_prod, :token => token)

            # Skip input until we find something that can follow the current production
            skip_until_follow(todo_stack)
            todo_stack.last[:terms] = []
          end
        end
        @recovering = false
        todo_stack.last[:terms] += sequence if sequence
      else
        error("parse", "No branches found for #{cur_prod.inspect}",
              :production => cur_prod, :token => token)
        todo_stack.last[:terms] = []
      end
    end

    debug("parse(terms)", "todo #{todo_stack.last.inspect}, depth #{depth}")
    while !todo_stack.last[:terms].to_a.empty?
      begin
        # Get the next term in this sequence
        term = todo_stack.last[:terms].shift
        if token = accept(term)
          debug("parse(token)", "#{token.inspect}, term #{term.inspect}")
          @lineno = token.lineno if token
          onToken(term, token)
        elsif terminals.include?(term)
          error("parse", "#{term.inspect} expected",
                :production => todo_stack.last[:prod], :token => @lexer.first)

          # Recover until we find something that can follow this term
          skip_until_follow(todo_stack)
        else
          # Not a declared terminal, so it is a non-terminal and we push the new state
          todo_stack << {:prod => term, :terms => nil}
          debug("parse(push)", "term #{term.inspect}, depth #{depth}")
          pushed = true
          break
        end
      rescue RDF::LL1::Lexer::Error => e
        # Skip forward for acceptable lexer input
        error("parse", "#{term.inspect} expected: #{e.message}",
              :production => todo_stack.last[:prod])
        @lexer.recover
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): todo entries only carry :prod and :terms, so
    # todo_stack.last[:term] is always nil here and the follow lookup uses a
    # nil key. Looks like :prod was intended -- confirm before changing, as
    # it alters how far recovery unwinds the stack.
    while !pushed &&
          !todo_stack.empty? &&
          ( todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}")
      prod = todo_stack.last[:prod]
      @recovering = false if @follow[prod] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  while !todo_stack.empty?
    debug("parse(eof)", "stack #{todo_stack.last.inspect}, depth #{depth}")
    todo_stack.pop
    onFinish
  end

rescue RDF::LL1::Lexer::Error => e
  # A lexer error that escaped the in-loop recovery points
  @lineno = e.lineno
  error("parse", "With input '#{e.input}': #{e.message}",
        :production => @productions.last)
end
289
+
290
##
# Current nesting depth: the number of productions presently open.
#
# @return [Integer] 0 when no production stack has been set up yet
def depth
  @productions ? @productions.length : 0
end
291
+
292
+ private
293
+ # Start for production
294
##
# Opens a production: records it on the production stack and, when a handler
# is registered for it, invokes the handler in the :start phase.
#
# @param [Symbol] prod production being opened
def onStart(prod)
  @productions << prod
  handler = self.class.production_handlers[prod]
  return progress("#{prod}(:start)", '') unless handler

  progress("#{prod}(:start):#{@prod_data.length}", @prod_data.last)
  # The handler may seed the fresh Hash before it becomes the current
  # production data (top of @prod_data).
  current = {}
  handler.call(self, :start, @prod_data.last, current, @parse_callback)
  @prod_data << current
end
309
+
310
+ # Finish of production
311
##
# Closes the innermost open production. When a handler is registered for it,
# its data Hash is popped off @prod_data and handed to the handler in the
# :finish phase, together with the Hash that is then on top.
def onFinish
  finished = @productions.last
  handler = self.class.production_handlers[finished]
  label = "#{finished}(:finish)"
  if handler
    # Pop production data element from stack, potentially allowing handler to use it
    current = @prod_data.pop
    handler.call(self, :finish, @prod_data.last, current, @parse_callback)
    progress("#{label}:#{@prod_data.length}", @prod_data.last)
  else
    progress(label, '')
  end
  @productions.pop
end
324
+
325
+ # A token
326
##
# Dispatches a matched terminal to its registered handler, if any.
#
# @param [Symbol, String] prod  terminal that was matched
# @param [Object] token         token returned by the lexer for that terminal
def onToken(prod, token)
  if @productions.empty?
    # There is no parent production to name here, so label the error with
    # the terminal itself.
    return error("#{prod}(:token)", "Token has no parent production", :production => prod)
  end

  parent_prod = @productions.last
  handler = self.class.terminal_handlers[prod]
  handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String) # Allows catch-all for simple string terminals
  if handler
    handler.call(self, parent_prod, token, @prod_data.last)
    progress("#{prod}(:token)", "#{token}: #{@prod_data.last}", :depth => (depth + 1))
  else
    progress("#{prod}(:token)", token.to_s, :depth => (depth + 1))
  end
end
341
+
342
+ # Skip through the input stream until something is found that follows the last production with a list of follows
343
##
# Error recovery: discards tokens until the next one matches a terminal listed
# in the follow table for any production currently on the stack, or until the
# lexer has nothing more to offer.
#
# @param [Array<Hash>] todo_stack  open productions; each entry's :prod is looked up in @follow
def skip_until_follow(todo_stack)
  debug("recovery", "stack follows:")
  follows = []
  todo_stack.each do |todo|
    candidates = @follow[todo[:prod]]
    debug("recovery", " #{todo[:prod]}: #{candidates.inspect}")
    # Array union keeps first-seen order and drops duplicates
    follows |= candidates if candidates
  end

  progress("recovery", "first #{@lexer.first.inspect}, follows: #{follows.inspect}")
  while (token = @lexer.first)
    break if follows.any? { |t| token === t }
    progress("recovery", "skip #{@lexer.shift.inspect}")
  end
end
358
+
359
+ # @param [String] node Relevant location associated with message
+ # @param [String] message Error string
360
+ # @param [Hash] options
361
+ # @option options [URI, #to_s] :production
362
+ # @option options [Token] :token
363
def error(node, message, options = {})
  # Only the first error of a recovery sequence is reported; later ones are
  # suppressed until @recovering is cleared again.
  return if @recovering
  @recovering = true

  message += ", found #{options[:token].representation.inspect}" if options[:token]
  message += " at line #{@lineno}" if @lineno
  # :debug is a parser-level option (@options), not a per-call option.
  message += ", production = #{options[:production].inspect}" if options[:production] && @options[:debug]

  if @options[:validate] || options[:fatal]
    # Populate the attributes that Error exposes to callers.
    raise Error.new(message,
                    :production => options[:production],
                    :token => options[:token],
                    :lineno => @lineno)
  else
    debug(node, message, options)
  end
end
375
+
376
+ ##
377
+ # Progress output when parsing
378
+ # @param [String] node Relevant location associated with message
+ # @param [String] message Progress message
379
def progress(node, message, options = {})
  # With debugging enabled, progress messages go through the debug channel.
  return debug(node, message, options) if @options[:debug]
  return unless @options[:progress]

  depth = options[:depth] || self.depth
  # One space of indentation per nesting level
  $stderr.puts("[#{@lineno}]#{' ' * depth}#{node}: #{message}")
end
386
+
387
+ ##
388
+ # Progress output when debugging
389
+ # @param [String] node Relevant location associated with message
390
+ # @param [String] message
391
+ # @param [Hash] options
392
+ # @option options [Integer] :depth
393
+ # Recursion depth for indenting output
394
def debug(node, message, options = {})
  sink = @options[:debug]
  # Nothing to do (and nothing to format) when debugging is off.
  return unless sink

  depth = options[:depth] || self.depth
  str = "[#{@lineno}]#{' ' * depth}#{node}: #{message}"
  case sink
  when Array
    # Collect messages for the caller
    sink << str
  when TrueClass
    $stderr.puts str
  when :yield
    # Hand the raw pieces to the block given to #parse; skipped when #parse
    # was called without a block.
    @parse_callback.call(:debug, node, message, options) if @parse_callback
  end
end
406
+
407
+ ##
408
+ # @param [Symbol, String] type_or_value
409
+ # @return [Token]
410
def accept(type_or_value)
  token = @lexer.first
  # Leave the token in the lexer unless it matches the requested terminal.
  return unless token && token === type_or_value

  debug("accept", "#{token.inspect} === #{type_or_value.inspect}")
  @lexer.shift
end
416
+ public
417
+
418
+ ##
419
+ # Raised for errors during parsing.
420
+ #
421
+ # @example Raising a parser error
422
+ # raise Error.new(
423
+ # "invalid token '%' on line 10",
424
+ # :token => '%', :lineno => 9, :production => :turtleDoc)
425
+ #
426
+ # @see http://ruby-doc.org/core/classes/StandardError.html
427
##
# Parser error carrying optional details about where parsing failed.
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param [String, #to_s] message
  # @param [Hash{Symbol => Object}] options
  # @option options [Symbol] :production (nil)
  # @option options [String] :token (nil)
  # @option options [Integer] :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options.values_at(:production, :token, :lineno)
    super(message.to_s)
  end
end # class Error
461
+ end # module Parser
462
+ end # module RDF::LL1