rdf-turtle 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,462 @@
1
+ require 'rdf'
2
+ require 'rdf/ll1/lexer'
3
+
4
+ module RDF::LL1
5
+ ##
6
+ # A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
7
+ module Parser
8
+ ##
9
+ # @attr [Integer] lineno
10
+ attr_reader :lineno
11
+
12
def self.included(base)
  # Mixin hook: whenever this module is included into +base+, also extend
  # +base+ with ClassMethods so the class-level declarations
  # (+production+, +terminal+ and their accessors) become available on it.
  #
  # @param [Module] base the class or module performing the include
  base.extend ClassMethods
end
15
+
16
module ClassMethods
  # @return [Hash] handlers registered with {#production}, keyed by term;
  #   a fresh empty Hash when none have been registered
  def production_handlers
    @production_handlers || {}
  end

  # @return [Hash] handlers registered with {#terminal}, keyed by term;
  #   a fresh empty Hash when none have been registered
  def terminal_handlers
    @terminal_handlers || {}
  end

  # @return [Array<Array>] +[term, regexp]+ pairs in declaration order;
  #   a fresh empty Array when no terminal has been declared
  def patterns
    @patterns || []
  end

  # @return [Array] terms declared with +:unescape => true+;
  #   a fresh empty Array when no terminal has been declared
  def unescape_terms
    @unescape_terms || []
  end

  ##
  # Registers the handler for a production. The handler is invoked at
  # different phases of parsing with the data of the enclosing production
  # and the data Hash belonging to the production itself.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start, or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production. During :start it
  #   may be initialized with data to pass to further productions;
  #   during :finish it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to calling reader.
  #   Should conform to the yield specs for #initialize.
  #   Yield to generate a triple
  # @return [Proc, nil] the handler that was stored
  def production(term, &block)
    (@production_handlers ||= {})[term] = block
  end

  ##
  # Declares the pattern for a terminal and, optionally, a block to be
  # invoked when the terminal is encountered. The block is stored as given,
  # so a terminal declared without a block has a nil handler.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to calling reader.
  #   Should conform to the yield specs for #initialize
  # @return [Array, nil] the unescape list when +:unescape+ was set, otherwise nil
  def terminal(term, regexp, options = {}, &block)
    # Declaration order is kept: it defines the evaluation sequence of the patterns.
    (@patterns ||= []) << [term, regexp]
    (@terminal_handlers ||= {})[term] = block
    @unescape_terms ||= []
    @unescape_terms << term if options[:unescape]
  end
end
83
+
84
+ ##
85
+ # Initializes a new parser instance.
86
+ #
87
+ # Attempts to recover from errors.
88
+ #
89
+ # @example
90
+ # require 'rdf/ll1/parser'
91
+ #
92
+ # class Reader < RDF::Reader
93
+ # include RDF::LL1::Parser
94
+ #
95
+ # branch RDF::Turtle::Reader::BRANCH
96
+ #
97
+ # ##
98
+ # # Defines a production called during different phases of parsing
99
+ # # with data from previous production along with data defined for the
100
+ # # current production
101
+ # #
102
+ # # Yield to generate a triple
103
+ # production :object do |reader, phase, input, current|
104
+ # object = current[:resource]
105
+ # yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
106
+ # end
107
+ #
108
+ # ##
109
+ # # Defines the pattern for a terminal node
110
+ # terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
111
+ # input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
112
+ # end
113
+ #
114
+ # ##
115
+ # # Iterates the given block for each RDF statement in the input.
116
+ # #
117
+ # # @yield [statement]
118
+ # # @yieldparam [RDF::Statement] statement
119
+ # # @return [void]
120
+ # def each_statement(&block)
121
+ # @callback = block
122
+ #
123
+ # parse(START.to_sym) do |context, *data|
124
+ # case context
125
+ # when :statement
126
+ # yield *data
127
+ # end
128
+ # end
129
+ # end
130
+ #
131
+ # end
132
+ #
133
+ # @param [String, #to_s] input
134
+ # @param [Symbol, #to_s] prod The starting production for the parser.
135
+ # It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
136
+ # @param [Hash{Symbol => Object}] options
137
+ # @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
138
+ # LL1 branch table.
139
+ # @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
140
+ # Lists valid terminals that can follow each production (for error recovery).
141
+ # @option options [Boolean] :validate (false)
142
+ # whether to validate the parsed statements and values. If not validating,
143
+ # the parser will attempt to recover from errors.
144
+ # @option options [Boolean] :progress
145
+ # Show progress of parser productions
146
+ # @option options [Boolean] :debug
147
+ # Detailed debug output
148
+ # @yield [context, *data]
149
+ # Yields to return data to the reader
150
+ # @yieldparam [:statement, :trace] context
151
+ # Context for block
152
+ # @yieldparam [Symbol] *data
153
+ # Data specific to the call
154
+ # @return [RDF::LL1::Parser]
155
+ # @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
156
def parse(input = nil, prod = nil, options = {}, &block)
  # Table-driven LL(1) parse of +input+ starting at production +prod+.
  #
  # A stack of {:prod, :terms} entries tracks the productions that are open.
  # For each newly opened production the next token selects a sequence from
  # the branch table; the terms of that sequence are then either matched
  # against the lexer (terminals) or pushed as new productions.
  # onStart / onToken / onFinish are invoked as productions open, tokens
  # match and productions close.
  #
  # Raises Error when no branch table or starting production is given;
  # other problems go through #error, which raises only when validating
  # (or when the call is marked fatal) and otherwise logs via #debug.
  @options = options.dup
  @branch = options[:branch]
  # Default the follow table locally. The caller's Hash is not written to,
  # so frozen or shared option hashes are safe to pass in.
  @follow = options[:follow] || {}
  @lexer = if input.is_a?(Lexer)
    input
  else
    Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  end
  @productions = []
  @parse_callback = block
  @recovering = false
  terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
  todo_stack = [{:prod => prod, :terms => nil}]

  until todo_stack.empty?
    pushed = false
    if todo_stack.last[:terms].nil?
      # Freshly pushed production: choose its sequence from the next token
      todo_stack.last[:terms] = []
      begin
        token = @lexer.first
      rescue RDF::LL1::Lexer::Error => e
        # Recover from lexer error
        @lineno = e.lineno
        error("parse(production)", "With input '#{e.input}': #{e.message}",
              :production => @productions.last)

        # Retrieve next valid token
        token = @lexer.recover
      end
      @lineno = token.lineno if token
      debug("parse(production)",
            "#{token ? token.representation.inspect : 'nil'}, " +
            "prod #{todo_stack.last[:prod].inspect}, " +
            "depth #{depth}")

      # Got an opened production
      cur_prod = todo_stack.last[:prod]
      onStart(cur_prod)
      # No more input: leave the main loop; the stack is unwound below
      break if token.nil?

      prod_branch = @branch[cur_prod]
      if prod_branch
        sequence = prod_branch[token.representation]
        debug("parse(production)",
              "#{token.representation.inspect} " +
              "prod #{cur_prod.inspect}, " +
              "prod_branch #{prod_branch.keys.inspect}, " +
              "sequence #{sequence.inspect}")
        if sequence.nil?
          if prod_branch.has_key?(:"ebnf:empty")
            debug("parse(production)", "empty sequence for ebnf:empty")
          else
            expected = prod_branch.keys.map {|v| v.inspect}.join(", ")
            error("parse", "expected one of #{expected}",
                  :production => cur_prod, :token => token)

            # Skip input until we find something that can follow the current production
            skip_until_follow(todo_stack)
            todo_stack.last[:terms] = []
          end
        end
        @recovering = false
        todo_stack.last[:terms] += sequence if sequence
      else
        error("parse", "No branches found for #{cur_prod.inspect}",
              :production => cur_prod, :token => token)
        todo_stack.last[:terms] = []
      end
    end

    debug("parse(terms)", "todo #{todo_stack.last.inspect}, depth #{depth}")
    until todo_stack.last[:terms].to_a.empty?
      begin
        # Get the next term in this sequence
        term = todo_stack.last[:terms].shift
        token = accept(term)
        if token
          debug("parse(token)", "#{token.inspect}, term #{term.inspect}")
          @lineno = token.lineno if token
          onToken(term, token)
        elsif terminals.include?(term)
          error("parse", "#{term.inspect} expected",
                :production => todo_stack.last[:prod], :token => @lexer.first)

          # Recover until we find something that can follow this term
          skip_until_follow(todo_stack)
        else
          # Not a declared terminal, so it is a non-terminal: push the new state
          todo_stack << {:prod => term, :terms => nil}
          debug("parse(push)", "term #{term.inspect}, depth #{depth}")
          pushed = true
          break
        end
      rescue RDF::LL1::Lexer::Error => e
        # Skip forward for acceptable lexer input
        error("parse", "#{term.inspect} expected: #{e.message}",
              :production => todo_stack.last[:prod])
        @lexer.recover
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): stack entries are only ever built with :prod and :terms,
    # so todo_stack.last[:term] is nil here and the lookup is effectively
    # @follow[nil]. Kept as-is to preserve recovery behaviour; confirm
    # whether :prod was intended.
    while !pushed &&
          !todo_stack.empty? &&
          ( todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}")
      prod = todo_stack.last[:prod]
      @recovering = false if @follow[prod] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  until todo_stack.empty?
    debug("parse(eof)", "stack #{todo_stack.last.inspect}, depth #{depth}")
    todo_stack.pop
    onFinish
  end

rescue RDF::LL1::Lexer::Error => e
  # Lexer failures not handled at a recovery point above end the parse here
  @lineno = e.lineno
  error("parse", "With input '#{e.input}': #{e.message}",
        :production => @productions.last)
end
289
+
290
def depth
  # Nesting level used to indent trace output: the number of productions
  # currently open, or 0 when no production stack exists yet.
  @productions ? @productions.length : 0
end
291
+
292
+ private
293
+ # Start for production
294
def onStart(prod)
  # Opens production +prod+: records it on the production stack and, when a
  # handler is registered for it, calls the handler with phase :start.
  callback = self.class.production_handlers[prod]
  @productions << prod
  return progress("#{prod}(:start)", '') unless callback

  # The handler receives the parent's data plus a fresh Hash it may fill in;
  # that Hash is pushed only after the handler has run.
  progress("#{prod}(:start):#{@prod_data.length}", @prod_data.last)
  fresh = {}
  callback.call(self, :start, @prod_data.last, fresh, @parse_callback)
  @prod_data << fresh
end
309
+
310
+ # Finish of production
311
def onFinish
  # Closes the innermost open production. When a handler is registered for
  # it, the production's data Hash is popped and handed to the handler
  # (phase :finish) together with the data of the enclosing production.
  # Returns the production removed from the stack.
  closing = @productions.last
  callback = self.class.production_handlers[closing]
  if callback
    collected = @prod_data.pop
    # After the pop, @prod_data.last is the enclosing production's data
    callback.call(self, :finish, @prod_data.last, collected, @parse_callback)
    progress("#{closing}(:finish):#{@prod_data.length}", @prod_data.last)
  else
    progress("#{closing}(:finish)", '')
  end
  @productions.pop
end
324
+
325
+ # A token
326
def onToken(prod, token)
  # Dispatches a matched terminal to its handler.
  #
  # @param [Symbol, String] prod  the terminal that was matched
  # @param [Object] token         the token returned by the lexer
  #
  # The handler is looked up by +prod+; String terminals fall back to the
  # handler registered under nil (a catch-all for simple string terminals).
  # Handlers are called with the reader, the enclosing production, the token
  # and the data Hash on top of the production data stack.
  if @productions.empty?
    # There is no enclosing production to name, so label the report with
    # the terminal itself.
    error("#{prod}(:token)", "Token has no parent production", :production => prod)
  else
    parent_prod = @productions.last
    handler = self.class.terminal_handlers[prod]
    handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String) # Allows catch-all for simple string terminals
    if handler
      handler.call(self, parent_prod, token, @prod_data.last)
      progress("#{prod}(:token)", "#{token}: #{@prod_data.last}", :depth => (depth + 1))
    else
      progress("#{prod}(:token)", token.to_s, :depth => (depth + 1))
    end
  end
end
341
+
342
+ # Skip through the input stream until something is found that follows the last production with a list of follows
343
def skip_until_follow(todo_stack)
  # Error recovery: discards tokens until the next one matches an entry in
  # the follow list of any production on +todo_stack+, or input runs out.
  #
  # @param [Array<Hash>] todo_stack open productions, each with a :prod key
  debug("recovery", "stack follows:")
  todo_stack.each do |entry|
    debug("recovery", " #{entry[:prod]}: #{@follow[entry[:prod]].inspect}")
  end

  # Union of the follow lists of every open production
  resume_on = todo_stack.each_with_object([]) do |entry, acc|
    acc.concat(@follow[entry[:prod]] || [])
  end.uniq
  progress("recovery", "first #{@lexer.first.inspect}, follows: #{resume_on.inspect}")

  until !(token = @lexer.first) || resume_on.any? { |t| token === t }
    dropped = @lexer.shift
    progress("recovery", "skip #{dropped.inspect}")
  end
end
358
+
359
+ # @param [String] node Relevant location associated with message
+ # @param [String] message Error string
360
+ # @param [Hash] options
361
+ # @option options [URI, #to_s] :production
362
+ # @option options [Token] :token
363
def error(node, message, options = {})
  # Reports a parse error and switches the parser into recovery mode.
  #
  # While already recovering, further errors are ignored. Otherwise the
  # message is extended with the offending token and the current line and
  # is either raised as Error (when @options[:validate] is set or the call
  # passes :fatal) or sent to #debug.
  #
  # @raise [Error] when validating or when options[:fatal] is set; the
  #   exception carries the production, token and line number
  return if @recovering
  @recovering = true
  message += ", found #{options[:token].representation.inspect}" if options[:token]
  message += " at line #{@lineno}" if @lineno
  # NOTE(review): this tests the per-call options, not @options[:debug];
  # no caller in this file passes :debug, so the suffix is never added.
  message += ", production = #{options[:production].inspect}" if options[:production] && options[:debug]

  if @options[:validate] || options[:fatal]
    # Hand the context to the exception so its readers are populated
    raise Error.new(message,
                    :production => options[:production],
                    :token => options[:token],
                    :lineno => @lineno)
  end
  debug(node, message, options)
end
375
+
376
+ ##
377
+ # Progress output when parsing
378
+ # @param [String] node Relevant location associated with message
+ # @param [String] message
+ # @param [Hash] options
379
def progress(node, message, options = {})
  # Emits a progress line for +node+.
  #
  # With @options[:debug] set the call is forwarded to #debug and its
  # result returned. Otherwise a line is written to $stderr only when
  # @options[:progress] is set; options[:depth] overrides the indentation
  # that is otherwise taken from #depth.
  return debug(node, message, options) if @options[:debug]
  return unless @options[:progress]

  indent = ' ' * (options[:depth] || depth)
  $stderr.puts("[#{@lineno}]#{indent}#{node}: #{message}")
end
386
+
387
+ ##
388
+ # Progress output when debugging
389
+ # @param [String] node Relevant location associated with message
390
+ # @param [String] message
391
+ # @param [Hash] options
392
+ # @option options [Integer] :depth
393
+ # Recursion depth for indenting output
394
def debug(node, message, options = {})
  # Routes a debug line according to @options[:debug]:
  #   Array  - the formatted line is appended to it
  #   true   - the formatted line is written to $stderr
  #   :yield - the parse callback is called with :debug, node, message, options
  # Any other value produces no output.
  indent = ' ' * (options[:depth] || depth)
  line = "[#{@lineno}]#{indent}#{node}: #{message}"
  case @options[:debug]
  when Array then @options[:debug] << line
  when true then $stderr.puts line
  when :yield then @parse_callback.call(:debug, node, message, options)
  end
end
406
+
407
+ ##
408
+ # @param [Symbol, String] type_or_value
409
+ # @return [Token]
410
def accept(type_or_value)
  # Consumes and returns the next token when it matches +type_or_value+
  # (compared with token === type_or_value); returns nil and leaves the
  # lexer untouched when it does not match or no token is available.
  token = @lexer.first
  return unless token && token === type_or_value

  debug("accept", "#{token.inspect} === #{type_or_value.inspect}")
  @lexer.shift
end
416
+ public
417
+
418
+ ##
419
+ # Raised for errors during parsing.
420
+ #
421
+ # @example Raising a parser error
422
+ # raise Error.new(
423
+ # "invalid token '%' on line 10",
424
+ # :token => '%', :lineno => 9, :production => :turtleDoc)
425
+ #
426
+ # @see http://ruby-doc.org/core/classes/StandardError.html
427
class Error < StandardError
  ##
  # The production being processed when the error was raised.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param [String, #to_s] message
  # @param [Hash{Symbol => Object}] options
  # @option options [Symbol] :production (nil)
  # @option options [String] :token (nil)
  # @option options [Integer] :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options.values_at(:production, :token, :lineno)
    super(message.to_s)
  end
end # class Error
461
+ end # class Reader
462
+ end # module RDF::Turtle