rdf-turtle 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +1 -0
- data/History +9 -0
- data/README.markdown +142 -0
- data/UNLICENSE +24 -0
- data/VERSION +1 -0
- data/lib/rdf/ll1/lexer.rb +458 -0
- data/lib/rdf/ll1/parser.rb +462 -0
- data/lib/rdf/ll1/scanner.rb +100 -0
- data/lib/rdf/turtle.rb +35 -0
- data/lib/rdf/turtle/format.rb +41 -0
- data/lib/rdf/turtle/meta.rb +1748 -0
- data/lib/rdf/turtle/patches.rb +38 -0
- data/lib/rdf/turtle/reader.rb +362 -0
- data/lib/rdf/turtle/terminals.rb +88 -0
- data/lib/rdf/turtle/writer.rb +562 -0
- metadata +115 -0
@@ -0,0 +1,462 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ll1/lexer'
|
3
|
+
|
4
|
+
module RDF::LL1
|
5
|
+
##
|
6
|
+
# A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
|
7
|
+
module Parser
|
8
|
+
##
|
9
|
+
# @attr [Integer] lineno
|
10
|
+
attr_reader :lineno
|
11
|
+
|
12
|
+
# Mixin hook: when this module is included into a class, extend that class
# with ClassMethods so the `production` and `terminal` DSL is available at
# class level.
#
# @param [Class, Module] base the class including this module
# @return [Object] the result of `base.extend`
def self.included(base)
  base.extend(ClassMethods)
end
|
15
|
+
|
16
|
+
module ClassMethods
  # @return [Hash] production handlers registered via {#production}, keyed by
  #   term; a new empty Hash when none have been registered
  def production_handlers; @production_handlers || {}; end

  # @return [Hash] terminal handlers registered via {#terminal}, keyed by
  #   term (the value is nil when a terminal was declared without a block);
  #   a new empty Hash when none have been registered
  def terminal_handlers; @terminal_handlers || {}; end

  # @return [Array<Array>] `[term, regexp]` pairs in the order the terminals
  #   were declared; a new empty Array when none have been declared
  def patterns; @patterns || []; end

  # @return [Array] terms declared with the `:unescape` option; a new empty
  #   Array when none have been declared
  def unescape_terms; @unescape_terms || []; end

  ##
  # Defines a production called during different phases of parsing
  # with data from the previous production along with data defined for the
  # current production.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current, callback]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start, or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production, during :start
  #   may be initialized with data to pass to further productions,
  #   during :finish, it contains data placed by earlier productions
  # @yieldparam [Proc] callback
  #   Block passed to #parse, for yielding to the calling reader
  #   (for example, to generate a triple).
  # @return [Proc] the registered block
  def production(term, &block)
    @production_handlers ||= {}
    @production_handlers[term] = block
  end

  ##
  # Defines the pattern for a terminal node and a block to be invoked
  # when the terminal is encountered.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  def terminal(term, regexp, options = {}, &block)
    @patterns ||= []
    @patterns << [term, regexp]  # Kept in declaration order to define the evaluation sequence
    @terminal_handlers ||= {}
    @terminal_handlers[term] = block
    @unescape_terms ||= []
    @unescape_terms << term if options[:unescape]
  end
end
|
83
|
+
|
84
|
+
##
|
85
|
+
# Initializes a new parser instance.
|
86
|
+
#
|
87
|
+
# Attempts to recover from errors.
|
88
|
+
#
|
89
|
+
# @example
|
90
|
+
# require 'rdf/ll1/parser'
|
91
|
+
#
|
92
|
+
# class Reader < RDF::Reader
|
93
|
+
# include RDF::LL1::Parser
|
94
|
+
#
|
95
|
+
# branch RDF::Turtle::Reader::BRANCH
|
96
|
+
#
|
97
|
+
# ##
|
98
|
+
# # Defines a production called during different phases of parsing
|
99
|
+
# # with data from previous production along with data defined for the
|
100
|
+
# # current production
|
101
|
+
# #
|
102
|
+
# # Yield to generate a triple
|
103
|
+
# production :object do |reader, phase, input, current|
|
104
|
+
# object = current[:resource]
|
105
|
+
# yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# ##
|
109
|
+
# # Defines the pattern for a terminal node
|
110
|
+
# terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
|
111
|
+
# input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
|
112
|
+
# end
|
113
|
+
#
|
114
|
+
# ##
|
115
|
+
# # Iterates the given block for each RDF statement in the input.
|
116
|
+
# #
|
117
|
+
# # @yield [statement]
|
118
|
+
# # @yieldparam [RDF::Statement] statement
|
119
|
+
# # @return [void]
|
120
|
+
# def each_statement(&block)
|
121
|
+
# @callback = block
|
122
|
+
#
|
123
|
+
# parse(START.to_sym) do |context, *data|
|
124
|
+
# case context
|
125
|
+
# when :statement
|
126
|
+
# yield *data
|
127
|
+
# end
|
128
|
+
# end
|
129
|
+
# end
|
130
|
+
#
|
131
|
+
# end
|
132
|
+
#
|
133
|
+
# @param [String, #to_s] input
|
134
|
+
# @param [Symbol, #to_s] prod The starting production for the parser.
|
135
|
+
# It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
|
136
|
+
# @param [Hash{Symbol => Object}] options
|
137
|
+
# @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
|
138
|
+
# LL1 branch table.
|
139
|
+
# @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
|
140
|
+
# Lists valid terminals that can follow each production (for error recovery).
|
141
|
+
# @option options [Boolean] :validate (false)
|
142
|
+
# whether to validate the parsed statements and values. If not validating,
|
143
|
+
# the parser will attempt to recover from errors.
|
144
|
+
# @option options [Boolean] :progress
|
145
|
+
# Show progress of parser productions
|
146
|
+
# @option options [Boolean] :debug
|
147
|
+
# Detailed debug output
|
148
|
+
# @yield [context, *data]
|
149
|
+
# Yields to return data to the reader
|
150
|
+
# @yieldparam [:statement, :trace] context
|
151
|
+
# Context for block
|
152
|
+
# @yieldparam [Symbol] *data
|
153
|
+
# Data specific to the call
|
154
|
+
# @return [RDF::LL1::Parser]
|
155
|
+
# @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
|
156
|
+
def parse(input = nil, prod = nil, options = {}, &block)
  # Implementation outline: a table-driven loop over `todo_stack`. Each entry
  # is a Hash with :prod (the production) and :terms (terms still to be
  # matched; nil until the production has been opened).
  @options = options.dup
  @branch = options[:branch]
  # Read :follow without writing a default back into the caller's Hash.
  @follow = options[:follow] || {}
  @lexer = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
  @productions = []
  @parse_callback = block
  @recovering = false
  terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless prod

  @prod_data = [{}]
  prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
  todo_stack = [{:prod => prod, :terms => nil}]

  while !todo_stack.empty?
    pushed = false
    if todo_stack.last[:terms].nil?
      # Production on top of the stack has not been opened yet
      todo_stack.last[:terms] = []
      begin
        token = @lexer.first
      rescue RDF::LL1::Lexer::Error => e
        # Recover from lexer error
        @lineno = e.lineno
        error("parse(production)", "With input '#{e.input}': #{e.message}",
              :production => @productions.last)

        # Retrieve next valid token
        token = @lexer.recover
      end
      @lineno = token.lineno if token
      debug("parse(production)",
            "#{token ? token.representation.inspect : 'nil'}, " +
            "prod #{todo_stack.last[:prod].inspect}, " +
            "depth #{depth}")

      # Got an opened production
      cur_prod = todo_stack.last[:prod]
      onStart(cur_prod)
      break if token.nil? # No more input: leave the main loop; the stack is unwound below

      if prod_branch = @branch[cur_prod]
        # Select the term sequence for this production from the look-ahead token
        sequence = prod_branch[token.representation]
        debug("parse(production)",
              "#{token.representation.inspect} " +
              "prod #{cur_prod.inspect}, " +
              "prod_branch #{prod_branch.keys.inspect}, " +
              "sequence #{sequence.inspect}")
        if sequence.nil?
          if prod_branch.has_key?(:"ebnf:empty")
            debug("parse(production)", "empty sequence for ebnf:empty")
          else
            expected = prod_branch.keys.map {|v| v.inspect}.join(", ")
            error("parse", "expected one of #{expected}",
                  :production => cur_prod, :token => token)

            # Skip input until we find something that can follow the current production
            skip_until_follow(todo_stack)
            todo_stack.last[:terms] = []
          end
        end
        @recovering = false
        todo_stack.last[:terms] += sequence if sequence
      else
        error("parse", "No branches found for #{cur_prod.inspect}",
              :production => cur_prod, :token => token)
        todo_stack.last[:terms] = []
      end
    end

    debug("parse(terms)", "todo #{todo_stack.last.inspect}, depth #{depth}")
    while !todo_stack.last[:terms].to_a.empty?
      begin
        # Get the next term in this sequence
        term = todo_stack.last[:terms].shift
        if token = accept(term)
          debug("parse(token)", "#{token.inspect}, term #{term.inspect}")
          @lineno = token.lineno if token
          onToken(term, token)
        elsif terminals.include?(term)
          error("parse", "#{term.inspect} expected",
                :production => todo_stack.last[:prod], :token => @lexer.first)

          # Recover until we find something that can follow this term
          skip_until_follow(todo_stack)
        else
          # Not accepted and not a declared terminal: treat it as a non-terminal and push the new state
          todo_stack << {:prod => term, :terms => nil}
          debug("parse(push)", "term #{term.inspect}, depth #{depth}")
          pushed = true
          break
        end
      rescue RDF::LL1::Lexer::Error => e
        # Skip forward for acceptable lexer input
        error("parse", "#{term.inspect} expected: #{e.message}",
              :production => todo_stack.last[:prod])
        @lexer.recover
      end
    end

    # After completing the last production in a sequence, pop down until we find a production
    #
    # If in recovery mode, continue popping until we find a term with a follow list
    #
    # NOTE(review): entries on todo_stack are only ever built with :prod and
    # :terms in this method, so `todo_stack.last[:term]` is nil and the lookup
    # below is `@follow[nil]`. Recovery popping is therefore ended by the
    # `@recovering = false if @follow[prod]` line in the loop body. Left
    # unchanged on purpose - confirm the intent before touching it.
    while !pushed &&
          !todo_stack.empty? &&
          ( todo_stack.last[:terms].to_a.empty? ||
            (@recovering && @follow[todo_stack.last[:term]].nil?))
      debug("parse(pop)", "todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}")
      prod = todo_stack.last[:prod]
      @recovering = false if @follow[prod] # Stop recovering when we might have a match
      todo_stack.pop
      onFinish
    end
  end

  error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  while !todo_stack.empty?
    debug("parse(eof)", "stack #{todo_stack.last.inspect}, depth #{depth}")
    todo_stack.pop
    onFinish
  end

rescue RDF::LL1::Lexer::Error => e
  # Lexer errors not handled by the inner rescues are reported through #error
  @lineno = e.lineno
  error("parse", "With input '#{e.input}': #{e.message}",
        :production => @productions.last)
end
|
289
|
+
|
290
|
+
# Number of productions currently open; 0 when no production stack exists yet.
#
# @return [Integer]
def depth
  open_productions = @productions || []
  open_productions.size
end
|
291
|
+
|
292
|
+
private
|
293
|
+
# Start of a production: records `prod` as open and, if a production handler
# is registered for it, invokes the handler for the :start phase.
#
# The handler receives the parent's data Hash and a fresh Hash for this
# production; the fresh Hash is pushed onto @prod_data only after the handler
# has run, so the handler can populate it first.
#
# @param [Symbol] prod production being opened
def onStart(prod)
  handler = self.class.production_handlers[prod]
  @productions << prod
  if handler
    # Create a new production data element, potentially allowing handler
    # to customize before pushing on the @prod_data stack
    progress("#{prod}(:start):#{@prod_data.length}", @prod_data.last)
    data = {}
    handler.call(self, :start, @prod_data.last, data, @parse_callback)
    @prod_data << data
  else
    progress("#{prod}(:start)", '')
  end
end
|
309
|
+
|
310
|
+
# Finish of the production on top of @productions: if a production handler is
# registered for it, pops that production's data Hash and invokes the handler
# for the :finish phase with the parent's data and the popped data. The
# production is then removed from @productions.
#
# @return [Object] the production that was popped
def onFinish
  prod = @productions.last
  handler = self.class.production_handlers[prod]
  if handler
    # Pop production data element from stack, potentially allowing handler to use it
    data = @prod_data.pop
    handler.call(self, :finish, @prod_data.last, data, @parse_callback)
    progress("#{prod}(:finish):#{@prod_data.length}", @prod_data.last)
  else
    progress("#{prod}(:finish)", '')
  end
  @productions.pop
end
|
324
|
+
|
325
|
+
# A token was accepted for terminal `prod`: invoke the terminal handler
# registered for it, if any, with the enclosing production and that
# production's data Hash.
#
# For String terminals with no specific handler, a handler registered under
# the nil key is used as a catch-all.
#
# @param [Symbol, String] prod terminal that matched
# @param [Object] token the accepted token
def onToken(prod, token)
  if @productions.empty?
    # No production is open, so there is no parent to name; label the
    # message with the terminal itself.
    error("#{prod}(:token)", "Token has no parent production", :production => prod)
  else
    parentProd = @productions.last
    handler = self.class.terminal_handlers[prod]
    handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String) # Allows catch-all for simple string terminals
    if handler
      handler.call(self, parentProd, token, @prod_data.last)
      progress("#{prod}(:token)", "#{token}: #{@prod_data.last}", :depth => (depth + 1))
    else
      progress("#{prod}(:token)", token.to_s, :depth => (depth + 1))
    end
  end
end
|
341
|
+
|
342
|
+
# Skip through the input stream until the next token matches something in the
# follow lists of the productions on the stack (or the input is exhausted).
#
# The follow lists of every production on `todo_stack` are combined, so the
# token that stops the skip may follow any open production.
#
# @param [Array<Hash>] todo_stack parser stack; only each entry's :prod is read
def skip_until_follow(todo_stack)
  debug("recovery", "stack follows:")
  todo_stack.each do |todo|
    debug("recovery", " #{todo[:prod]}: #{@follow[todo[:prod]].inspect}")
  end
  follows = todo_stack.inject([]) do |collected, todo|
    collected + (@follow[todo[:prod]] || [])
  end.uniq
  progress("recovery", "first #{@lexer.first.inspect}, follows: #{follows.inspect}")
  while (token = @lexer.first) && follows.none? {|t| token === t}
    skipped = @lexer.shift
    progress("recovery", "skip #{skipped.inspect}")
  end
end
|
358
|
+
|
359
|
+
# Report a parse error and enter recovery mode. Does nothing while already
# recovering, so only the first error of a recovery episode is reported.
#
# The message is extended with the offending token and current line number
# when known, and with the production when parser debugging is enabled.
# Unless validating (or the error is marked fatal) the message is sent to
# #debug; otherwise an Error is raised.
#
# @param [String] node Relevant location associated with message
# @param [String] message Error string
# @param [Hash] options
# @option options [URI, #to_s] :production
# @option options [Token] :token
# @option options [Boolean] :fatal raise even when not validating
# @raise [Error] when validating or when `:fatal` is set
def error(node, message, options = {})
  return if @recovering
  @recovering = true
  message += ", found #{options[:token].representation.inspect}" if options[:token]
  message += " at line #{@lineno}" if @lineno
  # Gate on the parser-wide :debug option; the per-call options never carry it.
  message += ", production = #{options[:production].inspect}" if options[:production] && @options[:debug]
  if !@options[:validate] && !options[:fatal]
    debug(node, message, options)
  else
    # Carry the context along so Error#production, #token and #lineno are populated.
    raise Error.new(message, options.merge(:lineno => @lineno))
  end
end
|
375
|
+
|
376
|
+
##
# Progress output when parsing. Delegates to #debug when the parser's :debug
# option is set; otherwise writes to $stderr only when :progress is set.
#
# @param [String] node Relevant location associated with message
# @param [String] message
# @param [Hash] options
# @option options [Integer] :depth
#   Recursion depth for indenting output; defaults to the current #depth
def progress(node, message, options = {})
  return debug(node, message, options) if @options[:debug]
  return unless @options[:progress]
  depth = options[:depth] || self.depth
  $stderr.puts("[#{@lineno}]#{' ' * depth}#{node}: #{message}")
end
|
386
|
+
|
387
|
+
##
# Progress output when debugging. The destination depends on the parser's
# :debug option: an Array collects formatted lines, `true` writes them to
# $stderr, and :yield passes the raw arguments to the block given to #parse.
# Any other value produces no output.
#
# @param [String] node Relevant location associated with message
# @param [String] message
# @param [Hash] options
# @option options [Integer] :depth
#   Recursion depth for indenting output
def debug(node, message, options = {})
  mode = @options[:debug]
  return unless mode # Debugging off: skip the formatting work entirely

  depth = options[:depth] || self.depth
  str = "[#{@lineno}]#{' ' * depth}#{node}: #{message}"
  case mode
  when Array
    mode << str
  when TrueClass
    $stderr.puts str
  when :yield
    # #parse may have been called without a block
    @parse_callback.call(:debug, node, message, options) if @parse_callback
  end
end
|
406
|
+
|
407
|
+
##
# Consume the next token if it matches `type_or_value` (compared with the
# token's `===`).
#
# @param [Symbol, String] type_or_value
# @return [Token] the consumed token, or nil when the input is exhausted or
#   the next token does not match (nothing is consumed in that case)
def accept(type_or_value)
  if (token = @lexer.first) && token === type_or_value
    debug("accept", "#{token.inspect} === #{type_or_value.inspect}")
    @lexer.shift
  end
end
|
416
|
+
public
|
417
|
+
|
418
|
+
##
# Raised for errors during parsing.
#
# @example Raising a parser error
#   raise Error.new(
#     "invalid token '%' on line 10",
#     :token => '%', :lineno => 9, :production => :turtleDoc)
#
# @see http://ruby-doc.org/core/classes/StandardError.html
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param [String, #to_s] message
  # @param [Hash{Symbol => Object}] options
  # @option options [Symbol] :production (nil)
  # @option options [String] :token (nil)
  # @option options [Integer] :lineno (nil)
  def initialize(message, options = {})
    @production = options[:production]
    @token = options[:token]
    @lineno = options[:lineno]
    super(message.to_s)
  end
end # class Error
|
461
|
+
end # module Parser
|
462
|
+
end # module RDF::LL1
|