rdf-turtle 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/AUTHORS +1 -0
- data/History +9 -0
- data/README.markdown +142 -0
- data/UNLICENSE +24 -0
- data/VERSION +1 -0
- data/lib/rdf/ll1/lexer.rb +458 -0
- data/lib/rdf/ll1/parser.rb +462 -0
- data/lib/rdf/ll1/scanner.rb +100 -0
- data/lib/rdf/turtle.rb +35 -0
- data/lib/rdf/turtle/format.rb +41 -0
- data/lib/rdf/turtle/meta.rb +1748 -0
- data/lib/rdf/turtle/patches.rb +38 -0
- data/lib/rdf/turtle/reader.rb +362 -0
- data/lib/rdf/turtle/terminals.rb +88 -0
- data/lib/rdf/turtle/writer.rb +562 -0
- metadata +115 -0
@@ -0,0 +1,462 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ll1/lexer'
|
3
|
+
|
4
|
+
module RDF::LL1
|
5
|
+
##
|
6
|
+
# A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).
|
7
|
+
module Parser
|
8
|
+
##
|
9
|
+
# @attr [Integer] lineno
|
10
|
+
attr_reader :lineno
|
11
|
+
|
12
|
+
# Mixin hook: whenever this module is included, the including class is
# also extended with ClassMethods so the `production` / `terminal`
# declarations become available at class level.
#
# @param [Module] base the class or module performing the include
# @return [Module] base
def self.included(base)
  base.extend ClassMethods
end
|
15
|
+
|
16
|
+
module ClassMethods
  # @return [Hash{Symbol => Proc}] handlers registered through {#production};
  #   a fresh empty Hash when nothing has been registered yet
  def production_handlers
    @production_handlers || {}
  end

  # @return [Hash{Symbol, String => Proc, nil}] handlers registered through
  #   {#terminal}; a fresh empty Hash when nothing has been registered yet
  def terminal_handlers
    @terminal_handlers || {}
  end

  # @return [Array<Array(Symbol, Regexp)>] `[term, regexp]` pairs in the order
  #   they were declared; a fresh empty Array when nothing has been declared
  def patterns
    @patterns || []
  end

  # @return [Array<Symbol, String>] terminals declared with `:unescape`;
  #   a fresh empty Array when nothing has been declared
  def unescape_terms
    @unescape_terms || []
  end

  ##
  # Defines a production called during different phases of parsing
  # with data from the previous production along with data defined for the
  # current production.
  #
  # @param [Symbol] term
  #   Term which is a key in the branch table
  # @yield [reader, phase, input, current]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] phase
  #   Phase of parsing, one of :start, or :finish
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Hash] current
  #   A Hash defined for the current production. During :start it
  #   may be initialized with data to pass to further productions;
  #   during :finish, it contains data placed by earlier productions
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to the calling reader.
  #   Should conform to the yield specs for #initialize.
  #   Yield to generate a triple
  # @return [Proc, nil] the handler that was stored
  def production(term, &block)
    (@production_handlers ||= {})[term] = block
  end

  ##
  # Defines the pattern for a terminal node and a block to be invoked
  # when the terminal is encountered. If the block is missing, the
  # value of the terminal will be placed on the input hash to be returned
  # to a previous production.
  #
  # @param [Symbol, String] term
  #   Defines a terminal production, which appears within a sequence in the branch table
  # @param [Regexp] regexp
  #   Pattern used to scan for this terminal
  # @param [Hash] options
  # @option options [Boolean] :unescape
  #   Cause strings and codepoints to be unescaped.
  # @yield [reader, term, token, input]
  # @yieldparam [RDF::Reader] reader
  #   Reader instance
  # @yieldparam [Symbol] term
  #   A symbol indicating the production which referenced this terminal
  # @yieldparam [String] token
  #   The scanned token
  # @yieldparam [Hash] input
  #   A Hash containing input from the parent production
  # @yieldparam [Proc] block
  #   Block passed to initialization for yielding to the calling reader.
  #   Should conform to the yield specs for #initialize
  def terminal(term, regexp, options = {}, &block)
    # Declaration order is preserved: it defines the evaluation sequence
    (@patterns ||= []) << [term, regexp]
    (@terminal_handlers ||= {})[term] = block
    @unescape_terms ||= []
    @unescape_terms << term if options[:unescape]
  end
end
|
83
|
+
|
84
|
+
##
|
85
|
+
# Initializes a new parser instance.
|
86
|
+
#
|
87
|
+
# Attempts to recover from errors.
|
88
|
+
#
|
89
|
+
# @example
|
90
|
+
# require 'rdf/ll1/parser'
|
91
|
+
#
|
92
|
+
# class Reader < RDF::Reader
|
93
|
+
# include RDF::LL1::Parser
|
94
|
+
#
|
95
|
+
# branch RDF::Turtle::Reader::BRANCH
|
96
|
+
#
|
97
|
+
# ##
|
98
|
+
# # Defines a production called during different phases of parsing
|
99
|
+
# # with data from previous production along with data defined for the
|
100
|
+
# # current production
|
101
|
+
# #
|
102
|
+
# # Yield to generate a triple
|
103
|
+
# production :object do |reader, phase, input, current|
|
104
|
+
# object = current[:resource]
|
105
|
+
# yield :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# ##
|
109
|
+
# # Defines the pattern for a terminal node
|
110
|
+
# terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |reader, production, token, input|
|
111
|
+
# input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
|
112
|
+
# end
|
113
|
+
#
|
114
|
+
# ##
|
115
|
+
# # Iterates the given block for each RDF statement in the input.
|
116
|
+
# #
|
117
|
+
# # @yield [statement]
|
118
|
+
# # @yieldparam [RDF::Statement] statement
|
119
|
+
# # @return [void]
|
120
|
+
# def each_statement(&block)
|
121
|
+
# @callback = block
|
122
|
+
#
|
123
|
+
# parse(START.to_sym) do |context, *data|
|
124
|
+
# case context
|
125
|
+
# when :statement
|
126
|
+
# yield *data
|
127
|
+
# end
|
128
|
+
# end
|
129
|
+
# end
|
130
|
+
#
|
131
|
+
# end
|
132
|
+
#
|
133
|
+
# @param [String, #to_s] input
|
134
|
+
# @param [Symbol, #to_s] prod The starting production for the parser.
|
135
|
+
# It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.
|
136
|
+
# @param [Hash{Symbol => Object}] options
|
137
|
+
# @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch
|
138
|
+
# LL1 branch table.
|
139
|
+
# @option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})
|
140
|
+
# Lists valid terminals that can follow each production (for error recovery).
|
141
|
+
# @option options [Boolean] :validate (false)
|
142
|
+
# whether to validate the parsed statements and values. If not validating,
|
143
|
+
# the parser will attempt to recover from errors.
|
144
|
+
# @option options [Boolean] :progress
|
145
|
+
# Show progress of parser productions
|
146
|
+
# @option options [Boolean] :debug
|
147
|
+
# Detailed debug output
|
148
|
+
# @yield [context, *data]
|
149
|
+
# Yields to return data to the reader
|
150
|
+
# @yieldparam [:statement, :trace] context
|
151
|
+
# Context for block
|
152
|
+
# @yieldparam [Symbol] *data
|
153
|
+
# Data specific to the call
|
154
|
+
# @return [RDF::LL1::Parser]
|
155
|
+
# @see http://cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf
|
156
|
+
def parse(input = nil, prod = nil, options = {}, &block)
|
157
|
+
@options = options.dup
|
158
|
+
@branch = options[:branch]
|
159
|
+
@follow = options[:follow] ||= {}
|
160
|
+
@lexer = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, @options.merge(:unescape_terms => self.class.unescape_terms))
|
161
|
+
@productions = []
|
162
|
+
@parse_callback = block
|
163
|
+
@recovering = false
|
164
|
+
terminals = self.class.patterns.map(&:first) # Get defined terminals to help with branching
|
165
|
+
|
166
|
+
# Unrecoverable errors
|
167
|
+
raise Error, "Branch table not defined" unless @branch && @branch.length > 0
|
168
|
+
raise Error, "Starting production not defined" unless prod
|
169
|
+
|
170
|
+
@prod_data = [{}]
|
171
|
+
prod = RDF::URI(prod).fragment.to_sym unless prod.is_a?(Symbol)
|
172
|
+
todo_stack = [{:prod => prod, :terms => nil}]
|
173
|
+
|
174
|
+
while !todo_stack.empty?
|
175
|
+
pushed = false
|
176
|
+
if todo_stack.last[:terms].nil?
|
177
|
+
todo_stack.last[:terms] = []
|
178
|
+
begin
|
179
|
+
token = @lexer.first
|
180
|
+
rescue RDF::LL1::Lexer::Error => e
|
181
|
+
# Recover from lexer error
|
182
|
+
@lineno = e.lineno
|
183
|
+
error("parse(production)", "With input '#{e.input}': #{e.message}",
|
184
|
+
:production => @productions.last)
|
185
|
+
|
186
|
+
# Retrieve next valid token
|
187
|
+
token = @lexer.recover
|
188
|
+
end
|
189
|
+
@lineno = token.lineno if token
|
190
|
+
debug("parse(production)",
|
191
|
+
"#{token ? token.representation.inspect : 'nil'}, " +
|
192
|
+
"prod #{todo_stack.last[:prod].inspect}, " +
|
193
|
+
"depth #{depth}")
|
194
|
+
|
195
|
+
# Got an opened production
|
196
|
+
cur_prod = todo_stack.last[:prod]
|
197
|
+
# Got an opened production
|
198
|
+
onStart(cur_prod)
|
199
|
+
break if token.nil?
|
200
|
+
|
201
|
+
if prod_branch = @branch[cur_prod]
|
202
|
+
sequence = prod_branch[token.representation]
|
203
|
+
debug("parse(production)",
|
204
|
+
"#{token.representation.inspect} " +
|
205
|
+
"prod #{cur_prod.inspect}, " +
|
206
|
+
"prod_branch #{prod_branch.keys.inspect}, " +
|
207
|
+
"sequence #{sequence.inspect}")
|
208
|
+
if sequence.nil?
|
209
|
+
if prod_branch.has_key?(:"ebnf:empty")
|
210
|
+
debug("parse(production)", "empty sequence for ebnf:empty")
|
211
|
+
else
|
212
|
+
expected = prod_branch.keys.map {|v| v.inspect}.join(", ")
|
213
|
+
error("parse", "expected one of #{expected}",
|
214
|
+
:production => cur_prod, :token => token)
|
215
|
+
|
216
|
+
# Skip input until we find something that can follow the current production
|
217
|
+
skip_until_follow(todo_stack)
|
218
|
+
todo_stack.last[:terms] = []
|
219
|
+
end
|
220
|
+
end
|
221
|
+
@recovering = false
|
222
|
+
todo_stack.last[:terms] += sequence if sequence
|
223
|
+
else
|
224
|
+
error("parse", "No branches found for #{cur_prod.inspect}",
|
225
|
+
:production => cur_prod, :token => token)
|
226
|
+
todo_stack.last[:terms] = []
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
debug("parse(terms)", "todo #{todo_stack.last.inspect}, depth #{depth}")
|
231
|
+
while !todo_stack.last[:terms].to_a.empty?
|
232
|
+
begin
|
233
|
+
# Get the next term in this sequence
|
234
|
+
term = todo_stack.last[:terms].shift
|
235
|
+
if token = accept(term)
|
236
|
+
debug("parse(token)", "#{token.inspect}, term #{term.inspect}")
|
237
|
+
@lineno = token.lineno if token
|
238
|
+
onToken(term, token)
|
239
|
+
elsif terminals.include?(term)
|
240
|
+
error("parse", "#{term.inspect} expected",
|
241
|
+
:production => todo_stack.last[:prod], :token => @lexer.first)
|
242
|
+
|
243
|
+
# Recover until we find something that can follow this term
|
244
|
+
skip_until_follow(todo_stack)
|
245
|
+
else
|
246
|
+
# If it's not a string (a symbol), it is a non-terminal and we push the new state
|
247
|
+
todo_stack << {:prod => term, :terms => nil}
|
248
|
+
debug("parse(push)", "term #{term.inspect}, depth #{depth}")
|
249
|
+
pushed = true
|
250
|
+
break
|
251
|
+
end
|
252
|
+
rescue RDF::LL1::Lexer::Error => e
|
253
|
+
# Skip forward for acceptable lexer input
|
254
|
+
error("parse", "#{term.inspect} expected: #{e.message}",
|
255
|
+
:production => todo_stack.last[:prod])
|
256
|
+
@lexer.recover
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
# After completing the last production in a sequence, pop down until we find a production
|
261
|
+
#
|
262
|
+
# If in recovery mode, continue popping until we find a term with a follow list
|
263
|
+
while !pushed &&
|
264
|
+
!todo_stack.empty? &&
|
265
|
+
( todo_stack.last[:terms].to_a.empty? ||
|
266
|
+
(@recovering && @follow[todo_stack.last[:term]].nil?))
|
267
|
+
debug("parse(pop)", "todo #{todo_stack.last.inspect}, depth #{depth}, recovering? #{@recovering.inspect}")
|
268
|
+
prod = todo_stack.last[:prod]
|
269
|
+
@recovering = false if @follow[prod] # Stop recovering when we might have a match
|
270
|
+
todo_stack.pop
|
271
|
+
onFinish
|
272
|
+
end
|
273
|
+
end
|
274
|
+
|
275
|
+
error("parse(eof)", "Finished processing before end of file", :token => @lexer.first) if @lexer.first
|
276
|
+
|
277
|
+
# Continue popping contexts off of the stack
|
278
|
+
while !todo_stack.empty?
|
279
|
+
debug("parse(eof)", "stack #{todo_stack.last.inspect}, depth #{depth}")
|
280
|
+
todo_stack.pop
|
281
|
+
onFinish
|
282
|
+
end
|
283
|
+
|
284
|
+
rescue RDF::LL1::Lexer::Error => e
|
285
|
+
@lineno = e.lineno
|
286
|
+
error("parse", "With input '#{e.input}': #{e.message}",
|
287
|
+
:production => @productions.last)
|
288
|
+
end
|
289
|
+
|
290
|
+
# Number of productions currently open; 0 before a parse has started.
#
# @return [Integer]
def depth
  @productions ? @productions.length : 0
end
|
291
|
+
|
292
|
+
private
|
293
|
+
# Opens a production: records it on the @productions stack and, when a
# handler is registered for it, invokes the handler for the :start phase.
#
# @param [Symbol] prod production being opened
def onStart(prod)
  start_handler = self.class.production_handlers[prod]
  @productions.push(prod)
  return progress("#{prod}(:start)", '') unless start_handler

  # Give the handler a fresh data element it may customize before the
  # element is pushed on the @prod_data stack
  parent_data = @prod_data.last
  progress("#{prod}(:start):#{@prod_data.length}", parent_data)
  current = {}
  start_handler.call(self, :start, parent_data, current, @parse_callback)
  @prod_data.push(current)
end
|
309
|
+
|
310
|
+
# Closes the innermost open production: invokes its handler (if any) for
# the :finish phase, then removes it from the @productions stack.
#
# @return [Symbol, nil] the production that was closed
def onFinish
  finished = @productions.last
  label = "#{finished}(:finish)"
  finish_handler = self.class.production_handlers[finished]
  if finish_handler
    # Pop this production's data element so the handler can fold it into
    # the parent's data
    current = @prod_data.pop
    finish_handler.call(self, :finish, @prod_data.last, current, @parse_callback)
    progress("#{label}:#{@prod_data.length}", @prod_data.last)
  else
    progress(label, '')
  end
  @productions.pop
end
|
324
|
+
|
325
|
+
# Dispatches an accepted terminal to its handler, if one is registered.
#
# @param [Symbol, String] prod  the terminal that matched
# @param [Object] token         the token returned by the lexer
def onToken(prod, token)
  parent = @productions.last
  # With no open production `parent` is nil, so the node label is "(:token)"
  if @productions.empty?
    return error("#{parent}(:token)", "Token has no parent production", :production => prod)
  end

  handlers = self.class.terminal_handlers
  callback = handlers[prod]
  callback ||= handlers[nil] if prod.is_a?(String) # Allows catch-all for simple string terminals

  detail =
    if callback
      callback.call(self, parent, token, @prod_data.last)
      "#{token}: #{@prod_data.last}"
    else
      token.to_s
    end
  progress("#{prod}(:token)", detail, :depth => (depth + 1))
end
|
341
|
+
|
342
|
+
# Error recovery: discards tokens from the lexer until the lookahead token
# matches an entry in the follow list of some production on the stack, or
# the input is exhausted.
#
# @param [Array<Hash>] todo_stack  open productions; each entry's :prod is
#   looked up in the follow table
def skip_until_follow(todo_stack)
  debug("recovery", "stack follows:")
  todo_stack.each do |entry|
    debug("recovery", " #{entry[:prod]}: #{@follow[entry[:prod]].inspect}")
  end

  # Union of the follow lists of every production currently on the stack
  follows = todo_stack.flat_map { |entry| @follow[entry[:prod]] || [] }.uniq
  progress("recovery", "first #{@lexer.first.inspect}, follows: #{follows.inspect}")

  while (lookahead = @lexer.first)
    break if follows.any? { |candidate| lookahead === candidate }
    dropped = @lexer.shift
    progress("recovery", "skip #{dropped.inspect}")
  end
end
|
358
|
+
|
359
|
+
# Reports a parse error. While the parser is already recovering, further
# errors are suppressed; otherwise the parser enters recovery mode and the
# error is either logged through #debug or raised.
#
# @param [String] node     location label passed to #debug
# @param [String] message  error description
# @param [Hash] options
# @option options [URI, #to_s] :production
#   production being processed; appended to the message when debugging
# @option options [Token] :token
#   offending token; its representation is appended to the message
# @option options [Boolean] :fatal
#   raise even when not validating
# @raise [Error] when the parser's :validate option or :fatal is set
def error(node, message, options = {})
  return if @recovering
  @recovering = true
  message += ", found #{options[:token].representation.inspect}" if options[:token]
  message += " at line #{@lineno}" if @lineno
  # The debug switch lives in the parser's @options; a per-call :debug is
  # still honoured for backward compatibility.
  if options[:production] && (options[:debug] || @options[:debug])
    message += ", production = #{options[:production].inspect}"
  end

  if !@options[:validate] && !options[:fatal]
    debug(node, message, options)
  else
    # Hand the structured context to Error so callers can inspect it
    raise Error.new(message,
                    :production => options[:production],
                    :token => options[:token],
                    :lineno => @lineno)
  end
end
|
375
|
+
|
376
|
+
##
# Progress output when parsing. Delegates to #debug when the :debug option
# is set; otherwise writes to $stderr only when the :progress option is set.
#
# @param [String] node     Relevant location associated with message
# @param [String] message
# @param [Hash] options
# @option options [Integer] :depth
#   Recursion depth for indenting output (defaults to the current #depth)
def progress(node, message, options = {})
  return debug(node, message, options) if @options[:debug]
  return unless @options[:progress]

  depth = options[:depth] || self.depth
  $stderr.puts("[#{@lineno}]#{' ' * depth}#{node}: #{message}")
end
|
386
|
+
|
387
|
+
##
# Progress output when debugging. The destination depends on the parser's
# :debug option: an Array collects formatted lines, `true` writes them to
# $stderr, and :yield forwards the raw arguments to the block given to
# #parse. Any other value produces no output.
#
# @param [String] node Relevant location associated with message
# @param [String] message
# @param [Hash] options
# @option options [Integer] :depth
#   Recursion depth for indenting output
def debug(node, message, options = {})
  mode = @options[:debug]
  # Called for every production and token: skip all formatting when
  # debugging is switched off
  return unless mode

  case mode
  when :yield
    # Debug output must not break a parse that was started without a block
    @parse_callback.call(:debug, node, message, options) if @parse_callback
  when Array, TrueClass
    depth = options[:depth] || self.depth
    str = "[#{@lineno}]#{' ' * depth}#{node}: #{message}"
    if mode == true
      $stderr.puts str
    else
      mode << str
    end
  end
end
|
406
|
+
|
407
|
+
##
# Consumes the lookahead token when it matches the given terminal.
#
# @param [Symbol, String] type_or_value
# @return [Token, nil] the consumed token, or nil when the lookahead does
#   not match (or there is no more input); nothing is consumed in that case
def accept(type_or_value)
  if (token = @lexer.first) && token === type_or_value
    debug("accept", "#{token.inspect} === #{type_or_value.inspect}")
    @lexer.shift
  end
end
|
416
|
+
public
|
417
|
+
|
418
|
+
##
# Raised for errors during parsing.
#
# @example Raising a parser error
#   raise Error.new(
#     "invalid token '%' on line 10",
#     :token => '%', :lineno => 9, :production => :turtleDoc)
#
# @see http://ruby-doc.org/core/classes/StandardError.html
class Error < StandardError
  ##
  # The current production.
  #
  # @return [Symbol]
  attr_reader :production

  ##
  # The invalid token which triggered the error.
  #
  # @return [String]
  attr_reader :token

  ##
  # The line number where the error occurred.
  #
  # @return [Integer]
  attr_reader :lineno

  ##
  # Initializes a new parser error instance.
  #
  # @param  [String, #to_s]          message
  # @param  [Hash{Symbol => Object}] options
  # @option options [Symbol]         :production (nil)
  # @option options [String]         :token (nil)
  # @option options [Integer]        :lineno (nil)
  def initialize(message, options = {})
    @production, @token, @lineno = options.values_at(:production, :token, :lineno)
    super(message.to_s)
  end
end # class Error
|
461
|
+
end # module Parser
|
462
|
+
end # module RDF::LL1
|