rdf-turtle 1.0.0 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,475 +0,0 @@
1
module RDF::LL1
  # Pull in the streaming scanner unless the host application already provides one.
  require 'rdf/ll1/scanner' unless defined?(Scanner)

  ##
  # A lexical analyzer
  #
  # @example Tokenizing a Turtle string
  #   terminals = [
  #     [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
  #     ...
  #   ]
  #   ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
  #   lexer = RDF::LL1::Lexer.tokenize(ttl, terminals)
  #   lexer.each_token do |token|
  #     puts token.inspect
  #   end
  #
  # @example Tokenizing and returning a token stream
  #   lexer = RDF::LL1::Lexer.tokenize(...)
  #   while :some-condition
  #     token = lexer.first # Get the current token
  #     token = lexer.shift # Get the current token and shift to the next
  #   end
  #
  # @example Handling error conditions
  #   begin
  #     RDF::Turtle::Lexer.tokenize(query)
  #   rescue RDF::Turtle::Lexer::Error => error
  #     warn error.inspect
  #   end
  #
  # @see http://en.wikipedia.org/wiki/Lexical_analysis
  class Lexer
    include Enumerable

    # Map of two-character backslash escape sequences to the literal
    # characters they denote (used by .unescape_string).
    ESCAPE_CHARS = {
      '\\t' => "\t", # \u0009 (tab)
      '\\n' => "\n", # \u000A (line feed)
      '\\r' => "\r", # \u000D (carriage return)
      '\\b' => "\b", # \u0008 (backspace)
      '\\f' => "\f", # \u000C (form feed)
      '\\"' => '"', # \u0022 (quotation mark, double quote mark)
      "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
      '\\\\' => '\\' # \u005C (backslash)
    }
    ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/ # \uXXXX — four-hex-digit codepoint escape
    ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/ # \UXXXXXXXX — eight-hex-digit codepoint escape
    ECHAR = /\\./ # More liberal unescaping
    UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/ # Either codepoint escape form
    COMMENT = /#.*/ # Single-line comment, to end of line (no newline)
    WS = / |\t|\r|\n/m # One whitespace character per match

    ML_START = /\'\'\'|\"\"\"/ # Beginning of terminals that may span lines

    ##
    # @!attribute whitespace
    # @return [Regexp] defines whitespace, defaults to WS
    attr_reader :whitespace

    ##
    # @!attribute comment
    # @return [Regexp] defines single-line comment, defaults to COMMENT
    attr_reader :comment
64
-
65
- ##
66
- # Returns a copy of the given `input` string with all `\uXXXX` and
67
- # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
68
- # unescaped UTF-8 character counterparts.
69
- #
70
- # @param [String] string
71
- # @return [String]
72
- # @see http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
73
- def self.unescape_codepoints(string)
74
- # Decode \uXXXX and \UXXXXXXXX code points:
75
- string = string.gsub(UCHAR) do |c|
76
- s = [(c[2..-1]).hex].pack('U*')
77
- s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
78
- end
79
-
80
- string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
81
- string
82
- end
83
-
84
- ##
85
- # Returns a copy of the given `input` string with all string escape
86
- # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
87
- # character counterparts.
88
- #
89
- # @param [String] input
90
- # @return [String]
91
- # @see http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
92
- def self.unescape_string(input)
93
- input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
94
- end
95
-
96
- ##
97
- # Tokenizes the given `input` string or stream.
98
- #
99
- # @param [String, #to_s] input
100
- # @param [Array<Array<Symbol, Regexp>>] terminals
101
- # Array of symbol, regexp pairs used to match terminals.
102
- # If the symbol is nil, it defines a Regexp to match string terminals.
103
- # @param [Hash{Symbol => Object}] options
104
- # @yield [lexer]
105
- # @yieldparam [Lexer] lexer
106
- # @return [Lexer]
107
- # @raise [Lexer::Error] on invalid input
108
- def self.tokenize(input, terminals, options = {}, &block)
109
- lexer = self.new(input, terminals, options)
110
- block_given? ? block.call(lexer) : lexer
111
- end
112
-
113
- ##
114
- # Initializes a new lexer instance.
115
- #
116
- # @param [String, #to_s] input
117
- # @param [Array<Array<Symbol, Regexp>>] terminals
118
- # Array of symbol, regexp pairs used to match terminals.
119
- # If the symbol is nil, it defines a Regexp to match string terminals.
120
- # @param [Hash{Symbol => Object}] options
121
- # @option options [Regexp] :whitespace (WS)
122
- # @option options [Regexp] :comment (COMMENT)
123
- # @option options [Array<Symbol>] :unescape_terms ([])
124
- # Regular expression matching the beginning of terminals that may cross newlines
125
- def initialize(input = nil, terminals = nil, options = {})
126
- @options = options.dup
127
- @whitespace = @options[:whitespace] || WS
128
- @comment = @options[:comment] || COMMENT
129
- @unescape_terms = @options[:unescape_terms] || []
130
- @terminals = terminals
131
-
132
- raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
133
-
134
- @lineno = 1
135
- @scanner = Scanner.new(input) do |string|
136
- string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
137
- string
138
- end
139
- end
140
-
141
    ##
    # Any additional options for the lexer.
    #
    # @return [Hash]
    attr_reader :options

    ##
    # The current input string being processed.
    #
    # NOTE(review): nothing in this file assigns `@input`; input is held by
    # the scanner instead — confirm whether callers still use this accessor.
    #
    # @return [String]
    attr_accessor :input

    ##
    # The current line number (one-based: initialized to 1 and incremented
    # for each newline consumed by skip_whitespace).
    #
    # @return [Integer]
    attr_reader :lineno
158
-
159
- ##
160
- # Returns `true` if the input string is lexically valid.
161
- #
162
- # To be considered valid, the input string must contain more than zero
163
- # terminals, and must not contain any invalid terminals.
164
- #
165
- # @return [Boolean]
166
- def valid?
167
- begin
168
- !count.zero?
169
- rescue Error
170
- false
171
- end
172
- end
173
-
174
- ##
175
- # Enumerates each token in the input string.
176
- #
177
- # @yield [token]
178
- # @yieldparam [Token] token
179
- # @return [Enumerator]
180
- def each_token(&block)
181
- if block_given?
182
- while token = shift
183
- yield token
184
- end
185
- end
186
- enum_for(:each_token)
187
- end
188
- alias_method :each, :each_token
189
-
190
    ##
    # Returns first token in input stream
    #
    # Memoizes the next token in `@first`: skips leading whitespace/comments,
    # then attempts a terminal match at the scan position. At end of input the
    # scanner is discarded (so later calls return nil); an unmatchable prefix
    # raises Error.
    #
    # @return [Token]
    # @raise [Error] when the remaining input matches no terminal
    def first
      return nil unless scanner

      @first ||= begin
        # Consume leading whitespace/comments. skip_whitespace always returns
        # nil, so this modifier-while evaluates its condition exactly once.
        {} while !scanner.eos? && skip_whitespace
        return @scanner = nil if scanner.eos?

        token = match_token

        if token.nil?
          # No terminal matched: report the offending word (text up to the
          # next whitespace/comment), falling back to the raw remainder.
          lexme = (scanner.rest.split(/#{@whitespace}|#{@comment}/).first rescue nil) || scanner.rest
          raise Error.new("Invalid token #{lexme[0..100].inspect}",
            :input => scanner.rest[0..100], :token => lexme, :lineno => lineno)
        end

        token
      end
    rescue ArgumentError, Encoding::CompatibilityError => e
      # Typically a bad byte sequence in the input; re-raise as a lexer Error.
      # NOTE(review): `lexme` is nil here unless the invalid-token branch above
      # already executed, so :token may be empty in this report.
      raise Error.new("#{e.message} on line #{lineno + 1}",
        :input => (scanner.rest[0..100] rescue '??'), :token => lexme, :lineno => lineno)
    rescue Error
      raise
    rescue
      # Unexpected exception class: note the surprise, then re-raise as-is.
      STDERR.puts "Expected ArgumentError, got #{$!.class}"
      raise
    end
220
-
221
- ##
222
- # Returns first token and shifts to next
223
- #
224
- # @return [Token]
225
- def shift
226
- cur = first
227
- @first = nil
228
- cur
229
- end
230
-
231
    ##
    # Skip input until a token is matched
    #
    # Repeatedly attempts to scan from the current position, advancing the
    # scanner one character after each failure, until a token matches or the
    # input is exhausted (in which case nil is returned).
    #
    # @return [Token]
    def recover
      until scanner.eos? do
        begin
          shift
          return first
        rescue Error, ArgumentError
          # Ignore errors until something scans, or EOS.
          scanner.pos = scanner.pos + 1
        end
      end
    end
246
  protected

    # The scanner over the remaining (unconsumed) input.
    # @return [StringScanner]
    attr_reader :scanner
250
-
251
- # Perform string and codepoint unescaping
252
- # @param [String] string
253
- # @return [String]
254
- def unescape(string)
255
- self.class.unescape_string(self.class.unescape_codepoints(string))
256
- end
257
-
258
- ##
259
- # Skip whitespace or comments, as defined through input options or defaults
260
- def skip_whitespace
261
- # skip all white space, but keep track of the current line number
262
- while !scanner.eos?
263
- if matched = scanner.scan(@whitespace)
264
- @lineno += matched.count("\n")
265
- elsif (com = scanner.scan(@comment))
266
- else
267
- return
268
- end
269
- end
270
- end
271
-
272
- ##
273
- # Return the matched token
274
- #
275
- # @return [Token]
276
- def match_token
277
- @terminals.each do |(term, regexp)|
278
- #STDERR.puts "match[#{term}] #{scanner.rest[0..100].inspect} against #{regexp.inspect}" #if term == :STRING_LITERAL_SINGLE_QUOTE
279
- if matched = scanner.scan(regexp)
280
- matched = unescape(matched) if @unescape_terms.include?(term)
281
- #STDERR.puts " unescape? #{@unescape_terms.include?(term).inspect}"
282
- #STDERR.puts " matched #{term.inspect}: #{matched.inspect}"
283
- return token(term, matched)
284
- end
285
- end
286
- nil
287
- end
288
-
289
  # NOTE(review): `protected` already appears earlier in this class; this
  # second marker is redundant but harmless.
  protected

    ##
    # Constructs a new token object annotated with the current line number.
    #
    # The parser relies on the type being a symbolized URI and the value being
    # a string, if there is no type. If there is a type, then the value takes
    # on the native representation appropriate for that type.
    #
    # @param [Symbol] type
    # @param [String] value
    #   Scanner instance with access to matched groups
    # @return [Token]
    def token(type, value)
      Token.new(type, value, :lineno => lineno)
    end
305
-
306
- ##
307
- # Represents a lexer token.
308
- #
309
- # @example Creating a new token
310
- # token = RDF::LL1::Lexer::Token.new(:LANGTAG, "en")
311
- # token.type #=> :LANGTAG
312
- # token.value #=> "en"
313
- #
314
- # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
315
- class Token
316
- ##
317
- # Initializes a new token instance.
318
- #
319
- # @param [Symbol] type
320
- # @param [String] value
321
- # @param [Hash{Symbol => Object}] options
322
- # @option options [Integer] :lineno (nil)
323
- def initialize(type, value, options = {})
324
- @type, @value = (type ? type.to_s.to_sym : nil), value
325
- @options = options.dup
326
- @lineno = @options.delete(:lineno)
327
- end
328
-
329
- ##
330
- # The token's symbol type.
331
- #
332
- # @return [Symbol]
333
- attr_reader :type
334
-
335
- ##
336
- # The token's value.
337
- #
338
- # @return [String]
339
- attr_reader :value
340
-
341
- ##
342
- # The line number where the token was encountered.
343
- #
344
- # @return [Integer]
345
- attr_reader :lineno
346
-
347
- ##
348
- # Any additional options for the token.
349
- #
350
- # @return [Hash]
351
- attr_reader :options
352
-
353
- ##
354
- # Returns the attribute named by `key`.
355
- #
356
- # @param [Symbol] key
357
- # @return [Object]
358
- def [](key)
359
- key = key.to_s.to_sym unless key.is_a?(Integer) || key.is_a?(Symbol)
360
- case key
361
- when 0, :type then @type
362
- when 1, :value then @value
363
- else nil
364
- end
365
- end
366
-
367
- ##
368
- # Returns `true` if the given `value` matches either the type or value
369
- # of this token.
370
- #
371
- # @example Matching using the symbolic type
372
- # RDF::LL1::Lexer::Token.new(:NIL) === :NIL #=> true
373
- #
374
- # @example Matching using the string value
375
- # RDF::LL1::Lexer::Token.new(nil, "{") === "{" #=> true
376
- #
377
- # @param [Symbol, String] value
378
- # @return [Boolean]
379
- def ===(value)
380
- case value
381
- when Symbol then value == @type
382
- when ::String then value.to_s == @value.to_s
383
- else value == @value
384
- end
385
- end
386
-
387
- ##
388
- # Returns a hash table representation of this token.
389
- #
390
- # @return [Hash]
391
- def to_hash
392
- {:type => @type, :value => @value}
393
- end
394
-
395
- ##
396
- # Readable version of token
397
- def to_s
398
- @type ? @type.inspect : @value
399
- end
400
-
401
- ##
402
- # Returns type, if not nil, otherwise value
403
- def representation
404
- @type ? @type : @value
405
- end
406
-
407
- ##
408
- # Returns an array representation of this token.
409
- #
410
- # @return [Array]
411
- def to_a
412
- [@type, @value]
413
- end
414
-
415
- ##
416
- # Returns a developer-friendly representation of this token.
417
- #
418
- # @return [String]
419
- def inspect
420
- to_hash.inspect
421
- end
422
- end # class Token
423
-
424
- ##
425
- # Raised for errors during lexical analysis.
426
- #
427
- # @example Raising a lexer error
428
- # raise RDF::LL1::Lexer::Error.new(
429
- # "invalid token '%' on line 10",
430
- # :input => query, :token => '%', :lineno => 9)
431
- #
432
- # @see http://ruby-doc.org/core/classes/StandardError.html
433
- class Error < StandardError
434
- ##
435
- # The input string associated with the error.
436
- #
437
- # @return [String]
438
- attr_reader :input
439
-
440
- ##
441
- # The invalid token which triggered the error.
442
- #
443
- # @return [String]
444
- attr_reader :token
445
-
446
- ##
447
- # The line number where the error occurred.
448
- #
449
- # @return [Integer]
450
- attr_reader :lineno
451
-
452
- ##
453
- # Initializes a new lexer error instance.
454
- #
455
- # @param [String, #to_s] message
456
- # @param [Hash{Symbol => Object}] options
457
- # @option options [String] :input (nil)
458
- # @option options [String] :token (nil)
459
- # @option options [Integer] :lineno (nil)
460
- def initialize(message, options = {})
461
- @input = options[:input]
462
- @token = options[:token]
463
- @lineno = options[:lineno]
464
- super(message.to_s)
465
- end
466
- end # class Error
467
-
468
- unless "".respond_to?(:force_encoding)
469
- # Compatibility with 1.9 Encoding
470
- module Encoding
471
- class CompatibilityError < StandardError; end
472
- end
473
- end
474
- end # class Lexer
475
- end # module RDF::LL1