ebnf 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,475 @@
1
+ module RDF::LL1
2
+ require 'rdf/ll1/scanner' unless defined?(Scanner)
3
+
4
+ ##
5
+ # A lexical analyzer
6
+ #
7
+ # @example Tokenizing a Turtle string
8
+ # terminals = [
9
+ # [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
10
+ # ...
11
+ # ]
12
+ # ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
13
+ # lexer = RDF::LL1::Lexer.tokenize(ttl, terminals)
14
+ # lexer.each_token do |token|
15
+ # puts token.inspect
16
+ # end
17
+ #
18
+ # @example Tokenizing and returning a token stream
19
+ # lexer = RDF::LL1::Lexer.tokenize(...)
20
+ # while :some-condition
21
+ # token = lexer.first # Get the current token
22
+ # token = lexer.shift # Get the current token and shift to the next
23
+ # end
24
+ #
25
+ # @example Handling error conditions
26
+ # begin
27
+ # RDF::Turtle::Lexer.tokenize(query)
28
+ # rescue RDF::Turtle::Lexer::Error => error
29
+ # warn error.inspect
30
+ # end
31
+ #
32
+ # @see http://en.wikipedia.org/wiki/Lexical_analysis
33
+ class Lexer
34
+ include Enumerable
35
+
36
+ ESCAPE_CHARS = {
37
+ '\\t' => "\t", # \u0009 (tab)
38
+ '\\n' => "\n", # \u000A (line feed)
39
+ '\\r' => "\r", # \u000D (carriage return)
40
+ '\\b' => "\b", # \u0008 (backspace)
41
+ '\\f' => "\f", # \u000C (form feed)
42
+ '\\"' => '"', # \u0022 (quotation mark, double quote mark)
43
+ "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
44
+ '\\\\' => '\\' # \u005C (backslash)
45
+ }
46
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/ # \uXXXX
47
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/ # \UXXXXXXXX
48
+ ECHAR = /\\./ # More liberal unescaping
49
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/
50
+ COMMENT = /#.*/
51
+ WS = / |\t|\r|\n/m
52
+
53
+ ML_START = /\'\'\'|\"\"\"/ # Beginning of terminals that may span lines
54
+
55
+ ##
56
+ # @!attribute whitespace
57
+ # @return [Regexp] defines whitespace, defaults to WS
58
+ attr_reader :whitespace
59
+
60
+ ##
61
+ # @!attribute comment
62
+ # @return [Regexp] defines single-line comment, defaults to COMMENT
63
+ attr_reader :comment
64
+
65
+ ##
66
+ # Returns a copy of the given `input` string with all `\uXXXX` and
67
+ # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
68
+ # unescaped UTF-8 character counterparts.
69
+ #
70
+ # @param [String] string
71
+ # @return [String]
72
+ # @see http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
73
+ def self.unescape_codepoints(string)
74
+ # Decode \uXXXX and \UXXXXXXXX code points:
75
+ string = string.gsub(UCHAR) do |c|
76
+ s = [(c[2..-1]).hex].pack('U*')
77
+ s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
78
+ end
79
+
80
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
81
+ string
82
+ end
83
+
84
+ ##
85
+ # Returns a copy of the given `input` string with all string escape
86
+ # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
87
+ # character counterparts.
88
+ #
89
+ # @param [String] input
90
+ # @return [String]
91
+ # @see http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
92
+ def self.unescape_string(input)
93
+ input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
94
+ end
95
+
96
+ ##
97
+ # Tokenizes the given `input` string or stream.
98
+ #
99
+ # @param [String, #to_s] input
100
+ # @param [Array<Array<Symbol, Regexp>>] terminals
101
+ # Array of symbol, regexp pairs used to match terminals.
102
+ # If the symbol is nil, it defines a Regexp to match string terminals.
103
+ # @param [Hash{Symbol => Object}] options
104
+ # @yield [lexer]
105
+ # @yieldparam [Lexer] lexer
106
+ # @return [Lexer]
107
+ # @raise [Lexer::Error] on invalid input
108
+ def self.tokenize(input, terminals, options = {}, &block)
109
+ lexer = self.new(input, terminals, options)
110
+ block_given? ? block.call(lexer) : lexer
111
+ end
112
+
113
+ ##
114
+ # Initializes a new lexer instance.
115
+ #
116
+ # @param [String, #to_s] input
117
+ # @param [Array<Array<Symbol, Regexp>>] terminals
118
+ # Array of symbol, regexp pairs used to match terminals.
119
+ # If the symbol is nil, it defines a Regexp to match string terminals.
120
+ # @param [Hash{Symbol => Object}] options
121
+ # @option options [Regexp] :whitespace (WS)
122
+ # @option options [Regexp] :comment (COMMENT)
123
+ # @option options [Array<Symbol>] :unescape_terms ([])
124
+ # Regular expression matching the beginning of terminals that may cross newlines
125
+ def initialize(input = nil, terminals = nil, options = {})
126
+ @options = options.dup
127
+ @whitespace = @options[:whitespace] || WS
128
+ @comment = @options[:comment] || COMMENT
129
+ @unescape_terms = @options[:unescape_terms] || []
130
+ @terminals = terminals
131
+
132
+ raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
133
+
134
+ @lineno = 1
135
+ @scanner = Scanner.new(input) do |string|
136
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
137
+ string
138
+ end
139
+ end
140
+
141
+ ##
142
+ # Any additional options for the lexer.
143
+ #
144
+ # @return [Hash]
145
+ attr_reader :options
146
+
147
+ ##
148
+ # The current input string being processed.
149
+ #
150
+ # @return [String]
151
+ attr_accessor :input
152
+
153
+ ##
154
+ # The current line number (zero-based).
155
+ #
156
+ # @return [Integer]
157
+ attr_reader :lineno
158
+
159
+ ##
160
+ # Returns `true` if the input string is lexically valid.
161
+ #
162
+ # To be considered valid, the input string must contain more than zero
163
+ # terminals, and must not contain any invalid terminals.
164
+ #
165
+ # @return [Boolean]
166
+ def valid?
167
+ begin
168
+ !count.zero?
169
+ rescue Error
170
+ false
171
+ end
172
+ end
173
+
174
+ ##
175
+ # Enumerates each token in the input string.
176
+ #
177
+ # @yield [token]
178
+ # @yieldparam [Token] token
179
+ # @return [Enumerator]
180
+ def each_token(&block)
181
+ if block_given?
182
+ while token = shift
183
+ yield token
184
+ end
185
+ end
186
+ enum_for(:each_token)
187
+ end
188
+ alias_method :each, :each_token
189
+
190
+ ##
191
+ # Returns first token in input stream
192
+ #
193
+ # @return [Token]
194
+ def first
195
+ return nil unless scanner
196
+
197
+ @first ||= begin
198
+ {} while !scanner.eos? && skip_whitespace
199
+ return @scanner = nil if scanner.eos?
200
+
201
+ token = match_token
202
+
203
+ if token.nil?
204
+ lexme = (scanner.rest.split(/#{@whitespace}|#{@comment}/).first rescue nil) || scanner.rest
205
+ raise Error.new("Invalid token #{lexme[0..100].inspect}",
206
+ :input => scanner.rest[0..100], :token => lexme, :lineno => lineno)
207
+ end
208
+
209
+ token
210
+ end
211
+ rescue ArgumentError, Encoding::CompatibilityError => e
212
+ raise Error.new("#{e.message} on line #{lineno + 1}",
213
+ :input => (scanner.rest[0..100] rescue '??'), :token => lexme, :lineno => lineno)
214
+ rescue Error
215
+ raise
216
+ rescue
217
+ STDERR.puts "Expected ArgumentError, got #{$!.class}"
218
+ raise
219
+ end
220
+
221
+ ##
222
+ # Returns first token and shifts to next
223
+ #
224
+ # @return [Token]
225
+ def shift
226
+ cur = first
227
+ @first = nil
228
+ cur
229
+ end
230
+
231
+ ##
232
+ # Skip input until a token is matched
233
+ #
234
+ # @return [Token]
235
+ def recover
236
+ until scanner.eos? do
237
+ begin
238
+ shift
239
+ return first
240
+ rescue Error, ArgumentError
241
+ # Ignore errors until something scans, or EOS.
242
+ scanner.pos = scanner.pos + 1
243
+ end
244
+ end
245
+ end
246
+ protected
247
+
248
+ # @return [StringScanner]
249
+ attr_reader :scanner
250
+
251
+ # Perform string and codepoint unescaping
252
+ # @param [String] string
253
+ # @return [String]
254
+ def unescape(string)
255
+ self.class.unescape_string(self.class.unescape_codepoints(string))
256
+ end
257
+
258
+ ##
259
+ # Skip whitespace or comments, as defined through input options or defaults
260
+ def skip_whitespace
261
+ # skip all white space, but keep track of the current line number
262
+ while !scanner.eos?
263
+ if matched = scanner.scan(@whitespace)
264
+ @lineno += matched.count("\n")
265
+ elsif (com = scanner.scan(@comment))
266
+ else
267
+ return
268
+ end
269
+ end
270
+ end
271
+
272
+ ##
273
+ # Return the matched token
274
+ #
275
+ # @return [Token]
276
+ def match_token
277
+ @terminals.each do |(term, regexp)|
278
+ #STDERR.puts "match[#{term}] #{scanner.rest[0..100].inspect} against #{regexp.inspect}" #if term == :STRING_LITERAL_SINGLE_QUOTE
279
+ if matched = scanner.scan(regexp)
280
+ matched = unescape(matched) if @unescape_terms.include?(term)
281
+ #STDERR.puts " unescape? #{@unescape_terms.include?(term).inspect}"
282
+ #STDERR.puts " matched #{term.inspect}: #{matched.inspect}"
283
+ return token(term, matched)
284
+ end
285
+ end
286
+ nil
287
+ end
288
+
289
+ protected
290
+
291
+ ##
292
+ # Constructs a new token object annotated with the current line number.
293
+ #
294
+ # The parser relies on the type being a symbolized URI and the value being
295
+ # a string, if there is no type. If there is a type, then the value takes
296
+ # on the native representation appropriate for that type.
297
+ #
298
+ # @param [Symbol] type
299
+ # @param [String] value
300
+ # Scanner instance with access to matched groups
301
+ # @return [Token]
302
+ def token(type, value)
303
+ Token.new(type, value, :lineno => lineno)
304
+ end
305
+
306
+ ##
307
+ # Represents a lexer token.
308
+ #
309
+ # @example Creating a new token
310
+ # token = RDF::LL1::Lexer::Token.new(:LANGTAG, "en")
311
+ # token.type #=> :LANGTAG
312
+ # token.value #=> "en"
313
+ #
314
+ # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
315
+ class Token
316
+ ##
317
+ # Initializes a new token instance.
318
+ #
319
+ # @param [Symbol] type
320
+ # @param [String] value
321
+ # @param [Hash{Symbol => Object}] options
322
+ # @option options [Integer] :lineno (nil)
323
+ def initialize(type, value, options = {})
324
+ @type, @value = (type ? type.to_s.to_sym : nil), value
325
+ @options = options.dup
326
+ @lineno = @options.delete(:lineno)
327
+ end
328
+
329
+ ##
330
+ # The token's symbol type.
331
+ #
332
+ # @return [Symbol]
333
+ attr_reader :type
334
+
335
+ ##
336
+ # The token's value.
337
+ #
338
+ # @return [String]
339
+ attr_reader :value
340
+
341
+ ##
342
+ # The line number where the token was encountered.
343
+ #
344
+ # @return [Integer]
345
+ attr_reader :lineno
346
+
347
+ ##
348
+ # Any additional options for the token.
349
+ #
350
+ # @return [Hash]
351
+ attr_reader :options
352
+
353
+ ##
354
+ # Returns the attribute named by `key`.
355
+ #
356
+ # @param [Symbol] key
357
+ # @return [Object]
358
+ def [](key)
359
+ key = key.to_s.to_sym unless key.is_a?(Integer) || key.is_a?(Symbol)
360
+ case key
361
+ when 0, :type then @type
362
+ when 1, :value then @value
363
+ else nil
364
+ end
365
+ end
366
+
367
+ ##
368
+ # Returns `true` if the given `value` matches either the type or value
369
+ # of this token.
370
+ #
371
+ # @example Matching using the symbolic type
372
+ # RDF::LL1::Lexer::Token.new(:NIL) === :NIL #=> true
373
+ #
374
+ # @example Matching using the string value
375
+ # RDF::LL1::Lexer::Token.new(nil, "{") === "{" #=> true
376
+ #
377
+ # @param [Symbol, String] value
378
+ # @return [Boolean]
379
+ def ===(value)
380
+ case value
381
+ when Symbol then value == @type
382
+ when ::String then value.to_s == @value.to_s
383
+ else value == @value
384
+ end
385
+ end
386
+
387
+ ##
388
+ # Returns a hash table representation of this token.
389
+ #
390
+ # @return [Hash]
391
+ def to_hash
392
+ {:type => @type, :value => @value}
393
+ end
394
+
395
+ ##
396
+ # Readable version of token
397
+ def to_s
398
+ @type ? @type.inspect : @value
399
+ end
400
+
401
+ ##
402
+ # Returns type, if not nil, otherwise value
403
+ def representation
404
+ @type ? @type : @value
405
+ end
406
+
407
+ ##
408
+ # Returns an array representation of this token.
409
+ #
410
+ # @return [Array]
411
+ def to_a
412
+ [@type, @value]
413
+ end
414
+
415
+ ##
416
+ # Returns a developer-friendly representation of this token.
417
+ #
418
+ # @return [String]
419
+ def inspect
420
+ to_hash.inspect
421
+ end
422
+ end # class Token
423
+
424
+ ##
425
+ # Raised for errors during lexical analysis.
426
+ #
427
+ # @example Raising a lexer error
428
+ # raise RDF::LL1::Lexer::Error.new(
429
+ # "invalid token '%' on line 10",
430
+ # :input => query, :token => '%', :lineno => 9)
431
+ #
432
+ # @see http://ruby-doc.org/core/classes/StandardError.html
433
+ class Error < StandardError
434
+ ##
435
+ # The input string associated with the error.
436
+ #
437
+ # @return [String]
438
+ attr_reader :input
439
+
440
+ ##
441
+ # The invalid token which triggered the error.
442
+ #
443
+ # @return [String]
444
+ attr_reader :token
445
+
446
+ ##
447
+ # The line number where the error occurred.
448
+ #
449
+ # @return [Integer]
450
+ attr_reader :lineno
451
+
452
+ ##
453
+ # Initializes a new lexer error instance.
454
+ #
455
+ # @param [String, #to_s] message
456
+ # @param [Hash{Symbol => Object}] options
457
+ # @option options [String] :input (nil)
458
+ # @option options [String] :token (nil)
459
+ # @option options [Integer] :lineno (nil)
460
+ def initialize(message, options = {})
461
+ @input = options[:input]
462
+ @token = options[:token]
463
+ @lineno = options[:lineno]
464
+ super(message.to_s)
465
+ end
466
+ end # class Error
467
+
468
+ unless "".respond_to?(:force_encoding)
469
+ # Compatibility with 1.9 Encoding
470
+ module Encoding
471
+ class CompatibilityError < StandardError; end
472
+ end
473
+ end
474
+ end # class Lexer
475
+ end # module RDF::LL1