gammo 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,513 @@
1
+ class Gammo::XPath::Parser
2
+
3
+ token T_SLASH
4
+ T_SLASHSLASH
5
+ T_PIPE
6
+ T_PLUS
7
+ T_MINUS
8
+ T_EQ
9
+ T_NEQ
10
+ T_LT
11
+ T_GT
12
+ T_LTE
13
+ T_GTE
14
+ T_AND
15
+ T_OR
16
+ T_DIV
17
+ T_MOD
18
+ T_MUL
19
+ T_LPAREN
20
+ T_RPAREN
21
+ T_LBRACK
22
+ T_RBRACK
23
+ T_DOT
24
+ T_DOTDOT
25
+ T_AT
26
+ T_COMMA
27
+ T_COLONCOLON
28
+ T_NC_NAME
29
+ T_Q_NAME
30
+ T_FUNCTION_NAME
31
+ T_NAME_TEST
32
+ T_NODE_TYPE
33
+ T_AXIS_NAME
34
+ T_VARIABLE_REFERENCE
35
+ T_LITERAL
36
+ T_NUMBER
37
+
38
+ start expr
39
+
40
+ rule
41
location_path:
    # A relative path is flagged as not absolute.
    relative_location_path {
      val[0].absolute = false
      result = val[0]
    }
    # An absolute path is flagged as absolute.
  | absolute_location_path {
      val[0].absolute = true
      result = val[0]
    }
50
+
51
absolute_location_path:
    # A lone "/" yields an empty location path.
    T_SLASH { result = AST::LocationPath.new }
    # "/" followed by a relative path reuses that path object.
  | T_SLASH relative_location_path {
      _slash, path = val
      result = path
    }
    # A leading "//" puts the descendant-or-self step in front of the path.
  | descendant_or_self relative_location_path {
      leading_step, path = val
      path.insert_first_step(leading_step)
      result = path
    }
58
+
59
relative_location_path:
    # A single step starts a new location path.
    step {
      result = AST::LocationPath.new
      result.append_step(val[0])
    }
    # "path/step" appends the step to the path built so far.
  | relative_location_path T_SLASH step {
      result = val[0]
      result.append_step(val[2])
    }
    # "path//step": the separator is parsed through descendant_or_self so that
    # val[1] is the descendant-or-self axis step built by that rule. Matching
    # T_SLASHSLASH directly would append the raw "//" token string as a step.
  | relative_location_path descendant_or_self step {
      result = val[0]
      result.append_step(val[1])
      result.append_step(val[2])
    }
73
+
74
step:
    # Without an axis specifier the step uses the child axis.
    node_test optional_predicates {
      test, predicates = val
      result = AST::Axis::Child.new(node_test: test, predicates: predicates)
    }
    # axis_specifier yields either an axis name (String) or an axis class;
    # a name is resolved through AST::Axis.fetch with "-" replaced by "_".
  | axis_specifier node_test optional_predicates {
      axis, test, predicates = val
      axis = AST::Axis.fetch(axis.tr('-', '_')) if axis.instance_of?(String)
      result = axis.new(node_test: test, predicates: predicates)
    }
  | abbreviated_step
84
+
85
axis_specifier:
    # "name::" yields the axis name token value unchanged.
    T_AXIS_NAME T_COLONCOLON { result = val[0] }
    # "@" yields the attribute axis class.
  | T_AT { result = AST::Axis::Attribute }
87
+
88
node_test:
    # A name test is split into local name and namespace by expand_qname.
    T_NAME_TEST {
      local_name, namespace_uri = expand_qname(val[0])
      result = AST::NodeTest::Name.new(local: local_name, namespace: namespace_uri)
    }
    # "type()" instantiates the node test class registered under that type name.
  | T_NODE_TYPE T_LPAREN T_RPAREN {
      node_test_class = AST::NodeTest.fetch(val[0])
      result = node_test_class.new
    }
96
+
97
descendant_or_self:
    # "//" becomes a descendant-or-self axis step with an "any" node test.
    T_SLASHSLASH {
      any_node = AST::NodeTest::Any.new
      result = AST::Axis::DescendantOrSelf.new(node_test: any_node)
    }
101
+
102
+ # Since the grammar has no way to express repeated expressions,
103
+ # a dedicated rule is needed to handle that case recursively.
104
+ # TODO(kunpei): need test
105
repeatable_predicates:
    # The first predicate starts the list.
    predicate { result = [AST::Predicate.new(val[0])] }
    # Every further predicate is wrapped in AST::Predicate as well, so the
    # list holds one kind of element. Previously only the first was wrapped
    # and later ones were appended as bare expressions.
  | repeatable_predicates predicate {
      result = val[0]
      result << AST::Predicate.new(val[1])
    }
111
+
112
optional_predicates:
    /* no predicates: the default action leaves the result nil */
  | repeatable_predicates
114
+
115
predicate:
    # The brackets are dropped; only the enclosed expression is kept.
    T_LBRACK predicate_expr T_RBRACK {
      _open, inner, _close = val
      result = inner
    }

predicate_expr:
    expr { result = val[0] }
117
+
118
abbreviated_step:
    # "." is a self axis step with an "any" node test.
    T_DOT {
      any_node = AST::NodeTest::Any.new
      result = AST::Axis::Self.new(node_test: any_node)
    }
    # ".." is a parent axis step with an "any" node test.
  | T_DOTDOT {
      any_node = AST::NodeTest::Any.new
      result = AST::Axis::Parent.new(node_test: any_node)
    }
121
+
122
# An expression is whatever or_expr produced.
expr:
    or_expr { result = val[0] }
123
+
124
primary_expr:
    T_VARIABLE_REFERENCE { result = AST::Value::VariableReference.new(val[0]) }
    # Parentheses only group; the inner expression is passed through.
  | T_LPAREN expr T_RPAREN {
      _open, inner, _close = val
      result = inner
    }
    # NOTE(review): the token value is passed as scanned by the tokenizer;
    # confirm whether it still carries its surrounding quotes at this point.
  | T_LITERAL { result = AST::Value::String.new(val[0].to_s) }
    # Text containing "." becomes a Float, anything else an Integer.
  | T_NUMBER {
      text = val[0]
      numeric = text.include?('.') ? text.to_f : text.to_i
      result = AST::Value::Number.new(numeric)
    }
  | function_call
130
+
131
function_call:
    # With arguments: the collected argument list is splatted into the constructor.
    T_FUNCTION_NAME T_LPAREN arguments T_RPAREN {
      function_class = AST::Function.fetch(val[0])
      result = function_class.new(*val[2])
    }
    # Without arguments.
  | T_FUNCTION_NAME T_LPAREN T_RPAREN {
      function_class = AST::Function.fetch(val[0])
      result = function_class.new
    }
138
+
139
# A function argument is a plain expression.
argument:
    expr { result = val[0] }
140
+
141
+ # Since the grammar has no way to express repeated expressions,
142
+ # a dedicated rule is needed to handle that case recursively.
143
+ # TODO(kunpei): need test
144
arguments:
    # The first argument starts the list.
    argument { result = [val[0]] }
    # Each further argument is pushed onto the same array.
  | arguments T_COMMA argument {
      collected, _comma, latest = val
      result = collected.push(latest)
    }
153
+
154
union_expr:
    path_expr
    # "a | b" combines both operands into a union node.
  | union_expr T_PIPE path_expr {
      lhs, _pipe, rhs = val
      result = AST::UnionExpr.new(lhs, rhs)
    }
159
+
160
path_expr:
    location_path
  | filter_expr
    # "filter/path": the trailing path is flagged absolute and paired with the filter.
  | filter_expr T_SLASH relative_location_path {
      filter, _slash, path = val
      path.absolute = true
      result = AST::Path.new(filter, path)
    }
    # "filter//path": the descendant-or-self step is put in front of the path first.
  | filter_expr descendant_or_self relative_location_path {
      filter, leading_step, path = val
      path.insert_first_step(leading_step)
      path.absolute = true
      result = AST::Path.new(filter, path)
    }
172
+
173
filter_expr:
    primary_expr
    # A primary expression followed by predicates becomes a filter node.
  | primary_expr repeatable_predicates {
      primary, predicates = val
      result = AST::Filter.new(primary, predicates: predicates)
    }
178
+
179
or_expr:
    and_expr
    # Left-recursive, so chained "or" groups to the left.
  | or_expr T_OR and_expr {
      lhs, _op, rhs = val
      result = AST::OrExpr.new(a: lhs, b: rhs)
    }
182
+
183
and_expr:
    equality_expr
    # Left-recursive, so chained "and" groups to the left.
  | and_expr T_AND equality_expr {
      lhs, _op, rhs = val
      result = AST::AndExpr.new(a: lhs, b: rhs)
    }
186
+
187
equality_expr:
    relational_expr
    # "=" comparison.
  | equality_expr T_EQ relational_expr {
      lhs, _op, rhs = val
      result = AST::EqExpr.new(lhs, rhs)
    }
    # "!=" comparison.
  | equality_expr T_NEQ relational_expr {
      lhs, _op, rhs = val
      result = AST::NeqExpr.new(lhs, rhs)
    }
191
+
192
relational_expr:
    additive_expr
    # "<"
  | relational_expr T_LT additive_expr {
      lhs, _op, rhs = val
      result = AST::LtExpr.new(lhs, rhs)
    }
    # ">"
  | relational_expr T_GT additive_expr {
      lhs, _op, rhs = val
      result = AST::GtExpr.new(lhs, rhs)
    }
    # "<="
  | relational_expr T_LTE additive_expr {
      lhs, _op, rhs = val
      result = AST::LteExpr.new(lhs, rhs)
    }
    # ">="
  | relational_expr T_GTE additive_expr {
      lhs, _op, rhs = val
      result = AST::GteExpr.new(lhs, rhs)
    }
198
+
199
additive_expr:
    multiplicative_expr
    # "+"
  | additive_expr T_PLUS multiplicative_expr {
      lhs, _op, rhs = val
      result = AST::PlusExpr.new(lhs, rhs)
    }
    # "-"
  | additive_expr T_MINUS multiplicative_expr {
      lhs, _op, rhs = val
      result = AST::MinusExpr.new(lhs, rhs)
    }
207
+
208
multiplicative_expr:
    unary_expr
    # "*"
  | multiplicative_expr T_MUL unary_expr {
      lhs, _op, rhs = val
      result = AST::MultiplyExpr.new(lhs, rhs)
    }
    # "div"
  | multiplicative_expr T_DIV unary_expr {
      lhs, _op, rhs = val
      result = AST::DividedExpr.new(lhs, rhs)
    }
    # "mod"
  | multiplicative_expr T_MOD unary_expr {
      lhs, _op, rhs = val
      result = AST::ModuloExpr.new(lhs, rhs)
    }
219
+
220
unary_expr:
    union_expr
    # Right-recursive, so "- - x" nests one Negative inside another.
  | T_MINUS unary_expr {
      _minus, operand = val
      result = AST::Negative.new(operand)
    }
225
+ end
226
+
227
+ ---- inner
228
+
229
+ # 2.2 Characters (Extensible Markup Language (XML) 1.0 (Fifth Edition))
230
+ #
231
+ # This represents "Char" range defined in 2.2 Characters.
232
+ # [2] Char ::=
233
+ # #x9 | #xA | #xD | [#x20-#xD7FF] |
234
+ # [#xE000-#xFFFD] |
235
+ # [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
236
+ #
237
+ # @see https://www.w3.org/TR/xml11/#charsets
238
+ CHAR = /[\x9\xA\xD\u{20}-\u{d7ff}\u{e000}-\u{fffd}\u{10000}-\u{10ffff}]/
239
+
240
+ # 2.3 Common Syntactic Constructs (Extensible Markup Language (XML) 1.0 (Fifth Edition))
241
+ #
242
+ # [3] S ::= (#x20 | #x9 | #xD | #xA)+
243
+ #
244
+ # @see https://www.w3.org/TR/xml11/#NT-S
245
+ S = /[\x20\x9\xD\xA]/
246
+
247
+ # [4] NameStartChar ::=
248
+ # ":" |
249
+ # [A-Z] |
250
+ # "_" |
251
+ # [a-z] |
252
+ # [#xC0-#xD6] |
253
+ # [#xD8-#xF6] |
254
+ # [#xF8-#x2FF] |
255
+ # [#x370-#x37D] |
256
+ # [#x37F-#x1FFF] |
257
+ # [#x200C-#x200D] |
258
+ # [#x2070-#x218F] |
259
+ # [#x2C00-#x2FEF] |
260
+ # [#x3001-#xD7FF] |
261
+ # [#xF900-#xFDCF] |
262
+ # [#xFDF0-#xFFFD] |
263
+ # [#x10000-#xEFFFF]
264
+ #
265
+ # @see https://www.w3.org/TR/xml11/#NT-NameStartChar
266
+ name_start_chars = %w[
267
+ :
268
+ a-zA-Z_
269
+ \\u00c0-\\u00d6
270
+ \\u00d8-\\u00f6
271
+ \\u00f8-\\u02ff
272
+ \\u0370-\\u037d
273
+ \\u037f-\\u1fff
274
+ \\u200c-\\u200d
275
+ \\u2070-\\u218f
276
+ \\u2c00-\\u2fef
277
+ \\u3001-\\ud7ff
278
+ \\uf900-\\ufdcf
279
+ \\ufdf0-\\ufffd
280
+ \\u{10000}-\\u{effff}
281
+ ]
282
+ NAME_START_CHARS = /[#{name_start_chars.join}]/
283
+
284
+ # [4a] NameChar ::=
285
+ # NameStartChar |
286
+ # "-" |
287
+ # "." |
288
+ # [0-9] |
289
+ # #xB7 |
290
+ # [#x0300-#x036F] |
291
+ # [#x203F-#x2040]
292
+ #
293
+ # @see https://www.w3.org/TR/xml11/#NT-NameChar
294
+ name_chars = name_start_chars + %w[
295
+ \\-
296
+ \\.
297
+ 0-9
298
+ \\u00b7
299
+ \\u0300-\\u036f
300
+ \\u203f-\\u2040
301
+ ]
302
+ NAME_CHARS = /[#{name_chars.join}]/
303
+
304
+ # [5] Name ::= NameStartChar (NameChar)*
305
+ #
306
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-Name
307
+ NAME = /#{NAME_START_CHARS}#{NAME_CHARS}*/
308
+
309
+ # 2.3. Axes
310
+ #
311
+ # [6] AxisName ::=
312
+ # 'ancestor'
313
+ # | 'ancestor-or-self'
314
+ # | 'attribute'
315
+ # | 'child'
316
+ # | 'descendant'
317
+ # | 'descendant-or-self'
318
+ # | 'following'
319
+ # | 'following-sibling'
320
+ # | 'namespace'
321
+ # | 'parent'
322
+ # | 'preceding'
323
+ # | 'preceding-sibling'
324
+ # | 'self'
325
+ #
326
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-AxisName
327
+ AXES = /
328
+ ancestor-or-self|
329
+ ancestor|
330
+ attribute|
331
+ child|
332
+ descendant-or-self|
333
+ descendant|
334
+ following-sibling|
335
+ following|
336
+ namespace|
337
+ parent|
338
+ preceding-sibling|
339
+ preceding|
340
+ self
341
+ /x
342
+
343
+ # 3 Declaring Namespaces
344
+ #
345
+ # The "NCName" is picked from the section.
346
+ #
347
+ # Note that we need to take care of exceptional handling.
348
+ #
349
+ # [4] NCName ::= NCNameStartChar NCNameChar* /* An XML Name, minus the ":" */
350
+ # [5] NCNameChar ::= NameChar - ':'
351
+ # [6] NCNameStartChar ::= NameStartChar - ':'
352
+ #
353
+ # @see https://www.w3.org/TR/xml-names11/#ns-decl
354
+ NC_NAME_CHARS = /[#{(name_chars - [':']).join}]/
355
+ NC_NAME_START_CHARS = /[#{(name_start_chars - [':']).join}]/
356
+ NC_NAME = /#{NC_NAME_START_CHARS}#{NC_NAME_CHARS}*/
357
+
358
+ # 4. Qualified Names
359
+ #
360
+ # The rules for "QName", "PrefixedName", "UnprefixedName", "Prefix" and
361
+ # "LocalPart" are picked from the section.
362
+ #
363
+ # [7] QName ::= PrefixedName | UnprefixedName
364
+ # [8] PrefixedName ::= Prefix ':' LocalPart
365
+ # [9] UnprefixedName ::= LocalPart
366
+ # [10] Prefix ::= NCName
367
+ # [11] LocalPart ::= NCName
368
+ #
369
+ # @see https://www.w3.org/TR/xml-names11/#ns-qualnames
370
+ PREFIX = NC_NAME
371
+ LOCAL_PART = NC_NAME
372
+ PREFIXED_NAME = /#{PREFIX}:#{LOCAL_PART}/
373
+ UNPREFIXED_NAME = LOCAL_PART
374
+ Q_NAME = /#{PREFIXED_NAME}|#{UNPREFIXED_NAME}/
375
+
376
+ # 3.7 Lexical Structure
377
+ #
378
+ # The rules for "NodeType" and "Digits" are picked from the section.
379
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
380
+ DIGITS = /[0-9]+/
381
+ NODE_TYPE = /comment|text|processing-instruction|node/
382
+
383
+ # EXPR_TOKENS is defined for tokenizing primitive tokens for "ExprToken",
384
+ # except other rules.
385
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-ExprToken
386
+ EXPR_TOKENS = {
387
+ '(' => :T_LPAREN,
388
+ ')' => :T_RPAREN,
389
+ '[' => :T_LBRACK,
390
+ ']' => :T_RBRACK,
391
+ '.' => :T_DOT,
392
+ '..' => :T_DOTDOT,
393
+ '@' => :T_AT,
394
+ ',' => :T_COMMA,
395
+ '::' => :T_COLONCOLON
396
+ }.freeze
397
+ # Declaring the regexp consisting of EXPR_TOKENS keys to keep the token order.
398
+ EXPRS = /\(|\)|\[|\]|@|,|::|\.\.|\./
399
+
400
+ # OPERATOR_TOKENS is defined for tokenizing primitive tokens for "Operator"
401
+ # and "OperatorName" except other rules.
402
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-Operator
403
+ OPERATOR_TOKENS = {
404
+ 'and' => :T_AND,
405
+ 'or' => :T_OR,
406
+ 'mod' => :T_MOD,
407
+ 'div' => :T_DIV,
408
+ '/' => :T_SLASH,
409
+ '//' => :T_SLASHSLASH,
410
+ "|" => :T_PIPE,
411
+ '+' => :T_PLUS,
412
+ '-' => :T_MINUS,
413
+ '=' => :T_EQ,
414
+ '!=' => :T_NEQ,
415
+ '<' => :T_LT,
416
+ '>' => :T_GT,
417
+ '<=' => :T_LTE,
418
+ '>=' => :T_GTE
419
+ }.freeze
420
+ # Declaring the regexp consisting of OPERATOR_TOKENS keys to keep the token order.
421
+ OPERATORS = /and|or|mod|div|\/\/|\/|\||\+|-|\=|!=|<=|>=|<|>/
422
+
423
+ require 'strscan'
424
+ require 'forwardable'
425
+ require 'gammo/xpath/errors'
426
+ require 'gammo/xpath/ast/axis'
427
+ require 'gammo/xpath/ast/expression'
428
+ require 'gammo/xpath/ast/function'
429
+ require 'gammo/xpath/ast/node_test'
430
+ require 'gammo/xpath/ast/path'
431
+ require 'gammo/xpath/ast/value'
432
+
433
+ extend Forwardable
434
+ def_delegators :@scanner, :scan, :eos?
435
+
436
# Builds a parser for the given XPath expression.
#
# @param input [String] the expression text; it is kept as-is and also
#   wrapped in a StringScanner used by the tokenizer.
def initialize(input)
  super()
  @input = input
  @scanner = StringScanner.new(@input)
  # NOTE(review): debug flag is switched on unconditionally — confirm intended.
  @yydebug = true
end
442
+
443
# Tokenizes the input into @query as [symbol, value] pairs, then runs the
# generated parser over that queue.
#
# @return whatever do_parse returns.
def parse
  @query = []
  advance do |symbol, val|
    @query.push([symbol, val])
  end
  do_parse
end
448
+
449
# Removes and returns the oldest queued token.
#
# @return [Array, nil] the next [symbol, value] pair, or nil once the queue
#   is empty.
def next_token
  @query.delete_at(0)
end
452
+
453
# Resolves a namespace prefix to its URI. Only the "xml" prefix is known.
#
# @param prefix [String] the namespace prefix
# @return [String, nil] the namespace URI, or nil for any other prefix.
def lookup_namespace_uri(prefix)
  return 'http://www.w3.org/XML/1998/namespace' if prefix == 'xml'

  nil
end
456
+
457
# Splits a qualified name into its local part and namespace URI.
#
# The name is split at the first ":". The colon itself belongs to neither
# part: the previous slices (0..colon and colon..-1) kept it in both, so the
# prefix lookup received e.g. "xml:" and could never succeed.
#
# @param qname [String] a prefixed ("xml:lang") or unprefixed ("div") name
# @return [Array(String, String)] local part and namespace URI; the URI is
#   nil for an unprefixed name.
# @raise [ParseError] when the prefix has no known namespace URI.
def expand_qname(qname)
  prefix, separator, local = qname.partition(':')
  return [qname, nil] if separator.empty?

  namespace_uri = lookup_namespace_uri(prefix)
  fail ParseError, 'invalid qname: %s' % qname unless namespace_uri
  [local, namespace_uri]
end
463
+
464
# Records the token kind as the most recently emitted one, then hands the
# token to the block.
#
# @param symbol [Symbol] token kind
# @param val [String] token text
# @return whatever the block returns.
def token(symbol, val, &block)
  @prev_token = symbol
  block.(symbol, val)
end
468
+
469
# Looks up the token symbol registered for a scanned piece of text and
# yields it.
#
# @param key [String] the scanned text
# @param constraints [Hash{String => Symbol}] text-to-symbol table
# @yield [symbol] the symbol found for key
# @return whatever the block returns.
# @raise [ParseError] when key is not in the table. The message names the
#   offending key; it previously interpolated the lookup result, which is
#   always nil on this path.
def fetch(key, constraints)
  symbol = constraints[key]
  unless symbol
    fail ParseError, "unexpected token: #{key}, want = #{constraints.keys}"
  end
  yield symbol
end
475
+
476
# Tokenizes the whole scanner input, emitting every token through #token
# as a (symbol, value) pair. Whitespace between tokens is skipped.
#
# Differences from the previous implementation:
# * Names are scanned with Q_NAME only. The old pattern tried NC_NAME before
#   Q_NAME, and because regexp alternation takes the first match, a prefixed
#   name such as "xml:lang" was cut off at the colon.
# * "div" and "*" are read as operators whenever #operator_expected? says so,
#   not only directly after a number, so "a div b" and "a * 2" tokenize as
#   operators.
# * Interpolated regexps are built once before the loop.
#
# NOTE(review): "and", "or" and "mod" are still always emitted as operators,
# and operator/axis/node-type words are matched without a word boundary, so
# names that merely start with them are split. Left unchanged here.
#
# @yield [symbol, val] each token in input order
# @return [nil]
# @raise [ParseError] when the remaining input matches no token pattern.
def advance(&block)
  @prev_token = nil
  whitespace = /#{S}+/
  name_or_wildcard = /\*|#{Q_NAME}/
  literal_pattern = /"[^"]*"|'[^']*'/
  number_pattern = /#{DIGITS}(\.(#{DIGITS})?)?/
  reference_pattern = /\$#{Q_NAME}/
  until eos?
    case
    # Skip whitespace everywhere.
    when scan(whitespace) then next
    when expr = scan(EXPRS)
      fetch(expr, EXPR_TOKENS) { |symbol| token(symbol, expr, &block) }
    when operator = scan(OPERATORS)
      fetch(operator, OPERATOR_TOKENS) do |symbol|
        # "div" is both an operator name and a possible element name.
        symbol = :T_NAME_TEST if symbol == :T_DIV && !operator_expected?
        token(symbol, operator, &block)
      end
    when axis = scan(AXES) then token(:T_AXIS_NAME, axis, &block)
    when node_type = scan(NODE_TYPE)
      # NOTE: processing-instruction is not supported by Gammo.
      token(:T_NODE_TYPE, node_type, &block)
    when name = scan(name_or_wildcard)
      if name == '*' && operator_expected?
        token(:T_MUL, name, &block)
      else
        # A name directly followed by "(" is a function name.
        # TODO: Stripping should be taken care by regexp.
        kind = @scanner.peek(1) == '(' ? :T_FUNCTION_NAME : :T_NAME_TEST
        token(kind, name.strip, &block)
      end
    when literal = scan(literal_pattern) then token(:T_LITERAL, literal, &block)
    when number = scan(number_pattern) then token(:T_NUMBER, number, &block)
    when ref = scan(reference_pattern) then token(:T_VARIABLE_REFERENCE, ref, &block)
    else
      fail ParseError, "unexpected token: #{@scanner.string[@scanner.pos..-1]}"
    end
  end
end

# Tells whether "*" or an operator name at the current position is an
# operator, following the XPath 1.0 lexical rule: it is one when there is a
# preceding token and that token is not "@", "::", "(", "[", "," or an operator.
#
# @return [Boolean]
def operator_expected?
  return false if @prev_token.nil?
  return false if %i[T_AT T_COLONCOLON T_LPAREN T_LBRACK T_COMMA T_MUL].include?(@prev_token)

  !OPERATOR_TOKENS.value?(@prev_token)
end
private :operator_expected?