gammo 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ class Gammo::XPath::Parser
2
+
3
+ token T_SLASH
4
+ T_SLASHSLASH
5
+ T_PIPE
6
+ T_PLUS
7
+ T_MINUS
8
+ T_EQ
9
+ T_NEQ
10
+ T_LT
11
+ T_GT
12
+ T_LTE
13
+ T_GTE
14
+ T_AND
15
+ T_OR
16
+ T_DIV
17
+ T_MOD
18
+ T_MUL
19
+ T_LPAREN
20
+ T_RPAREN
21
+ T_LBRACK
22
+ T_RBRACK
23
+ T_DOT
24
+ T_DOTDOT
25
+ T_AT
26
+ T_COMMA
27
+ T_COLONCOLON
28
+ T_NC_NAME
29
+ T_Q_NAME
30
+ T_FUNCTION_NAME
31
+ T_NAME_TEST
32
+ T_NODE_TYPE
33
+ T_AXIS_NAME
34
+ T_VARIABLE_REFERENCE
35
+ T_LITERAL
36
+ T_NUMBER
37
+
38
+ start expr
39
+
40
+ rule
41
+ location_path:
42
+ relative_location_path {
43
+ result = val[0]
44
+ result.absolute = false
45
+ }
46
+ | absolute_location_path {
47
+ result = val[0]
48
+ result.absolute = true
49
+ }
50
+
51
+ absolute_location_path:
52
+ T_SLASH { result = AST::LocationPath.new }
53
+ | T_SLASH relative_location_path { result = val[1] }
54
+ | descendant_or_self relative_location_path {
55
+ result = val[1]
56
+ result.insert_first_step(val[0])
57
+ }
58
+
59
# A relative location path is one or more steps separated by "/" or "//".
# "//" is the abbreviation of "/descendant-or-self::node()/", so it is parsed
# through the descendant_or_self rule, which yields a real axis step. The raw
# "//" token text must never be appended as a step.
relative_location_path:
  step {
    result = AST::LocationPath.new
    result.append_step(val[0])
  }
  | relative_location_path T_SLASH step {
    result = val[0]
    result.append_step(val[2])
  }
  | relative_location_path descendant_or_self step {
    result = val[0]
    result.append_step(val[1])
    result.append_step(val[2])
  }
73
+
74
+ step:
75
+ node_test optional_predicates {
76
+ result = AST::Axis::Child.new(node_test: val[0], predicates: val[1])
77
+ }
78
+ | axis_specifier node_test optional_predicates {
79
+ axis_base_class = val[0]
80
+ axis_base_class = AST::Axis.fetch(axis_base_class.gsub(/-/, '_')) if axis_base_class.instance_of?(String)
81
+ result = axis_base_class.new(node_test: val[1], predicates: val[2])
82
+ }
83
+ | abbreviated_step
84
+
85
+ axis_specifier:
86
+ T_AXIS_NAME T_COLONCOLON | T_AT { result = AST::Axis::Attribute }
87
+
88
+ node_test:
89
+ T_NAME_TEST {
90
+ local, namespace = expand_qname(val[0])
91
+ result = AST::NodeTest::Name.new(local: local, namespace: namespace)
92
+ }
93
+ | T_NODE_TYPE T_LPAREN T_RPAREN {
94
+ result = AST::NodeTest.fetch(val[0]).new
95
+ }
96
+
97
+ descendant_or_self:
98
+ T_SLASHSLASH {
99
+ result = AST::Axis::DescendantOrSelf.new(node_test: AST::NodeTest::Any.new)
100
+ }
101
+
102
+ # Racc has no syntax for repeating an expression, so a dedicated
103
+ # left-recursive rule is defined to collect repeated predicates.
104
+ # TODO(kunpei): need test
105
+ repeatable_predicates:
106
+ predicate { result = [AST::Predicate.new(val[0])] }
107
+ | repeatable_predicates predicate {
108
+ result = val[0]
109
+ result << val[1]
110
+ }
111
+
112
+ optional_predicates:
113
+ | repeatable_predicates { result = val[0] }
114
+
115
+ predicate: T_LBRACK predicate_expr T_RBRACK { result = val[1] }
116
+ predicate_expr: expr
117
+
118
+ abbreviated_step:
119
+ T_DOT { result = AST::Axis::Self.new(node_test: AST::NodeTest::Any.new) }
120
+ | T_DOTDOT { result = AST::Axis::Parent.new(node_test: AST::NodeTest::Any.new) }
121
+
122
+ expr: or_expr
123
+
124
+ primary_expr:
125
+ T_VARIABLE_REFERENCE { result = AST::Value::VariableReference.new(val[0]) }
126
+ | T_LPAREN expr T_RPAREN { result = val[1] }
127
+ | T_LITERAL { result = AST::Value::String.new(val[0].to_s) }
128
+ | T_NUMBER { result = AST::Value::Number.new(val[0].include?(?.) ? val[0].to_f : val[0].to_i) }
129
+ | function_call
130
+
131
+ function_call:
132
+ T_FUNCTION_NAME T_LPAREN arguments T_RPAREN {
133
+ result = AST::Function.fetch(val[0]).new(*val[2])
134
+ }
135
+ | T_FUNCTION_NAME T_LPAREN T_RPAREN {
136
+ result = AST::Function.fetch(val[0]).new
137
+ }
138
+
139
+ argument: expr
140
+
141
+ # Racc has no syntax for repeating an expression, so a dedicated
142
+ # left-recursive rule is defined to collect comma-separated arguments.
143
+ # TODO(kunpei): need test
144
+ arguments:
145
+ argument {
146
+ result = []
147
+ result << val[0]
148
+ }
149
+ | arguments T_COMMA argument {
150
+ result = val[0]
151
+ result << val[2]
152
+ }
153
+
154
+ union_expr:
155
+ path_expr
156
+ | union_expr T_PIPE path_expr {
157
+ result = AST::UnionExpr.new(val[0], val[2])
158
+ }
159
+
160
+ path_expr:
161
+ location_path
162
+ | filter_expr
163
+ | filter_expr T_SLASH relative_location_path {
164
+ val[2].absolute = true
165
+ result = AST::Path.new(val[0], val[2])
166
+ }
167
+ | filter_expr descendant_or_self relative_location_path {
168
+ val[2].insert_first_step(val[1])
169
+ val[2].absolute = true
170
+ result = AST::Path.new(val[0], val[2])
171
+ }
172
+
173
+ filter_expr:
174
+ primary_expr
175
+ | primary_expr repeatable_predicates {
176
+ result = AST::Filter.new(val[0], predicates: val[1])
177
+ }
178
+
179
+ or_expr:
180
+ and_expr
181
+ | or_expr T_OR and_expr { result = AST::OrExpr.new(a: val[0], b: val[2]) }
182
+
183
+ and_expr:
184
+ equality_expr
185
+ | and_expr T_AND equality_expr { result = AST::AndExpr.new(a: val[0], b: val[2]) }
186
+
187
+ equality_expr:
188
+ relational_expr
189
+ | equality_expr T_EQ relational_expr { result = AST::EqExpr.new(val[0], val[2]) }
190
+ | equality_expr T_NEQ relational_expr { result = AST::NeqExpr.new(val[0], val[2]) }
191
+
192
+ relational_expr:
193
+ additive_expr
194
+ | relational_expr T_LT additive_expr { result = AST::LtExpr.new(val[0], val[2]) }
195
+ | relational_expr T_GT additive_expr { result = AST::GtExpr.new(val[0], val[2]) }
196
+ | relational_expr T_LTE additive_expr { result = AST::LteExpr.new(val[0], val[2]) }
197
+ | relational_expr T_GTE additive_expr { result = AST::GteExpr.new(val[0], val[2]) }
198
+
199
+ additive_expr:
200
+ multiplicative_expr
201
+ | additive_expr T_PLUS multiplicative_expr {
202
+ result = AST::PlusExpr.new(val[0], val[2])
203
+ }
204
+ | additive_expr T_MINUS multiplicative_expr {
205
+ result = AST::MinusExpr.new(val[0], val[2])
206
+ }
207
+
208
+ multiplicative_expr:
209
+ unary_expr
210
+ | multiplicative_expr T_MUL unary_expr {
211
+ result = AST::MultiplyExpr.new(val[0], val[2])
212
+ }
213
+ | multiplicative_expr T_DIV unary_expr {
214
+ result = AST::DividedExpr.new(val[0], val[2])
215
+ }
216
+ | multiplicative_expr T_MOD unary_expr {
217
+ result = AST::ModuloExpr.new(val[0], val[2])
218
+ }
219
+
220
+ unary_expr:
221
+ union_expr
222
+ | T_MINUS unary_expr {
223
+ result = AST::Negative.new(val[1])
224
+ }
225
+ end
226
+
227
+ ---- inner
228
+
229
+ # 2.2 Characters (Extensible Markup Language (XML) 1.0 (Fifth Edition))
230
+ #
231
+ # This represents "Char" range defined in 2.2 Characters.
232
+ # [2] Char ::=
233
+ # [#x1-#xD7FF] |
234
+ # [#xE000-#xFFFD] |
235
+ # [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
236
+ #
237
+ # @see https://www.w3.org/TR/xml11/#charsets
238
+ CHAR = /[\x9\xA\xD\u{20}-\u{d7ff}\u{e000}-\u{fffd}\u{10000}-\u{10ffff}]/
239
+
240
+ # 2.3 Common Syntactic Constructs (Extensible Markup Language (XML) 1.0 (Fifth Edition))
241
+ #
242
+ # [3] S ::= (#x20 | #x9 | #xD | #xA)+
243
+ #
244
+ # @see https://www.w3.org/TR/xml11/#NT-S
245
+ S = /[\x20\x9\xD\xA]/
246
+
247
+ # [4] NameStartChar ::=
248
+ # ":" |
249
+ # [A-Z] |
250
+ # "_" |
251
+ # [a-z] |
252
+ # [#xC0-#xD6] |
253
+ # [#xD8-#xF6] |
254
+ # [#xF8-#x2FF] |
255
+ # [#x370-#x37D] |
256
+ # [#x37F-#x1FFF] |
257
+ # [#x200C-#x200D] |
258
+ # [#x2070-#x218F] |
259
+ # [#x2C00-#x2FEF] |
260
+ # [#x3001-#xD7FF] |
261
+ # [#xF900-#xFDCF] |
262
+ # [#xFDF0-#xFFFD] |
263
+ # [#x10000-#xEFFFF]
264
+ #
265
+ # @see https://www.w3.org/TR/xml11/#NT-NameStartChar
266
+ name_start_chars = %w[
267
+ :
268
+ a-zA-Z_
269
+ \\u00c0-\\u00d6
270
+ \\u00d8-\\u00f6
271
+ \\u00f8-\\u02ff
272
+ \\u0370-\\u037d
273
+ \\u037f-\\u1fff
274
+ \\u200c-\\u200d
275
+ \\u2070-\\u218f
276
+ \\u2c00-\\u2fef
277
+ \\u3001-\\ud7ff
278
+ \\uf900-\\ufdcf
279
+ \\ufdf0-\\ufffd
280
+ \\u{10000}-\\u{effff}
281
+ ]
282
+ NAME_START_CHARS = /[#{name_start_chars.join}]/
283
+
284
+ # [4a] NameChar ::=
285
+ # NameStartChar |
286
+ # "-" |
287
+ # "." |
288
+ # [0-9] |
289
+ # #xB7 |
290
+ # [#x0300-#x036F] |
291
+ # [#x203F-#x2040]
292
+ #
293
+ # @see https://www.w3.org/TR/xml11/#NT-NameChar
294
+ name_chars = name_start_chars + %w[
295
+ \\-
296
+ \\.
297
+ 0-9
298
+ \\u00b7
299
+ \\u0300-\\u036f
300
+ \\u203f-\\u2040
301
+ ]
302
+ NAME_CHARS = /[#{name_chars.join}]/
303
+
304
+ # [5] Name ::= NameStartChar (NameChar)*
305
+ #
306
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-Name
307
+ NAME = /#{NAME_START_CHARS}#{NAME_CHARS}*/
308
+
309
+ # 2.3. Axes
310
+ #
311
+ # [6] AxisName ::=
312
+ # 'ancestor'
313
+ # | 'ancestor-or-self'
314
+ # | 'attribute'
315
+ # | 'child'
316
+ # | 'descendant'
317
+ # | 'descendant-or-self'
318
+ # | 'following'
319
+ # | 'following-sibling'
320
+ # | 'namespace'
321
+ # | 'parent'
322
+ # | 'preceding'
323
+ # | 'preceding-sibling'
324
+ # | 'self'
325
+ #
326
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-AxisName
327
+ AXES = /
328
+ ancestor-or-self|
329
+ ancestor|
330
+ attribute|
331
+ child|
332
+ descendant-or-self|
333
+ descendant|
334
+ following-sibling|
335
+ following|
336
+ namespace|
337
+ parent|
338
+ preceding-sibling|
339
+ preceding|
340
+ self
341
+ /x
342
+
343
+ # 3 Declaring Namespaces
344
+ #
345
+ # The "NCName" is picked from the section.
346
+ #
347
+ # Note that we need to take care of exceptional handling.
348
+ #
349
+ # [4] NCName ::= NCNameStartChar NCNameChar* /* An XML Name, minus the ":" */
350
+ # [5] NCNameChar ::= NameChar - ':'
351
+ # [6] NCNameStartChar ::= NameStartChar - ':'
352
+ #
353
+ # @see https://www.w3.org/TR/xml-names11/#ns-decl
354
+ NC_NAME_CHARS = /[#{(name_chars - [':']).join}]/
355
+ NC_NAME_START_CHARS = /[#{(name_start_chars - [':']).join}]/
356
+ NC_NAME = /#{NC_NAME_START_CHARS}#{NC_NAME_CHARS}*/
357
+
358
+ # 4. Qualified Names
359
+ #
360
+ # The rules for "QName", "PrefixedName", "UnprefixedName", "Prefix" and
361
+ # "LocalPart" are picked from the section.
362
+ #
363
+ # [7] QName ::= PrefixedName | UnprefixedName
364
+ # [8] PrefixedName ::= Prefix ':' LocalPart
365
+ # [9] UnprefixedName ::= LocalPart
366
+ # [10] Prefix ::= NCName
367
+ # [11] LocalPart ::= NCName
368
+ #
369
+ # @see https://www.w3.org/TR/xml-names11/#ns-qualnames
370
+ PREFIX = NC_NAME
371
+ LOCAL_PART = NC_NAME
372
+ PREFIXED_NAME = /#{PREFIX}:#{LOCAL_PART}/
373
+ UNPREFIXED_NAME = LOCAL_PART
374
+ Q_NAME = /#{PREFIXED_NAME}|#{UNPREFIXED_NAME}/
375
+
376
+ # 3.7 Lexical Structure
377
+ #
378
+ # The rules for "NodeType" and "Digits" are picked from the section.
379
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
380
+ DIGITS = /[0-9]+/
381
+ NODE_TYPE = /comment|text|processing-instruction|node/
382
+
383
+ # EXPR_TOKENS is defined for tokenizing primitive tokens for "ExprToken",
384
+ # except other rules.
385
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-ExprToken
386
+ EXPR_TOKENS = {
387
+ '(' => :T_LPAREN,
388
+ ')' => :T_RPAREN,
389
+ '[' => :T_LBRACK,
390
+ ']' => :T_RBRACK,
391
+ '.' => :T_DOT,
392
+ '..' => :T_DOTDOT,
393
+ '@' => :T_AT,
394
+ ',' => :T_COMMA,
395
+ '::' => :T_COLONCOLON
396
+ }.freeze
397
+ # Declaring the regexp consisting of EXPR_TOKENS keys to keep the token order.
398
+ EXPRS = /\(|\)|\[|\]|@|,|::|\.\.|\./
399
+
400
+ # OPERATOR_TOKENS is defined for tokenizing primitive tokens for "Operator"
401
+ # and "OperatorName" except other rules.
402
+ # @see https://www.w3.org/TR/1999/REC-xpath-19991116/#NT-Operator
403
+ OPERATOR_TOKENS = {
404
+ 'and' => :T_AND,
405
+ 'or' => :T_OR,
406
+ 'mod' => :T_MOD,
407
+ 'div' => :T_DIV,
408
+ '/' => :T_SLASH,
409
+ '//' => :T_SLASHSLASH,
410
+ "|" => :T_PIPE,
411
+ '+' => :T_PLUS,
412
+ '-' => :T_MINUS,
413
+ '=' => :T_EQ,
414
+ '!=' => :T_NEQ,
415
+ '<' => :T_LT,
416
+ '>' => :T_GT,
417
+ '<=' => :T_LTE,
418
+ '>=' => :T_GTE
419
+ }.freeze
420
+ # Declaring the regexp consisting of OPERATOR_TOKENS keys to keep the token order.
421
+ OPERATORS = /and|or|mod|div|\/\/|\/|\||\+|-|\=|!=|<=|>=|<|>/
422
+
423
+ require 'strscan'
424
+ require 'forwardable'
425
+ require 'gammo/xpath/errors'
426
+ require 'gammo/xpath/ast/axis'
427
+ require 'gammo/xpath/ast/expression'
428
+ require 'gammo/xpath/ast/function'
429
+ require 'gammo/xpath/ast/node_test'
430
+ require 'gammo/xpath/ast/path'
431
+ require 'gammo/xpath/ast/value'
432
+
433
+ extend Forwardable
434
+ def_delegators :@scanner, :scan, :eos?
435
+
436
# Builds a parser for a single XPath expression.
#
# @param input [String] the XPath expression; it is kept as-is in @input and
#   wrapped in a StringScanner that the tokenizer reads from
def initialize(input)
  super()
  @input = input
  @scanner = StringScanner.new(input)
  # NOTE(review): presumably Racc's debug switch; it is enabled
  # unconditionally here, confirm that this is intended.
  @yydebug = true
end
442
+
443
# Tokenizes the whole input into @query and then hands control to do_parse,
# which pulls the buffered tokens back out through #next_token.
#
# @return whatever do_parse returns (do_parse is defined outside this
#   section; presumably the value built by the grammar actions)
def parse
  @query = []
  advance do |symbol, val|
    @query.push([symbol, val])
  end
  do_parse
end
448
+
449
# Hands out the tokens buffered by #parse one at a time, oldest first.
#
# @return [Array, nil] the next [symbol, value] pair, or nil once the buffer
#   is exhausted
def next_token
  @query.shift
end
452
+
453
# Resolves a namespace prefix to its namespace URI. Only the "xml" prefix is
# known to this method.
#
# @param prefix [String] namespace prefix without the trailing colon
# @return [String, nil] the namespace URI, or nil for any other prefix
def lookup_namespace_uri(prefix)
  return 'http://www.w3.org/XML/1998/namespace' if prefix == 'xml'

  nil
end
456
+
457
# Splits a qualified name into its local part and its namespace URI.
#
# The name is split at the first colon, and the colon itself belongs to
# neither the prefix nor the local part.
#
# @param qname [String] a name such as "lang" or "xml:lang"
# @return [Array] the local part and the namespace URI; the URI is nil when
#   the name carries no prefix
# @raise [ParseError] when the prefix cannot be resolved to a namespace URI
def expand_qname(qname)
  prefix, separator, local = qname.partition(':')
  return [qname, nil] if separator.empty?

  namespace_uri = lookup_namespace_uri(prefix)
  fail ParseError, 'invalid qname: %s' % qname unless namespace_uri

  [local, namespace_uri]
end
463
+
464
# Emits one token to the given block and remembers its symbol in
# @prev_token so that the tokenizer can disambiguate the next token.
#
# @param symbol [Symbol] token symbol such as :T_NAME_TEST
# @param val [String] the text matched for the token
# @return the value returned by the block
def token(symbol, val, &block)
  @prev_token = symbol
  block.(symbol, val)
end
468
+
469
# Looks up the token symbol registered for the scanned text and yields it.
#
# @param key [String] the scanned text, e.g. "(" or "and"
# @param constraints [Hash{String => Symbol}] table mapping text to symbols
# @yield [symbol] the symbol found for key
# @return the value returned by the block
# @raise [ParseError] when key has no entry in constraints; the message names
#   the offending text and the accepted keys
def fetch(key, constraints)
  symbol = constraints[key]
  fail ParseError, "unexpected token: #{key}, want = #{constraints.keys}" unless symbol

  yield symbol
end
475
+
476
# Tokenizes the whole input and passes every token to the given block as a
# (symbol, value) pair, in input order.
#
# A "*" or a name is scanned as a whole first and classified afterwards from
# its context, following "3.7 Lexical Structure" of XPath 1.0. Matching
# operator, axis and node type names as bare prefixes would split names such
# as "order", "children" or "textarea" into a keyword followed by garbage.
#
# @yield [symbol, val] the token symbol and the text matched for it
# @return [nil]
# @raise [ParseError] when the remaining input matches no token
# @see https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
def advance(&block)
  @prev_token = nil
  # Interpolated regexps are rebuilt on every evaluation, so build them once.
  whitespace_re = /#{S}+/
  # Q_NAME already covers unprefixed names; listing NC_NAME before it would
  # stop a prefixed name at its colon.
  name_re = /\*|#{Q_NAME}/
  number_re = /#{DIGITS}(\.(#{DIGITS})?)?/
  variable_re = /\$#{Q_NAME}/
  until eos?
    case
    # Skip whitespace everywhere.
    when scan(whitespace_re) then next
    when expr = scan(EXPRS)
      fetch(expr, EXPR_TOKENS) { |symbol| token(symbol, expr, &block) }
    # Names go before OPERATORS so that "and", "or", "mod" and "div" are only
    # treated as operators when they are complete names in operator position.
    when name = scan(name_re)
      token(classify_name(name), name, &block)
    when operator = scan(OPERATORS)
      fetch(operator, OPERATOR_TOKENS) { |symbol| token(symbol, operator, &block) }
    when literal = scan(/"[^"]*"|'[^']*'/) then token(:T_LITERAL, literal, &block)
    when number = scan(number_re) then token(:T_NUMBER, number, &block)
    when ref = scan(variable_re) then token(:T_VARIABLE_REFERENCE, ref, &block)
    else
      # StringScanner#pos is a byte offset, so slicing the string with it is
      # wrong for multibyte input; #rest returns the unscanned part directly.
      fail ParseError, "unexpected token: #{@scanner.rest}"
    end
  end
end

# Decides which token a just-scanned "*" or name stands for.
#
# @param name [String] "*" or a (possibly prefixed) name
# @return [Symbol] :T_MUL or an operator-name symbol in operator position,
#   :T_AXIS_NAME for an axis name followed by "::", :T_NODE_TYPE or
#   :T_FUNCTION_NAME for a name followed by "(", and :T_NAME_TEST otherwise
def classify_name(name)
  if operator_expected?
    return :T_MUL if name == '*'

    operator_name = OPERATOR_TOKENS[name]
    return operator_name if operator_name
  end
  return :T_NAME_TEST if name == '*'
  # The lookahead may skip whitespace, as the specification allows.
  return :T_AXIS_NAME if name =~ /\A(?:#{AXES})\z/ && @scanner.check(/#{S}*::/)
  return :T_NAME_TEST unless @scanner.check(/#{S}*\(/)

  name =~ /\A(?:#{NODE_TYPE})\z/ ? :T_NODE_TYPE : :T_FUNCTION_NAME
end

# Tells whether the previous token ends an operand, in which case a "*" or a
# name must be read as an operator. That is the case when a previous token
# exists and is none of "@", "::", "(", "[", "," or an operator.
#
# @return [Boolean]
def operator_expected?
  return false if @prev_token.nil?
  return false if %i[T_AT T_COLONCOLON T_LPAREN T_LBRACK T_COMMA T_MUL].include?(@prev_token)

  !OPERATOR_TOKENS.value?(@prev_token)
end