scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,1603 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# Full XPath 1.0 expression engine.
|
|
5
|
+
#
|
|
6
|
+
# Pipeline:
|
|
7
|
+
# 1. Tokenizer -> array of [:type, value] tokens
|
|
8
|
+
# 2. Parser -> AST (recursive-descent, full XPath 1.0 grammar)
|
|
9
|
+
# 3. Evaluator -> walks the AST against a Scrapetor::Document/Node
|
|
10
|
+
#
|
|
11
|
+
# Axis traversals dispatch to native C primitives on the arena DOM
|
|
12
|
+
# (`node_following_sibling_ids`, `node_ancestor_ids`, `node_following_ids`,
|
|
13
|
+
# `node_preceding_ids`, `node_descendant_comment_ids`, …) so the hot
|
|
14
|
+
# path stays in C even though the AST walk runs in Ruby.
|
|
15
|
+
#
|
|
16
|
+
# Compiled ASTs are cached on the module (size-bounded, oldest-inserted entry evicted first) so repeated
|
|
17
|
+
# queries — typical in scraping pipelines that run the same parser
|
|
18
|
+
# against thousands of pages — only pay the tokenize/parse cost once.
|
|
19
|
+
module XPath
|
|
20
|
+
class UnsupportedError < StandardError; end
|
|
21
|
+
class ParseError < StandardError; end
|
|
22
|
+
|
|
23
|
+
AST_CACHE_CAP = 1024
|
|
24
|
+
@ast_cache = {}
|
|
25
|
+
@ast_cache_mutex = Mutex.new
|
|
26
|
+
|
|
27
|
+
# Evaluate the XPath expression `expr` against `context`.
#
# The compiled entry (AST plus optional CSS descriptor) is looked up in
# the module-level cache first; only a miss pays for `cache_compile`.
# When the entry carries a CSS descriptor the query is answered through
# `run_via_css`; otherwise the AST is handed to a fresh Evaluator.
def self.evaluate(context, expr)
  source = expr.to_s
  compiled = @ast_cache[source] || cache_compile(source)

  descriptor = compiled[:css]
  return run_via_css(context, descriptor) if descriptor

  Evaluator.new(context).eval_program(compiled[:ast])
end
|
|
39
|
+
|
|
40
|
+
# Compile `expr` (coerced with `to_s`) and return only its AST; the
# compiled entry is obtained from, and stored by, `cache_compile`.
def self.compile(expr)
  entry = cache_compile(expr.to_s)
  entry[:ast]
end
|
|
43
|
+
|
|
44
|
+
# Return the cached `{ ast:, css: }` entry for `expr`, compiling and
# storing it on a miss.
#
# The cache is checked once without the lock and again under
# `@ast_cache_mutex` before compiling. When the cache already holds
# AST_CACHE_CAP entries the oldest-inserted one is dropped (`Hash#shift`).
#
# Raises ParseError when tokens remain after a complete expression
# (e.g. "//a ]"), instead of silently ignoring the trailing input.
def self.cache_compile(expr)
  cached = @ast_cache[expr]
  return cached if cached

  @ast_cache_mutex.synchronize do
    cached = @ast_cache[expr]
    return cached if cached

    parser = Parser.new(Tokenizer.tokenize(expr), expr)
    ast = parser.parse_expr
    # parse_expr stops at the first token it cannot extend the expression
    # with; anything other than :eof here is unparsed trailing input.
    leftover = parser.peek
    unless leftover.nil? || leftover[0] == :eof
      raise ParseError, "unexpected #{leftover.inspect} after expression in `#{expr}`"
    end

    entry = { ast: ast, css: CssTranslator.translate(ast) }
    @ast_cache.shift if @ast_cache.size >= AST_CACHE_CAP
    @ast_cache[expr] = entry
    entry
  end
end
|
|
58
|
+
|
|
59
|
+
# Execute a translated CSS chain. Handles the ::attr / ::text
|
|
60
|
+
# tail forms that CssTranslator emits for `/@x` and `/text()`
|
|
61
|
+
# terminations. Returns an Array (XPath shape) regardless of
|
|
62
|
+
# the underlying CSS return type.
|
|
63
|
+
def self.run_via_css(context, css_descriptor)
  # Only :sel is consulted. The descriptor's :kind is not needed here:
  # for attribute tails the selector string itself already ends in
  # `::attr(name)` (see CssTranslator.translate), so resolving it is left
  # to `context.css`.
  matches = context.css(css_descriptor[:sel])
  # Normalise to an Array: enumerable results via to_a, anything else
  # wrapped by Kernel#Array.
  matches.respond_to?(:to_a) ? matches.to_a : Array(matches)
end
|
|
70
|
+
|
|
71
|
+
# ============================================================
|
|
72
|
+
# Tokenizer
|
|
73
|
+
# ============================================================
|
|
74
|
+
|
|
75
|
+
# Lexer for XPath expressions: turns a source string into an Array of
# `[type, value]` pairs terminated by `[:eof, nil]`.
#
# `*` and the words `and` / `or` / `div` / `mod` are ambiguous in XPath;
# they are operators only when the previous token ends an operand
# (XPath 1.0 section 3.7). Both decisions go through `operand_before?`.
module Tokenizer
  OPERATORS = %w[// / .. . :: @ ( ) [ ] , | + - = != <= >= < > * div mod and or].freeze

  # Punctuation that always lexes to the same one-character token.
  SINGLE_CHAR_TOKENS = {
    "(" => :lparen,
    ")" => :rparen,
    "[" => :lbracket,
    "]" => :rbracket,
    "," => :comma,
    "@" => :at,
    "|" => :pipe,
    "+" => :plus,
    "-" => :minus,
    "=" => :eq
  }.freeze

  # Name-shaped operators and the token type each produces.
  OPERATOR_NAMES = {
    "and" => :and_op,
    "or" => :or_op,
    "div" => :div_op,
    "mod" => :mod_op
  }.freeze

  # Token types that can end an operand. A `*` name test and `..` count
  # (so `* and @x` and `.. * 2` lex correctly); the multiply operator does
  # not (so `2 * div` keeps `div` as an element name).
  OPERAND_END_TOKENS = %i[name number string rparen rbracket dot dot_dot star].freeze

  # Tokenize `s`.
  #
  # Returns an Array of `[type, value]` pairs ending with `[:eof, nil]`.
  # Integer literals carry an Integer value, decimals a Float.
  # Raises ParseError on a stray `!`, an unterminated string literal, or a
  # character that cannot start any token.
  def self.tokenize(s)
    tokens = []
    i = 0
    len = s.length
    while i < len
      c = s[i]
      nxt = s[i + 1]

      simple = SINGLE_CHAR_TOKENS[c]
      if simple
        # Note: `-` inside a name never reaches here; the name scanner
        # below consumes hyphens as part of the name.
        tokens << [simple, c]
        i += 1
        next
      end

      case c
      when " ", "\t", "\n", "\r"
        i += 1
      when "/"
        if nxt == "/"
          tokens << [:slash_slash, "//"]
          i += 2
        else
          tokens << [:slash, "/"]
          i += 1
        end
      when "!"
        raise ParseError, "stray `!` in `#{s}`" unless nxt == "="

        tokens << [:neq, "!="]
        i += 2
      when "<"
        if nxt == "="
          tokens << [:le, "<="]
          i += 2
        else
          tokens << [:lt, "<"]
          i += 1
        end
      when ">"
        if nxt == "="
          tokens << [:ge, ">="]
          i += 2
        else
          tokens << [:gt, ">"]
          i += 1
        end
      when ":"
        if nxt == ":"
          tokens << [:axis_sep, "::"]
          i += 2
        else
          tokens << [:colon, ":"]
          i += 1
        end
      when "*"
        # Multiply after an operand, otherwise the "any element" name test.
        tokens << [operand_before?(tokens) ? :star_mul : :star, "*"]
        i += 1
      when "."
        if nxt == "."
          tokens << [:dot_dot, ".."]
          i += 2
        elsif digit?(nxt)
          # Numeric literal starting with "." (e.g. ".5").
          j = i + 1
          j += 1 while j < len && digit?(s[j])
          tokens << [:number, s[i...j].to_f]
          i = j
        else
          tokens << [:dot, "."]
          i += 1
        end
      when "'", '"'
        # String literal: everything up to the matching quote, no escapes.
        j = i + 1
        j += 1 while j < len && s[j] != c
        raise ParseError, "unterminated string in `#{s}`" if j >= len

        tokens << [:string, s[(i + 1)...j]]
        i = j + 1
      when "0".."9"
        j = i
        j += 1 while j < len && digit?(s[j])
        # A "." belongs to the number only when it ends the input or is
        # followed by a digit; otherwise it is left for the next token.
        if j < len && s[j] == "." && (j + 1 >= len || digit?(s[j + 1]))
          j += 1
          j += 1 while j < len && digit?(s[j])
          tokens << [:number, s[i...j].to_f]
        else
          tokens << [:number, s[i...j].to_i]
        end
        i = j
      else
        raise ParseError, "unrecognised char `#{c}` at #{i} in `#{s}`" unless c.match?(/[A-Za-z_]/)

        # Name: ASCII letters, digits, `_` and `-`.
        j = i
        j += 1 while j < len && s[j].match?(/[A-Za-z0-9_\-]/)
        name = s[i...j]
        op_type = operand_before?(tokens) ? OPERATOR_NAMES[name] : nil
        tokens << [op_type || :name, name]
        i = j
      end
    end
    tokens << [:eof, nil]
    tokens
  end

  # True when the most recent token ends an operand, i.e. the next `*` or
  # operator-shaped name must be read as an operator.
  def self.operand_before?(tokens)
    prev = tokens.last
    !prev.nil? && OPERAND_END_TOKENS.include?(prev[0])
  end
  private_class_method :operand_before?

  # True when `ch` is a single ASCII digit; nil (past end of input) is not.
  def self.digit?(ch)
    !ch.nil? && ch >= "0" && ch <= "9"
  end
  private_class_method :digit?
end
|
|
219
|
+
|
|
220
|
+
# ============================================================
|
|
221
|
+
# Parser → AST
|
|
222
|
+
#
|
|
223
|
+
# AST node shapes (all Hashes):
|
|
224
|
+
# { t: :path, steps: [step,...], absolute: bool, double_slash: bool }
|
|
225
|
+
# step: { axis: :child|:descendant_or_self|..., nt: nodetest, preds: [Expr,...] }
|
|
226
|
+
# nodetest: :any_element | :text | :comment | :node | :pi(name=...) | {name: 'tag'} | :attr({name})
|
|
227
|
+
# { t: :or, l:, r: }
|
|
228
|
+
# { t: :and, l:, r: }
|
|
229
|
+
# { t: :cmp, op: :eq|:neq|:lt|:le|:gt|:ge, l:, r: }
|
|
230
|
+
# { t: :add, op: :plus|:minus, l:, r: }
|
|
231
|
+
# { t: :mul, op: :mul|:div|:mod, l:, r: }
|
|
232
|
+
# { t: :neg, e: }
|
|
233
|
+
# { t: :union, ops: [Expr,...] }
|
|
234
|
+
# { t: :filter, primary: Expr, preds: [Expr,...] }
# { t: :filter_path, primary: Expr, steps: [step,...] } a primary/filter expression continued with /path or //path
|
|
235
|
+
# { t: :func, name: 'count', args: [Expr,...] }
|
|
236
|
+
# { t: :num, v: }
|
|
237
|
+
# { t: :str, v: }
|
|
238
|
+
# ============================================================
|
|
239
|
+
|
|
240
|
+
# Recursive-descent parser: consumes the `[type, value]` token array and
# builds a Hash-based AST. Precedence, loosest first: or, and, equality,
# relational, additive, multiplicative, unary minus, union, path.
class Parser
  # Axis names accepted before `::`, mapped to the symbols used in steps.
  AXIS_SYMBOLS = {
    "child" => :child,
    "descendant" => :descendant,
    "descendant-or-self" => :descendant_or_self,
    "parent" => :parent,
    "self" => :self,
    "ancestor" => :ancestor,
    "ancestor-or-self" => :ancestor_or_self,
    "following-sibling" => :following_sibling,
    "preceding-sibling" => :preceding_sibling,
    "following" => :following,
    "preceding" => :preceding,
    "attribute" => :attribute,
    "namespace" => :namespace
  }.freeze

  # tokens: token array; raw: the source text, used only in error messages.
  def initialize(tokens, raw)
    @tokens = tokens
    @pos = 0
    @raw = raw
  end

  # Token `k` positions ahead of the cursor, without consuming it.
  def peek(k = 0)
    @tokens[@pos + k]
  end

  # Return the current token and advance the cursor.
  def consume
    token = @tokens[@pos]
    @pos += 1
    token
  end

  # Consume and return the current token if it has the given type;
  # otherwise raise ParseError.
  def expect(type)
    token = @tokens[@pos]
    return consume if token && token[0] == type

    raise ParseError, "expected #{type} got #{token.inspect} in `#{@raw}`"
  end

  def parse_expr
    parse_or
  end

  def parse_or
    left_assoc(%i[or_op], :parse_and) { |lhs, _type, rhs| { t: :or, l: lhs, r: rhs } }
  end

  def parse_and
    left_assoc(%i[and_op], :parse_equality) { |lhs, _type, rhs| { t: :and, l: lhs, r: rhs } }
  end

  def parse_equality
    left_assoc(%i[eq neq], :parse_relational) do |lhs, type, rhs|
      { t: :cmp, op: type, l: lhs, r: rhs }
    end
  end

  def parse_relational
    left_assoc(%i[lt le gt ge], :parse_additive) do |lhs, type, rhs|
      { t: :cmp, op: type, l: lhs, r: rhs }
    end
  end

  def parse_additive
    left_assoc(%i[plus minus], :parse_multiplicative) do |lhs, type, rhs|
      { t: :add, op: type, l: lhs, r: rhs }
    end
  end

  def parse_multiplicative
    left_assoc(%i[star_mul div_op mod_op], :parse_unary) do |lhs, type, rhs|
      op = { star_mul: :mul, div_op: :div, mod_op: :mod }.fetch(type)
      { t: :mul, op: op, l: lhs, r: rhs }
    end
  end

  # UnaryExpr := UnionExpr | '-' UnaryExpr
  def parse_unary
    return parse_union unless peek[0] == :minus

    consume
    { t: :neg, e: parse_unary }
  end

  # A single operand is returned as-is; two or more become a :union node.
  def parse_union
    first = parse_path
    return first unless peek[0] == :pipe

    branches = [first]
    while peek[0] == :pipe
      consume
      branches << parse_path
    end
    { t: :union, ops: branches }
  end

  # PathExpr := LocationPath | FilterExpr ('/' RelativeLocationPath | '//' RelativeLocationPath)?
  def parse_path
    return parse_location_path if location_path_start?

    primary = parse_primary
    preds = parse_predicates
    base = preds.empty? ? primary : { t: :filter, primary: primary, preds: preds }
    return base unless %i[slash slash_slash].include?(peek[0])

    steps = []
    consume_path_separator(steps)
    parse_relative_location_path_into(steps)
    { t: :filter_path, primary: base, steps: steps }
  end

  # Whether the upcoming tokens begin a location path rather than a
  # primary expression. A name followed by `(` is a location step only
  # for the node-type tests; any other `name(` is a function call.
  def location_path_start?
    case peek[0]
    when :slash, :slash_slash, :dot, :dot_dot, :at, :star
      true
    when :name
      return true unless peek(1)[0] == :lparen

      %w[text comment node processing-instruction].include?(peek[1])
    else
      false
    end
  end

  def parse_location_path
    steps = []
    lead = peek[0]
    absolute = %i[slash slash_slash].include?(lead)
    consume_path_separator(steps) if absolute
    # A lone "/" with no step after it selects just the root.
    if lead == :slash && !step_start?
      return { t: :path, steps: steps, absolute: true, double_slash: false }
    end

    parse_relative_location_path_into(steps)
    { t: :path, steps: steps, absolute: absolute, double_slash: lead == :slash_slash }
  end

  # Append one step, then one more per following `/` or `//` separator.
  def parse_relative_location_path_into(steps)
    steps << parse_step
    while %i[slash slash_slash].include?(peek[0])
      consume_path_separator(steps)
      steps << parse_step
    end
  end

  # Consume a `/` or `//`; `//` additionally contributes the implicit
  # descendant-or-self::node() step.
  def consume_path_separator(steps)
    return unless consume[0] == :slash_slash

    steps << { axis: :descendant_or_self, nt: :node, preds: [] }
  end

  def step_start?
    %i[name star at dot dot_dot].include?(peek[0])
  end

  # Step := '.' | '..' | AxisSpecifier NodeTest Predicate*
  def parse_step
    case peek[0]
    when :dot
      consume
      return { axis: :self, nt: :node, preds: [] }
    when :dot_dot
      consume
      return { axis: :parent, nt: :node, preds: [] }
    end

    axis = parse_axis
    node_test = parse_node_test
    { axis: axis, nt: node_test, preds: parse_predicates }
  end

  # NodeTest := '*' | node-type test | (prefix ':')? name
  def parse_node_test
    case peek[0]
    when :star
      consume
      :any_element
    when :name
      peek(1)[0] == :lparen ? parse_node_type_test : parse_name_test
    else
      raise ParseError, "expected node test, got #{peek.inspect} in `#{@raw}`"
    end
  end

  # PrimaryExpr := string | number | '(' Expr ')' | FunctionCall
  def parse_primary
    case peek[0]
    when :string
      { t: :str, v: consume[1] }
    when :number
      { t: :num, v: consume[1] }
    when :lparen
      consume
      inner = parse_expr
      expect(:rparen)
      inner
    when :name
      fname = consume[1]
      expect(:lparen)
      args = []
      unless peek[0] == :rparen
        loop do
          args << parse_expr
          break unless peek[0] == :comma

          consume
        end
      end
      expect(:rparen)
      { t: :func, name: fname, args: args }
    else
      raise ParseError, "unexpected `#{peek.inspect}` in `#{@raw}`"
    end
  end

  private

  # Shared loop for every left-associative binary level: parse an operand
  # with `operand`, then while the next token type is in `op_types`,
  # consume it, parse another operand and let the block build the node.
  def left_assoc(op_types, operand)
    node = send(operand)
    while op_types.include?(peek[0])
      type = consume[0]
      node = yield(node, type, send(operand))
    end
    node
  end

  # Zero or more `[ Expr ]` predicates, in source order.
  def parse_predicates
    preds = []
    while peek[0] == :lbracket
      consume
      preds << parse_expr
      expect(:rbracket)
    end
    preds
  end

  # Optional axis specifier: `@`, or `name::`; defaults to :child.
  def parse_axis
    if peek[0] == :at
      consume
      return :attribute
    end
    return :child unless peek[0] == :name && peek(1)[0] == :axis_sep

    axis_name = consume[1]
    axis = AXIS_SYMBOLS[axis_name]
    raise UnsupportedError, "unknown axis `#{axis_name}` in `#{@raw}`" unless axis

    expect(:axis_sep)
    axis
  end

  # `node()`, `text()`, `comment()` or `processing-instruction('target'?)`.
  def parse_node_type_test
    type_name = consume[1]
    expect(:lparen)
    case type_name
    when "node", "text", "comment"
      expect(:rparen)
      type_name.to_sym
    when "processing-instruction"
      target = peek[0] == :string ? consume[1] : nil
      expect(:rparen)
      { pi: target }
    else
      raise ParseError, "unexpected `#{type_name}(` as node test in `#{@raw}`"
    end
  end

  # Tag name; for a `prefix:local` qname only the local part is kept.
  def parse_name_test
    name = consume[1]
    if peek[0] == :colon && peek(1)[0] == :name
      consume
      name = consume[1]
    end
    { name: name }
  end
end
|
|
550
|
+
|
|
551
|
+
# ============================================================
|
|
552
|
+
# CSS Translator: convert simple XPath AST shapes to CSS selectors
|
|
553
|
+
# so the heavily-optimised native CSS matcher answers them in one
|
|
554
|
+
# C call. Returns nil if the AST contains anything that doesn't
|
|
555
|
+
# round-trip cleanly to CSS (boolean predicates, position()/last()
|
|
556
|
+
# functions, sibling/ancestor axes, etc.) — caller then falls back
|
|
557
|
+
# to the full evaluator.
|
|
558
|
+
# ============================================================
|
|
559
|
+
|
|
560
|
+
# Translates simple XPath path ASTs into CSS selector descriptors.
# Every method returns nil for anything it cannot express with identical
# semantics, which tells the caller to use the full evaluator instead.
module CssTranslator
  # Translate a path AST.
  #
  # Returns `{ sel: String, kind: :nodes }`, or `{ sel: String, kind: :attr }`
  # when the path ends in `/@name` (the selector then ends in
  # `::attr(name)`), or nil when the path is not translatable.
  #
  # NOTE(review): the first child step is emitted as a bare tag, so it
  # matches at any depth under the context even for `/tag` and relative
  # `tag` paths. Confirm this is intended given how `context.css` scopes.
  def self.translate(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :path

    steps = ast[:steps]
    return nil if steps.empty?

    css_parts = []
    after_descendant = false
    last_index = steps.length - 1

    steps.each_with_index do |st, idx|
      axis = st[:axis]
      nt = st[:nt]
      preds = st[:preds]

      # `descendant-or-self::node()` (what `//` expands to) only selects
      # the combinator used for the next step.
      if axis == :descendant_or_self && nt == :node && preds.empty?
        after_descendant = true
        next
      end

      # Tail extraction: `/@attr` as the final step. Not translatable when
      # it follows `//` (XPath would also collect descendants' attributes)
      # or carries predicates (they would be dropped).
      if axis == :attribute && nt.is_a?(Hash) && nt[:name] && idx == last_index
        return nil if css_parts.empty? || after_descendant || !preds.empty?

        return { sel: "#{css_parts.join}::attr(#{nt[:name]})", kind: :attr }
      end

      # text() is never routed through CSS: XPath yields one node per text
      # segment, which a CSS text extraction does not reproduce.

      # following-sibling::tag maps to the general sibling combinator `~`.
      # Positional predicates are rejected here: XPath counts positions
      # among the following siblings of each context node, :nth-of-type
      # counts among all same-tag children of the parent.
      if axis == :following_sibling
        tag = sibling_axis_tag(nt)
        return nil unless tag
        return nil if css_parts.empty? || after_descendant

        pred_strs = collect_pred_strs(preds, allow_positional: false)
        return nil if pred_strs.nil?

        css_parts << " ~ " << tag
        css_parts.concat(pred_strs)
        after_descendant = false
        next
      end

      # Every other step must be on the child axis.
      return nil unless axis == :child

      # Node test must be a tag name or `*`.
      tag = sibling_axis_tag(nt)
      return nil unless tag

      # `*[N]` is the Nth element child of any type, which :nth-of-type
      # cannot express, so positions are only allowed on concrete tags.
      pred_strs = collect_pred_strs(preds, allow_positional: nt != :any_element)
      return nil if pred_strs.nil?

      if css_parts.empty?
        css_parts << tag
      elsif after_descendant
        css_parts << " " << tag
      else
        css_parts << " > " << tag
      end
      css_parts.concat(pred_strs)
      after_descendant = false
    end

    # A trailing descendant-or-self::node() would select extra nodes that
    # the accumulated selector does not cover.
    return nil if after_descendant

    sel = css_parts.join
    return nil if sel.empty?

    { sel: sel, kind: :nodes }
  end

  # Convert one predicate AST to a CSS attribute-selector or pseudo-class
  # fragment. Returns nil if the predicate uses anything CSS can't express.
  def self.translate_predicate(ast)
    case ast[:t]
    when :path
      # `[@attr]`: a single attribute step without predicates.
      steps = ast[:steps]
      return nil unless steps.length == 1

      st = steps[0]
      if st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name] && st[:preds].empty?
        return "[#{st[:nt][:name]}]"
      end

      nil
    when :num
      # `[N]` on child::tag is the Nth same-tag child: `:nth-of-type(N)`.
      # A non-integral N can never equal a position, so it is not
      # truncated; the evaluator handles it.
      n = const_int(ast)
      return nil unless n && n >= 1

      ":nth-of-type(#{n})"
    when :cmp
      # position() compared with an integer constant, either operand order.
      if is_position_func?(ast[:l]) && (n = const_int(ast[:r]))
        return nth_for(ast[:op], n)
      elsif is_position_func?(ast[:r]) && (n = const_int(ast[:l]))
        return nth_for(flip_cmp(ast[:op]), n)
      end
      return nil unless ast[:op] == :eq

      # `@attr = 'literal'`, either operand order.
      attr_name = extract_attr_name(ast[:l])
      val_lit = extract_string_literal(ast[:r])
      if attr_name.nil?
        attr_name = extract_attr_name(ast[:r])
        val_lit = extract_string_literal(ast[:l])
      end
      return nil if attr_name.nil? || val_lit.nil?

      "[#{attr_name}=#{quote_css(val_lit)}]"
    when :and
      # CSS treats adjacent fragments (`[a][b]`) as a logical AND.
      l = translate_predicate(ast[:l])
      r = translate_predicate(ast[:r])
      return nil if l.nil? || r.nil?

      "#{l}#{r}"
    when :func
      operator =
        case ast[:name]
        when "contains" then "*="
        when "starts-with" then "^="
        end
      return nil unless operator && ast[:args].length == 2

      a = extract_attr_name(ast[:args][0])
      v = extract_string_literal(ast[:args][1])
      # An empty needle is always true in XPath but matches nothing as a
      # CSS substring selector, so it is left to the evaluator.
      return nil unless a && v && !v.empty?

      "[#{a}#{operator}#{quote_css(v)}]"
    end
  end

  # Name of the attribute when `ast` is exactly `@name` (one attribute
  # step, no predicates); nil otherwise.
  def self.extract_attr_name(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :path

    steps = ast[:steps]
    return nil unless steps.length == 1

    st = steps[0]
    return nil unless st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name]
    return nil unless st[:preds].empty?

    st[:nt][:name]
  end

  # The String value when `ast` is a string literal node; nil otherwise.
  def self.extract_string_literal(ast)
    return nil unless ast.is_a?(Hash)

    ast[:t] == :str ? ast[:v] : nil
  end

  # Quote `s` for use as a CSS attribute value: double quotes (with `"`
  # backslash-escaped) when it contains whitespace, brackets, quotes or
  # `=`; single quotes otherwise.
  def self.quote_css(s)
    s.match?(/[\s\[\]'"=]/) ? "\"#{s.gsub('"', '\\"')}\"" : "'#{s}'"
  end

  # CSS type selector for a node test: "*" for any element, the tag name
  # for a name test, nil for every other node test.
  def self.sibling_axis_tag(nt)
    case nt
    when :any_element then "*"
    when Hash then nt[:name]
    end
  end

  # Translate all predicates of one step, in order. Returns an Array of
  # fragments, or nil as soon as one predicate is not translatable.
  #
  # A positional predicate is accepted only as the first predicate and
  # only when `allow_positional` is true: after another predicate XPath
  # renumbers positions within the filtered set, which CSS cannot follow.
  def self.collect_pred_strs(preds, allow_positional: true)
    out = []
    preds.each_with_index do |pred, i|
      return nil if positional_predicate?(pred) && !(allow_positional && i.zero?)

      fragment = translate_predicate(pred)
      return nil if fragment.nil?

      out << fragment
    end
    out
  end

  # True when the predicate depends on the context position: a bare
  # number, a comparison involving position(), or an `and` of such.
  def self.positional_predicate?(ast)
    return false unless ast.is_a?(Hash)

    case ast[:t]
    when :num then true
    when :cmp then is_position_func?(ast[:l]) || is_position_func?(ast[:r])
    when :and then positional_predicate?(ast[:l]) || positional_predicate?(ast[:r])
    else false
    end
  end

  def self.is_position_func?(ast)
    ast.is_a?(Hash) && ast[:t] == :func && ast[:name] == "position" && ast[:args].empty?
  end

  # Integer value of a numeric literal node, or nil when `ast` is not a
  # numeric literal or its value is not a whole number.
  def self.const_int(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :num

    value = ast[:v]
    return value if value.is_a?(Integer)
    return nil unless value.is_a?(Float) && value.finite? && value == value.floor

    value.to_i
  end

  # Mirror a comparison operator for swapped operands (`N < x` is `x > N`).
  def self.flip_cmp(op)
    case op
    when :lt then :gt
    when :le then :ge
    when :gt then :lt
    when :ge then :le
    else op
    end
  end

  # :nth-of-type formula for `position() <op> n`; nil when the operator
  # has no formula (e.g. :neq) or the range would be empty.
  def self.nth_for(op, n)
    case op
    when :eq then ":nth-of-type(#{n})"
    when :gt then ":nth-of-type(n+#{n + 1})"
    when :ge then ":nth-of-type(n+#{n})"
    when :lt
      # Positions strictly below n, i.e. 1..n-1; CSS `-n+K` matches 1..K.
      return nil if n <= 1

      ":nth-of-type(-n+#{n - 1})"
    when :le
      return nil if n < 1

      ":nth-of-type(-n+#{n})"
    end
  end
end
|
|
806
|
+
|
|
807
|
+
# ============================================================
|
|
808
|
+
# Evaluator
|
|
809
|
+
# ============================================================
|
|
810
|
+
|
|
811
|
+
class Evaluator
|
|
812
|
+
# Bind the evaluator to `context`, which is either a Scrapetor::Document
# or an object that exposes its owning document through `#document`.
def initialize(context)
  @document =
    if context.is_a?(Scrapetor::Document)
      context
    else
      context.document
    end
  # Native handles resolved for this context by native_handles_for.
  @native_doc, @native_root_id = native_handles_for(context)
  @context_input = context
end
|
|
817
|
+
|
|
818
|
+
# Evaluates a parsed XPath expression from the initial context node.
#
# @param ast [Hash] root of the expression AST
# @return [Array] the result; a scalar result is wrapped in a one-element
#   array so callers always receive an Array
def eval_program(ast)
  start = [initial_context_node]
  outcome = eval_expr(ast, start, nil)
  return outcome if outcome.is_a?(Array)

  [outcome]
end
|
|
823
|
+
|
|
824
|
+
# Picks the node evaluation starts from and records it in @initial.
# A Document input starts at its backing object (so children can be
# reached through it); any other input is used as the context itself.
#
# @return [Object] the initial context node
def initial_context_node
  source = @context_input
  @initial = source.is_a?(Scrapetor::Document) ? source.backing : source
end
|
|
835
|
+
|
|
836
|
+
# Evaluates one AST node against +context_set+.
#
# Returns one of:
#   Array<Node|String>  (node-set, or string-set for /@x and /text())
#   String / Numeric / TrueClass / FalseClass / NilClass  (scalar)
#
# Arithmetic follows IEEE 754 as XPath 1.0 §3.5 requires: `div` is plain
# floating-point division, and `mod` takes the sign of the dividend
# (truncating remainder) and accepts fractional operands.
#
# @param ast [Hash] AST node; ast[:t] selects the node kind
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil] {position:, last:} inside a predicate
# @raise [UnsupportedError] for an unknown AST node kind
def eval_expr(ast, context_set, position_info)
  case ast[:t]
  when :path
    eval_path(ast, context_set, position_info)
  when :filter
    base = eval_expr(ast[:primary], context_set, position_info)
    apply_predicates(base, ast[:preds])
  when :filter_path
    base = eval_expr(ast[:primary], context_set, position_info)
    eval_steps_against(base, ast[:steps])
  when :union
    eval_union_expr(ast[:ops], context_set, position_info)
  when :or
    xpath_boolean(eval_expr(ast[:l], context_set, position_info)) ||
      xpath_boolean(eval_expr(ast[:r], context_set, position_info))
  when :and
    xpath_boolean(eval_expr(ast[:l], context_set, position_info)) &&
      xpath_boolean(eval_expr(ast[:r], context_set, position_info))
  when :cmp
    do_compare(ast[:op],
               eval_expr(ast[:l], context_set, position_info),
               eval_expr(ast[:r], context_set, position_info))
  when :add
    lhs = xpath_number(eval_expr(ast[:l], context_set, position_info))
    rhs = xpath_number(eval_expr(ast[:r], context_set, position_info))
    # NaN propagates through Float arithmetic on its own, so no NaN guard
    # is needed (the old one called Integer#nan?, which does not exist).
    ast[:op] == :plus ? (lhs + rhs) : (lhs - rhs)
  when :mul
    lhs = xpath_number(eval_expr(ast[:l], context_set, position_info))
    rhs = xpath_number(eval_expr(ast[:r], context_set, position_info))
    eval_multiplicative_expr(ast[:op], lhs, rhs)
  when :neg
    -xpath_number(eval_expr(ast[:e], context_set, position_info))
  when :num, :str
    ast[:v]
  when :func
    call_function(ast[:name], ast[:args], context_set, position_info)
  else
    raise UnsupportedError, "unknown AST node: #{ast[:t]}"
  end
end

# Evaluates every operand of a union and merges the results, keeping the
# first occurrence of each node (by node_identity).
def eval_union_expr(operands, context_set, position_info)
  seen = {}
  operands.each_with_object([]) do |operand, merged|
    result = eval_expr(operand, context_set, position_info)
    result = [result] unless result.is_a?(Array)
    result.each do |node|
      key = node_identity(node)
      next if seen[key]

      seen[key] = true
      merged << node
    end
  end
end

# Applies `*`, `div` or `mod` to two already-numeric operands.
def eval_multiplicative_expr(op, lhs, rhs)
  case op
  when :mul then lhs * rhs
  when :div then lhs.to_f / rhs.to_f
  when :mod
    either_nan = [lhs, rhs].any? { |x| x.respond_to?(:nan?) && x.nan? }
    return Float::NAN if either_nan || rhs.zero?

    lhs.remainder(rhs)
  end
end
|
|
900
|
+
|
|
901
|
+
# Evaluates a location path: absolute paths start from the document root,
# relative paths from the current context set.
#
# @param ast [Hash] :path node with :absolute and :steps
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil] unused here
# @return [Array] nodes (or strings) selected by the final step
def eval_path(ast, context_set, position_info)
  start = ast[:absolute] ? [root_for_context] : context_set
  eval_steps_against(start, ast[:steps])
end
|
|
910
|
+
|
|
911
|
+
# Runs +steps+ left to right, feeding each step's output to the next.
# A step's predicates are applied right after its axis walk.
#
# @param nodes [Array] starting node-set
# @param steps [Array<Hash>] steps, each carrying :preds
# @return [Array] node-set after the last step (+nodes+ when no steps)
def eval_steps_against(nodes, steps)
  steps.inject(nodes) do |frontier, step|
    walked = step_walk(frontier, step)
    step[:preds].empty? ? walked : apply_step_predicates(walked, step[:preds])
  end
end
|
|
919
|
+
|
|
920
|
+
# Walks one axis step from every node in +current+ and gathers the nodes
# that pass the step's node test.
#
# @param current [Array] input node-set
# @param st [Hash] step with :axis and :nt (node test)
# @return [Array] matches with duplicates removed
def step_walk(current, st)
  test = st[:nt]
  found = []
  current.each do |node|
    case st[:axis]
    when :child then collect_children(node, test, found)
    when :descendant then collect_descendants(node, test, found)
    when :descendant_or_self
      collect_self(node, test, found)
      collect_descendants(node, test, found)
    when :parent
      up = parent_of(node)
      push_if_matches(up, test, found) if up
    when :self then collect_self(node, test, found)
    when :ancestor
      ancestors_of(node).each { |anc| push_if_matches(anc, test, found) }
    when :ancestor_or_self
      ancestors_of(node).each { |anc| push_if_matches(anc, test, found) }
      collect_self(node, test, found)
    when :following_sibling
      following_siblings_of(node).each { |sib| push_if_matches(sib, test, found) }
    when :preceding_sibling
      preceding_siblings_of(node).each { |sib| push_if_matches(sib, test, found) }
    when :following
      following_of(node).each { |nxt| push_if_matches(nxt, test, found) }
    when :preceding
      preceding_of(node).each { |prv| push_if_matches(prv, test, found) }
    when :attribute then collect_attributes(node, test, found)
    when :namespace
      # Namespace nodes are not modelled, so this axis selects nothing.
    end
  end
  # XPath 1.0 §2.1 defines a step's result as a node-set. Walking an axis
  # from several context nodes can reach the same node more than once
  # (e.g. //dt/following-sibling::dd from many sibling dts), so drop
  # repeats by node identity before handing the set back.
  dedupe_node_set(found)
end
|
|
965
|
+
|
|
966
|
+
# Removes repeated nodes, keeping the first occurrence of each as judged
# by node_identity. Sets with fewer than two entries are returned as-is.
#
# @param nodes [Array]
# @return [Array]
def dedupe_node_set(nodes)
  return nodes if nodes.length < 2

  nodes.uniq { |node| node_identity(node) }
end
|
|
978
|
+
|
|
979
|
+
# Filters +nodes+ through each predicate in turn. Positions are 1-based
# and renumbered after every predicate, as XPath requires.
#
# A numeric predicate result is a position test: the node is kept only
# when the number equals its position exactly. The comparison is numeric
# rather than truncating, so `[1.5]` selects nothing and a NaN or infinite
# result no longer raises FloatDomainError.
#
# @param nodes [Array] candidate nodes
# @param preds [Array<Hash>] predicate ASTs
# @return [Array] nodes that satisfied every predicate
def apply_step_predicates(nodes, preds)
  preds.each do |pred_ast|
    total = nodes.length
    nodes = nodes.select.with_index(1) do |node, position|
      result = eval_expr(pred_ast, [node], { position: position, last: total })
      result.is_a?(Numeric) ? result == position : xpath_boolean(result)
    end
  end
  nodes
end
|
|
999
|
+
|
|
1000
|
+
# Applies filter-expression predicates to +base+, treating a scalar
# result as a one-element set.
#
# @param base [Object] primary-expression result
# @param preds [Array<Hash>] predicate ASTs
# @return [Array]
def apply_predicates(base, preds)
  candidates = base.is_a?(Array) ? base : [base]
  apply_step_predicates(candidates, preds)
end
|
|
1004
|
+
|
|
1005
|
+
# ---- Node identity / wrapping ------------------------------------
|
|
1006
|
+
|
|
1007
|
+
# The node absolute paths start from: the document's backing object.
#
# @return [Object]
def root_for_context
  document = @document
  document.backing
end
|
|
1010
|
+
|
|
1011
|
+
# Finds the parent of +n+.
#
# Native nodes ask their arena for the parent id; when there is none the
# document itself is returned. Document-level objects have no parent.
# Anything else is asked via #parent when it supports that.
#
# @param n [Object, nil]
# @return [Object, nil]
def parent_of(n)
  return nil if n.nil?

  if native_node?(n)
    parent_id = n.doc.node_parent(n.id)
    return parent_id ? wrap_native(parent_id) : @document
  end
  return nil if n.is_a?(Scrapetor::Native::DocumentWrapper) || n.is_a?(Scrapetor::Document)

  n.parent if n.is_a?(Scrapetor::Node) || n.respond_to?(:parent)
end
|
|
1024
|
+
|
|
1025
|
+
# Self axis: appends +n+ to +out+ when it passes node test +nt+.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_self(n, nt, out)
  push_if_matches(n, nt, out)
end
|
|
1028
|
+
|
|
1029
|
+
# Child axis: appends every child of +n+ that passes node test +nt+.
#
# Arena-backed nodes list child ids through the native document and wrap
# each by its node type; children that cannot be wrapped are skipped.
# Otherwise the children come from the Document's backing object or the
# Node's backing node and are wrapped with wrap_dom.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_children(n, nt, out)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    arena.node_children(node_id).each do |child_id|
      child = wrap_native_typed_with(arena, child_id, arena.node_type(child_id), wrapper: wrapper)
      push_if_matches(child, nt, out) if child
    end
    return
  end

  kids =
    if n.is_a?(Scrapetor::Document)
      n.backing.respond_to?(:children) ? n.backing.children : nil
    elsif n.is_a?(Scrapetor::Node)
      n.backing_node.children
    end
  kids&.each { |kid| push_if_matches(wrap_dom(kid), nt, out) }
end
|
|
1045
|
+
|
|
1046
|
+
# Resolves +n+ to its arena coordinates.
#
# Three carriers are recognised: a Scrapetor::Node (its backing node is
# inspected), the Native::DocumentWrapper itself (root context, id 0), and
# a raw native element. A carrier counts as arena-backed when it exposes
# #id and #doc and that doc responds to #node_following_sibling_ids.
#
# @param n [Object]
# @return [Array, nil] [native_doc, node_id, wrapper], or nil when +n+
#   does not live in the arena
def arena_handle_for(n)
  carrier =
    if n.is_a?(Scrapetor::Node)
      n.backing_node
    elsif n.is_a?(Scrapetor::Native::DocumentWrapper)
      return [n.native, 0, n]
    else
      n
    end
  in_arena = carrier.respond_to?(:id) && carrier.respond_to?(:doc) &&
             carrier.doc.respond_to?(:node_following_sibling_ids)
  return nil unless in_arena

  [carrier.doc, carrier.id, (carrier.respond_to?(:wrapper) ? carrier.wrapper : nil)]
end
|
|
1065
|
+
|
|
1066
|
+
# Descendant axis: appends every descendant of +n+ that passes +nt+.
#
# Arena-backed nodes use the native document: comment tests go through the
# dedicated comment-id primitive, everything else through
# collect_descendant_ids. Other nodes are walked depth-first in pre-order
# with an explicit stack.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_descendants(n, nt, out)
  arena, root_id, wrapper = arena_handle_for(n)
  if arena
    if nt == :comment
      arena.node_descendant_comment_ids(root_id).each do |comment_id|
        out << wrap_native_typed_with(arena, comment_id, 8, wrapper: wrapper)
      end
    else
      collect_descendant_ids(arena, root_id, nt, out, wrapper)
    end
    return
  end
  return unless n.respond_to?(:children)

  # Children are pushed reversed so that pop yields them in document order.
  pending = n.children.to_a.reverse
  while (node = pending.pop)
    push_if_matches(node.is_a?(Scrapetor::Node) ? node : wrap_dom(node), nt, out)
    pending.concat(node.children.to_a.reverse) if node.respond_to?(:children)
  end
end
|
|
1090
|
+
|
|
1091
|
+
# Appends the arena descendants of +rid+ that pass +nt+, by scanning the
# contiguous id range that follows +rid+.
#
# Ids are pre-order in the unmutated case, so the descendants of +rid+ are
# exactly (rid+1 .. last descendant id]. The original always scanned to
# the end of the arena, which is right only for the document root (id 0);
# for any other context it also returned every node after the subtree.
# The upper bound is now the deepest last child of +rid+.
#
# Node types: 1 = element, 3 = text, 8 = comment. A name test of "*"
# matches any element, mirroring matches_node_test?.
#
# @param nd [Object] native document (the arena)
# @param rid [Integer] id of the context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
# @param wrapper [Object, nil] forwarded to wrap_native_typed_with
def collect_descendant_ids(nd, rid, nt, out, wrapper)
  target = nil
  wanted_types =
    case nt
    when :any_element then [1]
    when :node then [1, 3, 8]
    when :text then [3]
    when :comment then [8]
    when Hash
      name = nt[:name]
      return unless name

      target = name.downcase unless name == "*"
      [1]
    else
      return
    end

  ((rid + 1)..last_descendant_id(nd, rid)).each do |id|
    type = nd.node_type(id)
    next unless wanted_types.include?(type)
    next if target && !nd.node_name(id).casecmp(target).zero?

    out << wrap_native_typed_with(nd, id, type, wrapper: wrapper)
  end
end

# Highest id inside the subtree rooted at +rid+ (+rid+ itself for a leaf).
# Assumes node_children lists every child kind in document order —
# TODO confirm against the native extension.
def last_descendant_id(nd, rid)
  return nd.size - 1 if rid.zero?

  last = rid
  loop do
    kids = nd.node_children(last)
    break if kids.nil? || kids.empty?

    last = kids.last
  end
  last
end
|
|
1134
|
+
|
|
1135
|
+
# Ancestor axis source: the ancestors of +n+.
#
# Arena-backed nodes take the ids from the native document and wrap each
# as an element. A plain Scrapetor::Node follows #parent links and returns
# the chain outermost-first. Anything else has no ancestors.
#
# @param n [Object]
# @return [Array]
def ancestors_of(n)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    return arena.node_ancestor_ids(node_id).map do |ancestor_id|
      wrap_native_typed_with(arena, ancestor_id, 1, wrapper: wrapper)
    end
  end
  return [] unless n.is_a?(Scrapetor::Node)

  chain = []
  up = n.parent
  while up
    chain.unshift(up)
    up = up.parent
  end
  chain
end
|
|
1151
|
+
|
|
1152
|
+
# Following-sibling axis source: element siblings after +n+, nearest
# first.
#
# Arena-backed nodes take ids from the native document and wrap each as an
# element. A plain Scrapetor::Node follows #next_sibling links and keeps
# only siblings that report element?.
#
# @param n [Object]
# @return [Array]
def following_siblings_of(n)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    return arena.node_following_sibling_ids(node_id).map do |sibling_id|
      wrap_native_typed_with(arena, sibling_id, 1, wrapper: wrapper)
    end
  end
  return [] unless n.is_a?(Scrapetor::Node)

  siblings = []
  node = n.next_sibling
  while node
    siblings << node if node.respond_to?(:element?) && node.element?
    node = node.respond_to?(:next_sibling) ? node.next_sibling : nil
  end
  siblings
end
|
|
1168
|
+
|
|
1169
|
+
# Preceding-sibling axis source: element siblings before +n+.
#
# Arena-backed nodes take ids from the native document and wrap each as an
# element. A plain Scrapetor::Node follows #previous_sibling links, keeps
# only siblings that report element?, and returns them in document order
# (farthest first).
#
# @param n [Object]
# @return [Array]
def preceding_siblings_of(n)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    return arena.node_preceding_sibling_ids(node_id).map do |sibling_id|
      wrap_native_typed_with(arena, sibling_id, 1, wrapper: wrapper)
    end
  end
  return [] unless n.is_a?(Scrapetor::Node)

  nearest_first = []
  node = n.previous_sibling
  while node
    nearest_first << node if node.respond_to?(:element?) && node.element?
    node = node.respond_to?(:previous_sibling) ? node.previous_sibling : nil
  end
  nearest_first.reverse
end
|
|
1185
|
+
|
|
1186
|
+
# Following axis source: ids reported by the native document for +n+,
# each wrapped as an element. Nodes outside the arena yield an empty set.
#
# @param n [Object]
# @return [Array]
def following_of(n)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    arena.node_following_ids(node_id).map do |following_id|
      wrap_native_typed_with(arena, following_id, 1, wrapper: wrapper)
    end
  else
    []
  end
end
|
|
1191
|
+
|
|
1192
|
+
# Preceding axis source: ids reported by the native document for +n+,
# each wrapped as an element. Nodes outside the arena yield an empty set.
#
# @param n [Object]
# @return [Array]
def preceding_of(n)
  arena, node_id, wrapper = arena_handle_for(n)
  if arena
    arena.node_preceding_ids(node_id).map do |preceding_id|
      wrap_native_typed_with(arena, preceding_id, 1, wrapper: wrapper)
    end
  else
    []
  end
end
|
|
1197
|
+
|
|
1198
|
+
# Attribute axis: appends the values of +n+'s attributes that pass +nt+.
#
# The native and non-native paths now share one matching rule. Previously
# the non-native path emitted every attribute for any non-Hash node test
# (including text() and comment()), while the native path did not. A node
# with no attribute list is skipped instead of raising.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array<String>] accumulator, mutated in place
def collect_attributes(n, nt, out)
  attrs =
    if native_node?(n)
      n.doc.node_attributes(n.id)
    elsif n.is_a?(Scrapetor::Node)
      n.attributes
    end
  return if attrs.nil?

  attrs.each do |name, val|
    out << val if attribute_test_match?(name, nt)
  end
end

# True when an attribute called +name+ passes node test +nt+: wildcard
# tests match every attribute, a name test matches case-insensitively, and
# every other test (text(), comment(), ...) matches none.
def attribute_test_match?(name, nt)
  case nt
  when :any_element, :any_node, :node then true
  when Hash
    wanted = nt[:name]
    wanted.nil? || wanted == "*" || name.casecmp(wanted).zero?
  else
    false
  end
end
|
|
1225
|
+
|
|
1226
|
+
# Appends +n+ to +out+ only when it passes node test +nt+.
#
# @param n [Object] candidate node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def push_if_matches(n, nt, out)
  out << n if matches_node_test?(n, nt)
end
|
|
1230
|
+
|
|
1231
|
+
# Decides whether node +n+ passes node test +nt+.
#
#   :any_element  element nodes only
#   :text         nodes answering text? truthily
#   :comment      nodes answering comment? truthily
#   :node         everything
#   {name: ...}   elements whose name matches case-insensitively; "*"
#                 matches any element
#
# The "*" name test used to return true for anything responding to #name,
# so text-like nodes named "#text" slipped through; it now uses the same
# element check as :any_element. That check also tolerates a nil name.
#
# @param n [Object] candidate node
# @param nt [Symbol, Hash] node test
# @return [Boolean]
def matches_node_test?(n, nt)
  case nt
  when :any_element
    element_node?(n)
  when :text
    n.respond_to?(:text?) && n.text?
  when :comment
    n.respond_to?(:comment?) && n.comment?
  when :node
    true
  when Hash
    target = nt[:name]
    return false if target.nil? || !n.respond_to?(:name)
    return element_node?(n) if target == "*"

    name = n.name
    return false if name.nil? || name.start_with?("#")

    name.casecmp(target).zero?
  else
    false
  end
end

# Element check: trust element? when the node has it, otherwise treat any
# non-nil name not starting with "#" as an element name.
def element_node?(n)
  return n.element? if n.respond_to?(:element?)
  return false unless n.respond_to?(:name)

  name = n.name
  !name.nil? && !name.start_with?("#")
end
|
|
1253
|
+
|
|
1254
|
+
# ---- Native wrapping helpers --------------------------------------
|
|
1255
|
+
|
|
1256
|
+
# Resolves the evaluator's starting context to native arena coordinates.
#
# A Document qualifies when its backing object is a Native::DocumentWrapper
# whose native document responds to #node_following_sibling_ids (root id
# 0). A Node qualifies when its backing node exposes #id and #doc and that
# doc responds to the same method.
#
# @param context [Object]
# @return [Array] [native_doc, node_id], or [nil, nil] when not native
def native_handles_for(context)
  case context
  when Scrapetor::Document
    backing = context.backing
    native_backed = defined?(Scrapetor::Native::DocumentWrapper) &&
                    backing.is_a?(Scrapetor::Native::DocumentWrapper) &&
                    backing.native.respond_to?(:node_following_sibling_ids)
    return [backing.native, 0] if native_backed
  when Scrapetor::Node
    backing = context.backing_node
    native_backed = backing.respond_to?(:id) && backing.respond_to?(:doc) &&
                    backing.doc.respond_to?(:node_following_sibling_ids)
    return [backing.doc, backing.id] if native_backed
  end
  [nil, nil]
end
|
|
1271
|
+
|
|
1272
|
+
# True when +n+ exposes #id and #doc and that doc responds to
# #node_following_sibling_ids.
#
# @param n [Object, nil]
# @return [Boolean]
def native_node?(n)
  return false unless n
  return false unless n.respond_to?(:id) && n.respond_to?(:doc)

  n.doc.respond_to?(:node_following_sibling_ids)
end
|
|
1276
|
+
|
|
1277
|
+
# Finds the Native::DocumentWrapper associated with +n+: +n+ itself when
# it is one, otherwise whatever n.wrapper returns when +n+ has that
# method.
#
# @param n [Object]
# @return [Object, nil]
def native_wrapper_for(n)
  if n.is_a?(Scrapetor::Native::DocumentWrapper)
    n
  elsif n.respond_to?(:wrapper)
    n.wrapper
  end
end
|
|
1282
|
+
|
|
1283
|
+
# Wraps arena node +id+ from the evaluator's own native document, looking
# up its node type first.
#
# @param id [Integer, nil]
# @return [Object, nil] nil when there is no native document or no id
def wrap_native(id)
  return nil if id.nil? || @native_doc.nil?

  node_type = @native_doc.node_type(id)
  wrap_native_typed(id, node_type)
end
|
|
1287
|
+
|
|
1288
|
+
# Wraps arena node +id+ of known +type+ from the evaluator's own native
# document, passing along the initial context's wrapper when it has one.
#
# @param id [Integer]
# @param type [Integer] node type code
# @return [Object, nil]
def wrap_native_typed(id, type)
  owner = @initial.respond_to?(:wrapper) ? @initial.wrapper : nil
  wrap_native_typed_with(@native_doc, id, type, wrapper: owner)
end
|
|
1291
|
+
|
|
1292
|
+
# Builds the Ruby-side object for arena node +id+ according to +type+:
#   1 -> Scrapetor::Node around a Native::Element
#   8 -> Scrapetor::CommentNode carrying the comment text
#   3 -> Scrapetor::TextNode carrying the node text (a String subclass
#        answering text?, name, etc., so predicates over text-node sets
#        behave like Nokogiri's)
# Any other type (document, unknown) yields nil.
#
# @param nd [Object] native document
# @param id [Integer] arena node id
# @param type [Integer] node type code
# @param wrapper [Object, nil] passed to Native::Element
# @return [Object, nil]
def wrap_native_typed_with(nd, id, type, wrapper: nil)
  if type == 1
    element = Scrapetor::Native::Element.new(nd, id, wrapper)
    Scrapetor::Node.new(@document, element)
  elsif type == 8
    Scrapetor::CommentNode.new(@document, nd.node_comment_text(id))
  elsif type == 3
    Scrapetor::TextNode.new(nd.node_text(id))
  end
end
|
|
1308
|
+
|
|
1309
|
+
# Converts a non-native DOM node into the object the evaluator works with.
# Already-wrapped nodes pass through; comments become CommentNode, text
# nodes become their string content, elements become Scrapetor::Node, and
# anything else is returned untouched.
#
# @param node [Object]
# @return [Object]
def wrap_dom(node)
  return node if node.is_a?(Scrapetor::Node) || node.is_a?(Scrapetor::CommentNode)

  content = -> { node.respond_to?(:content) ? node.content : node.to_s }
  return Scrapetor::CommentNode.new(@document, content.call) if node.respond_to?(:comment?) && node.comment?
  return content.call if node.respond_to?(:text?) && node.text?
  return Scrapetor::Node.new(@document, node) if node.respond_to?(:element?) && node.element?

  node
end
|
|
1321
|
+
|
|
1322
|
+
# Key used to recognise the same node twice when deduplicating.
#
# A Scrapetor::Node whose backing node has an #id is keyed by
# [:nat, owning doc's object_id (nil without #doc), id]; a Node without
# such a backing uses the backing's object_id; everything else uses its
# own object_id.
#
# @param n [Object]
# @return [Array, Integer]
def node_identity(n)
  return n.object_id unless n.is_a?(Scrapetor::Node)

  backing = n.backing_node
  return backing.object_id unless backing.respond_to?(:id)

  owner = backing.respond_to?(:doc) ? backing.doc.object_id : nil
  [:nat, owner, backing.id]
end
|
|
1330
|
+
|
|
1331
|
+
# ---- XPath type coercions ----------------------------------------
|
|
1332
|
+
|
|
1333
|
+
# XPath boolean() coercion: nil is false, booleans are themselves, numbers
# are false when zero or NaN, strings and node-sets are false when empty,
# and any other object is true.
#
# @param v [Object]
# @return [Boolean]
def xpath_boolean(v)
  case v
  when nil, false then false
  when true then true
  when Numeric
    !v.zero? && !(v.respond_to?(:nan?) && v.nan?)
  when String, Array
    !v.empty?
  else
    true
  end
end
|
|
1343
|
+
|
|
1344
|
+
# XPath string() coercion.
#
# nil becomes "", strings pass through, booleans become "true"/"false".
# Floats print as "NaN", "Infinity"/"-Infinity", or without a fractional
# part when integral. A node-set takes the string value of its first
# member, and any other object is treated as a single node.
#
# @param v [Object]
# @return [String]
def xpath_string(v)
  case v
  when nil then ""
  when String then v
  when true, false then v ? "true" : "false"
  when Float
    return "NaN" if v.nan?
    return(v.positive? ? "Infinity" : "-Infinity") if v.infinite?

    whole = v.to_i
    v == whole ? whole.to_s : v.to_s
  when Numeric then v.to_s
  else
    xpath_string_for_node(v.is_a?(Array) ? v.first : v)
  end
end
|
|
1364
|
+
|
|
1365
|
+
# String value of a single node: "" for nil, the string itself for string
# nodes, otherwise the node's #text (stringified), falling back to #to_s.
#
# @param n [Object, nil]
# @return [String]
def xpath_string_for_node(n)
  case n
  when nil then ""
  when String then n
  else
    if n.respond_to?(:text)
      n.text.to_s
    else
      n.respond_to?(:to_s) ? n.to_s : ""
    end
  end
end
|
|
1376
|
+
|
|
1377
|
+
# XPath number() coercion.
#
# Numbers pass through, booleans become 1/0 and nil becomes NaN. A string
# is trimmed and must look like an optionally negative decimal; it then
# becomes a Float when it contains a "." and an Integer otherwise. Every
# other string is NaN. Node-sets and other objects are converted through
# their string value.
#
# @param v [Object]
# @return [Numeric]
def xpath_number(v)
  case v
  when Numeric then v
  when nil then Float::NAN
  when true then 1
  when false then 0
  when String
    text = v.strip
    numeric = !text.empty? &&
              (text.match?(/\A-?\d+\.?\d*\z/) || text.match?(/\A-?\.\d+\z/))
    return Float::NAN unless numeric

    text.include?(".") ? text.to_f : text.to_i
  else
    xpath_number(xpath_string(v))
  end
end
|
|
1397
|
+
|
|
1398
|
+
# ---- Comparison rules (XPath 1.0 §3.4) --------------------------
|
|
1399
|
+
|
|
1400
|
+
# Compares two XPath values under the rules of XPath 1.0 §3.4.
#
# * Node-set vs boolean: the node-set is first converted with boolean().
#   The original stringified both sides, so `[] = false()` was false.
# * Any remaining node-set operand: delegated to compare_node_sets, which
#   is true when some member satisfies the comparison.
# * `=` / `!=` on scalars: both become booleans if either is a boolean,
#   else numbers if either is a number, else strings.
# * `<`, `<=`, `>`, `>=`: both become numbers; false when either is NaN.
#
# @param op [Symbol] :eq, :neq, :lt, :le, :gt or :ge
# @param l [Object] left operand
# @param r [Object] right operand
# @return [Boolean, nil] nil only for an unrecognised relational operator
def do_compare(op, l, r)
  boolean = ->(x) { x.equal?(true) || x.equal?(false) }

  if l.is_a?(Array) && boolean.call(r)
    l = xpath_boolean(l)
  elsif r.is_a?(Array) && boolean.call(l)
    r = xpath_boolean(r)
  end

  if l.is_a?(Array) || r.is_a?(Array)
    return compare_node_sets(op, l.is_a?(Array) ? l : [l], r.is_a?(Array) ? r : [r])
  end

  if op == :eq || op == :neq
    same =
      if boolean.call(l) || boolean.call(r)
        xpath_boolean(l) == xpath_boolean(r)
      elsif l.is_a?(Numeric) || r.is_a?(Numeric)
        xpath_number(l) == xpath_number(r)
      else
        xpath_string(l) == xpath_string(r)
      end
    return op == :eq ? same : !same
  end

  ln = xpath_number(l)
  rn = xpath_number(r)
  return false if (ln.is_a?(Float) && ln.nan?) || (rn.is_a?(Float) && rn.nan?)

  case op
  when :lt then ln < rn
  when :le then ln <= rn
  when :gt then ln > rn
  when :ge then ln >= rn
  end
end
|
|
1434
|
+
|
|
1435
|
+
# Existential comparison between two operand lists, at least one of which
# came from a node-set (a scalar operand arrives as a one-element list).
#
# The result is true when some pair (x from +a+, y from +b+) satisfies
# +op+. For `=` / `!=` a pair is compared as numbers when either member is
# a number (XPath 1.0 §3.4: node-set vs number uses number(string-value)),
# otherwise as strings. The original always compared strings, so a node
# holding "10.0" never equalled the number 10. Relational operators
# compare numbers and skip NaN operands.
#
# @param op [Symbol] :eq, :neq, :lt, :le, :gt or :ge
# @param a [Array] left operand list
# @param b [Array] right operand list
# @return [Boolean]
def compare_node_sets(op, a, b)
  if op == :eq || op == :neq
    want_equal = op == :eq
    return a.any? do |x|
      x_string = nil # string value of x, computed once on first use
      b.any? do |y|
        same =
          if x.is_a?(Numeric) || y.is_a?(Numeric)
            xpath_number(x) == xpath_number(y)
          else
            x_string ||= xpath_string(x)
            x_string == xpath_string(y)
          end
        same == want_equal
      end
    end
  end

  a.any? do |x|
    nx = xpath_number(x)
    next false if nx.is_a?(Float) && nx.nan?

    b.any? do |y|
      ny = xpath_number(y)
      next false if ny.is_a?(Float) && ny.nan?

      case op
      when :lt then nx < ny
      when :le then nx <= ny
      when :gt then nx > ny
      when :ge then nx >= ny
      end
    end
  end
end
|
|
1470
|
+
|
|
1471
|
+
# ---- Functions ----------------------------------------------------
|
|
1472
|
+
|
|
1473
|
+
# Evaluates an XPath 1.0 core function call.
#
# Fixes relative to the original implementation:
# * substring: positions are compared numerically per XPath 1.0 §4.2, so
#   a negative end no longer becomes a from-the-end slice index, and NaN /
#   infinite arguments no longer raise.
# * round / floor / ceiling: NaN and infinities are returned unchanged
#   instead of raising FloatDomainError; round uses floor(x + 0.5), so
#   round(-2.5) is -2.
# * translate: when a character occurs twice in the "from" string, the
#   first occurrence decides.
# * lang: the nearest ancestor-or-self carrying a language attribute
#   decides the answer.
# * local-name / name: a nil node name yields "".
#
# @param name [String] function name
# @param args [Array<Hash>] argument ASTs, not yet evaluated
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil] {position:, last:} inside a predicate
# @return [Object] node-set (Array), String, Numeric or Boolean
# @raise [UnsupportedError] for an unknown function name
def call_function(name, args, context_set, position_info)
  arg = ->(i) { eval_expr(args[i], context_set, position_info) }
  str = ->(i) { xpath_string(arg.call(i)) }
  num = ->(i) { xpath_number(arg.call(i)) }

  case name
  # node-set
  when "last"
    (position_info && position_info[:last]) || context_set.length
  when "position"
    (position_info && position_info[:position]) || 1
  when "count"
    counted = arg.call(0)
    counted.is_a?(Array) ? counted.length : 0
  when "id"
    xpath_id_lookup(arg.call(0))
  when "local-name"
    node = arg_first_node(args, context_set, position_info)
    node && node.respond_to?(:name) ? node.name.to_s.split(":").last.to_s : ""
  when "name"
    node = arg_first_node(args, context_set, position_info)
    node && node.respond_to?(:name) ? node.name.to_s : ""
  when "namespace-uri"
    "" # we don't model namespaces in HTML
  # string
  when "string"
    xpath_string(args.empty? ? context_set.first : arg.call(0))
  when "concat"
    args.map { |a| xpath_string(eval_expr(a, context_set, position_info)) }.join
  when "starts-with"
    str.call(0).start_with?(str.call(1))
  when "contains"
    str.call(0).include?(str.call(1))
  when "substring-before"
    haystack = str.call(0)
    needle = str.call(1)
    idx = haystack.index(needle)
    idx ? haystack[0...idx] : ""
  when "substring-after"
    haystack = str.call(0)
    needle = str.call(1)
    idx = haystack.index(needle)
    idx ? haystack[(idx + needle.length)..] || "" : ""
  when "substring"
    source = str.call(0)
    start = num.call(1)
    length = args.size > 2 ? num.call(2) : nil
    xpath_substring(source, start, length)
  when "string-length"
    (args.empty? ? xpath_string(context_set.first) : str.call(0)).length
  when "normalize-space"
    (args.empty? ? xpath_string(context_set.first) : str.call(0)).strip.gsub(/\s+/, " ")
  when "translate"
    source = str.call(0)
    from = str.call(1)
    to = str.call(2)
    xpath_translate(source, from, to)
  # boolean
  when "boolean" then xpath_boolean(arg.call(0))
  when "not" then !xpath_boolean(arg.call(0))
  when "true" then true
  when "false" then false
  when "lang"
    xpath_lang?(str.call(0).downcase, context_set.first)
  # number
  when "number"
    xpath_number(args.empty? ? context_set.first : arg.call(0))
  when "sum"
    addends = arg.call(0)
    addends = [addends] unless addends.is_a?(Array)
    addends.inject(0.0) { |acc, x| acc + xpath_number(x).to_f }
  when "floor"
    xpath_integral(num.call(0), :floor)
  when "ceiling"
    xpath_integral(num.call(0), :ceil)
  when "round"
    xpath_round(num.call(0))
  else
    raise UnsupportedError, "unknown XPath function `#{name}()`"
  end
end

# id(): looks up each whitespace-separated id via a `#id` CSS query on the
# document. Lookups that raise are treated as misses (best effort).
def xpath_id_lookup(value)
  strings = value.is_a?(Array) ? value.map { |x| xpath_string(x) } : [xpath_string(value)]
  strings.flat_map(&:split).each_with_object([]) do |id, hits|
    hit =
      begin
        @document.at_css("##{id}")
      rescue StandardError
        nil
      end
    hits << hit if hit
  end
end

# lang(): walks from +node+ upward; the first node carrying `xml:lang` or
# `lang` decides. +target+ must already be lower-case.
def xpath_lang?(target, node)
  node = node.first if node.is_a?(Array)
  while node
    lang =
      if node.respond_to?(:[])
        begin
          node["xml:lang"] || node["lang"]
        rescue StandardError
          nil
        end
      end
    if lang
      code = lang.downcase
      return code == target || code.start_with?("#{target}-")
    end
    node = parent_of(node)
  end
  false
end

# substring(): keeps the characters whose 1-based position p satisfies
# round(start) <= p < round(start) + round(length); without +length+ the
# range is open-ended.
def xpath_substring(source, start, length = nil)
  first = xpath_round(start)
  last = length.nil? ? Float::INFINITY : first + xpath_round(length)
  return "" if [first, last].any? { |x| x.respond_to?(:nan?) && x.nan? }

  lo = [first, 1].max
  hi = [last, source.length + 1].min
  return "" unless lo < hi

  source[(lo.to_i - 1)...(hi.to_i - 1)] || ""
end

# translate(): maps each character found in +from+ to the character at
# the same index in +to+, deleting it when +to+ is shorter.
def xpath_translate(source, from, to)
  table = {}
  from.each_char.with_index do |ch, i|
    table[ch] = to[i] unless table.key?(ch)
  end
  source.each_char.map { |ch| table.key?(ch) ? table[ch] : ch }.compact.join
end

# round(): floor(x + 0.5); integers, NaN and infinities pass through.
def xpath_round(value)
  return value if value.is_a?(Integer)
  return value if value.respond_to?(:finite?) && !value.finite?

  (value + 0.5).floor
end

# floor() / ceiling(): NaN and infinities pass through unchanged.
def xpath_integral(value, rounding)
  return value if value.respond_to?(:finite?) && !value.finite?

  value.public_send(rounding)
end
|
|
1596
|
+
|
|
1597
|
+
# Resolves the node a node-oriented function operates on: the first node
# of its first argument, or of the context set when no argument is given.
# A non-array argument value is returned as-is.
#
# @param args [Array<Hash>] argument ASTs
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil]
# @return [Object, nil]
def arg_first_node(args, context_set, position_info)
  subject =
    if args.empty?
      context_set
    else
      eval_expr(args[0], context_set, position_info)
    end
  return subject.first if subject.is_a?(Array)

  subject
end
|
|
1601
|
+
end
|
|
1602
|
+
end
|
|
1603
|
+
end
|