scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,1603 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Full XPath 1.0 expression engine.
5
+ #
6
+ # Pipeline:
7
+ # 1. Tokenizer -> array of [:type, value] tokens
8
+ # 2. Parser -> AST (recursive-descent, full XPath 1.0 grammar)
9
+ # 3. Evaluator -> walks the AST against a Scrapetor::Document/Node
10
+ #
11
+ # Axis traversals dispatch to native C primitives on the arena DOM
12
+ # (`node_following_sibling_ids`, `node_ancestor_ids`, `node_following_ids`,
13
+ # `node_preceding_ids`, `node_descendant_comment_ids`, …) so the hot
14
+ # path stays in C even though the AST walk runs in Ruby.
15
+ #
16
+ # Compiled ASTs are cached on the module (capped at AST_CACHE_CAP entries; the oldest-inserted entry is evicted first) so repeated
17
+ # queries — typical in scraping pipelines that run the same parser
18
+ # against thousands of pages — only pay the tokenize/parse cost once.
19
+ module XPath
20
+ class UnsupportedError < StandardError; end
21
+ class ParseError < StandardError; end
22
+
23
+ AST_CACHE_CAP = 1024
24
+ @ast_cache = {}
25
+ @ast_cache_mutex = Mutex.new
26
+
27
# Evaluate the XPath expression +expr+ against +context+.
#
# The expression is stringified and looked up in the module-level cache;
# a miss compiles it through cache_compile. A cache entry carries both the
# AST and, when CssTranslator produced one, a CSS descriptor. If the
# descriptor is present the query is answered through run_via_css,
# otherwise the AST is walked by a fresh Evaluator.
def self.evaluate(context, expr)
  source = expr.to_s
  compiled = @ast_cache[source] || cache_compile(source)
  css_descriptor = compiled[:css]
  return run_via_css(context, css_descriptor) if css_descriptor

  Evaluator.new(context).eval_program(compiled[:ast])
end
39
+
40
# Compile +expr+ (stringified) through the shared cache and return only
# the AST part of the cached entry.
def self.compile(expr)
  entry = cache_compile(expr.to_s)
  entry[:ast]
end
43
+
44
# Return the cached `{ ast:, css: }` entry for +expr+, compiling and
# storing it on a miss.
#
# The first lookup happens without the mutex; on a miss the lookup is
# repeated while holding @ast_cache_mutex so that an entry stored by
# another thread in the meantime is reused. When the cache already holds
# AST_CACHE_CAP entries, Hash#shift drops the oldest-inserted one before
# the new entry is added.
def self.cache_compile(expr)
  hit = @ast_cache[expr]
  return hit if hit

  @ast_cache_mutex.synchronize do
    @ast_cache[expr] || begin
      tokens = Tokenizer.tokenize(expr)
      ast = Parser.new(tokens, expr).parse_expr
      compiled = { ast: ast, css: CssTranslator.translate(ast) }
      @ast_cache.shift if @ast_cache.size >= AST_CACHE_CAP
      @ast_cache[expr] = compiled
    end
  end
end
58
+
59
# Execute a translated CSS descriptor against +context+.
#
# Only the descriptor's :sel string is used: it is passed verbatim to
# `context.css`. Any `::attr(...)` tail that CssTranslator appended is part
# of that string, so this method does no attribute/text post-processing of
# its own and the descriptor's :kind is not consulted here.
#
# Returns an Array regardless of what `context.css` returned: objects that
# respond to `to_a` are converted with it, anything else goes through
# Kernel#Array.
def self.run_via_css(context, css_descriptor)
  result = context.css(css_descriptor[:sel])
  result.respond_to?(:to_a) ? result.to_a : Array(result)
end
70
+
71
+ # ============================================================
72
+ # Tokenizer
73
+ # ============================================================
74
+
75
# Splits an XPath source string into `[type, value]` token pairs, always
# terminated by `[:eof, nil]`.
module Tokenizer
  OPERATORS = %w[// / .. . :: @ ( ) [ ] , | + - = != <= >= < > * div mod and or].freeze

  # Characters that always map to exactly one token type. `-` is in this
  # table because a `-` that is not inside a name is always the minus
  # token; hyphens inside names are consumed by the name scanner below.
  SINGLE_CHAR_TOKENS = {
    "(" => :lparen, ")" => :rparen, "[" => :lbracket, "]" => :rbracket,
    "," => :comma, "@" => :at, "|" => :pipe, "+" => :plus, "-" => :minus,
    "=" => :eq
  }.freeze

  # Token types that end an operand. Following the XPath 1.0 lexical
  # rules, a `*` or a name spelled and/or/div/mod is an operator only when
  # the preceding token is one of these (i.e. it exists and is not `@`,
  # `::`, `(`, `[`, `,` or another operator). `:star` (the `*` name test)
  # ends an operand; `:star_mul` (the multiply operator) does not.
  OPERAND_END_TOKENS = %i[name number string rparen rbracket dot dot_dot star].freeze

  # Name-shaped operators and the token type each one becomes.
  OPERATOR_NAMES = {
    "and" => :and_op, "or" => :or_op, "div" => :div_op, "mod" => :mod_op
  }.freeze

  # Tokenize +s+.
  #
  # Numbers without a fractional part are emitted as Integer, others as
  # Float. String tokens carry the text between the quotes (no escape
  # processing). Names match /[A-Za-z_][A-Za-z0-9_-]*/.
  #
  # Raises ParseError on a stray `!`, an unterminated string literal, or a
  # character that cannot start any token.
  def self.tokenize(s)
    tokens = []
    i = 0
    len = s.length
    while i < len
      c = s[i]

      if (type = SINGLE_CHAR_TOKENS[c])
        tokens << [type, c]
        i += 1
        next
      end

      case c
      when " ", "\t", "\n", "\r"
        i += 1
      when "/"
        if s[i + 1] == "/"
          tokens << [:slash_slash, "//"]
          i += 2
        else
          tokens << [:slash, "/"]
          i += 1
        end
      when "!"
        raise ParseError, "stray `!` in `#{s}`" unless s[i + 1] == "="

        tokens << [:neq, "!="]
        i += 2
      when "<"
        if s[i + 1] == "="
          tokens << [:le, "<="]
          i += 2
        else
          tokens << [:lt, "<"]
          i += 1
        end
      when ">"
        if s[i + 1] == "="
          tokens << [:ge, ">="]
          i += 2
        else
          tokens << [:gt, ">"]
          i += 1
        end
      when ":"
        if s[i + 1] == ":"
          tokens << [:axis_sep, "::"]
          i += 2
        else
          tokens << [:colon, ":"]
          i += 1
        end
      when "*"
        # Multiply after an operand, otherwise the "any element" name test.
        tokens << [operand_precedes?(tokens) ? :star_mul : :star, "*"]
        i += 1
      when "."
        if s[i + 1] == "."
          tokens << [:dot_dot, ".."]
          i += 2
        elsif digit?(s[i + 1])
          # Numeric literal with no integer part, e.g. `.5`.
          j = skip_digits(s, i + 1)
          tokens << [:number, s[i...j].to_f]
          i = j
        else
          tokens << [:dot, "."]
          i += 1
        end
      when "'", '"'
        closing = s.index(c, i + 1)
        raise ParseError, "unterminated string in `#{s}`" if closing.nil?

        tokens << [:string, s[(i + 1)...closing]]
        i = closing + 1
      when "0".."9"
        j = skip_digits(s, i)
        # A `.` belongs to the number when it ends the input or is
        # followed by a digit; otherwise it is left for the next token.
        if s[j] == "." && (j + 1 >= len || digit?(s[j + 1]))
          j = skip_digits(s, j + 1)
          tokens << [:number, s[i...j].to_f]
        else
          tokens << [:number, s[i...j].to_i]
        end
        i = j
      else
        raise ParseError, "unrecognised char `#{c}` at #{i} in `#{s}`" unless c.match?(/[A-Za-z_]/)

        j = i
        j += 1 while j < len && s[j].match?(/[A-Za-z0-9_\-]/)
        name = s[i...j]
        # and/or/div/mod are operators only in operator position; anywhere
        # else they are ordinary element/function names.
        op_type = operand_precedes?(tokens) ? OPERATOR_NAMES[name] : nil
        tokens << (op_type ? [op_type, name] : [:name, name])
        i = j
      end
    end
    tokens << [:eof, nil]
    tokens
  end

  # True when +ch+ is a single ASCII digit (false for nil).
  def self.digit?(ch)
    !ch.nil? && ch >= "0" && ch <= "9"
  end

  # Index of the first non-digit character of +s+ at or after +from+.
  def self.skip_digits(s, from)
    j = from
    j += 1 while digit?(s[j])
    j
  end

  # True when the most recent token ends an operand.
  def self.operand_precedes?(tokens)
    prev = tokens.last
    !prev.nil? && OPERAND_END_TOKENS.include?(prev[0])
  end

  private_class_method :digit?, :skip_digits, :operand_precedes?
end
219
+
220
+ # ============================================================
221
+ # Parser → AST
222
+ #
223
+ # AST node shapes (all Hashes):
224
+ # { t: :path, steps: [step,...], absolute: bool, double_slash: bool }
225
+ # step: { axis: :child|:descendant_or_self|..., nt: nodetest, preds: [Expr,...] }
226
+ # nodetest: :any_element | :text | :comment | :node | { pi: target-or-nil } | { name: 'tag' } (attribute tests reuse { name: } with axis: :attribute)
227
+ # { t: :or, l:, r: }
228
+ # { t: :and, l:, r: }
229
+ # { t: :cmp, op: :eq|:neq|:lt|:le|:gt|:ge, l:, r: }
230
+ # { t: :add, op: :plus|:minus, l:, r: }
231
+ # { t: :mul, op: :mul|:div|:mod, l:, r: }
232
+ # { t: :neg, e: }
233
+ # { t: :union, ops: [Expr,...] }
234
+ # { t: :filter, primary: Expr, preds: [Expr,...] }
+ # { t: :filter_path, primary: Expr, steps: [step,...] } (a primary/filter expression followed by /path or //path)
235
+ # { t: :func, name: 'count', args: [Expr,...] }
236
+ # { t: :num, v: }
237
+ # { t: :str, v: }
238
+ # ============================================================
239
+
240
# Recursive-descent parser turning Tokenizer output into a Hash-based AST.
#
# Precedence, lowest to highest: or, and, equality, relational, additive,
# multiplicative, unary minus, union, path.
class Parser
  # Axis names accepted before `::`, mapped to the symbols used in steps.
  AXIS_SYMBOLS = {
    "child" => :child,
    "descendant" => :descendant,
    "descendant-or-self" => :descendant_or_self,
    "parent" => :parent,
    "self" => :self,
    "ancestor" => :ancestor,
    "ancestor-or-self" => :ancestor_or_self,
    "following-sibling" => :following_sibling,
    "preceding-sibling" => :preceding_sibling,
    "following" => :following,
    "preceding" => :preceding,
    "attribute" => :attribute,
    "namespace" => :namespace
  }.freeze

  # Names that, when followed by `(`, are node-type tests rather than
  # function calls.
  NODE_TYPE_TESTS = %w[text comment node processing-instruction].freeze

  # tokens - Array of [type, value] pairs (as produced by Tokenizer).
  # raw    - the original expression text, used only in error messages.
  def initialize(tokens, raw)
    @tokens = tokens
    @pos = 0
    @raw = raw
    @depth = 0
  end

  # Token +k+ positions ahead of the cursor, without consuming it.
  def peek(k = 0)
    @tokens[@pos + k]
  end

  # Return the current token and advance the cursor.
  def consume
    token = @tokens[@pos]
    @pos += 1
    token
  end

  # Consume and return the current token if it has the given type;
  # raise ParseError otherwise.
  def expect(type)
    token = @tokens[@pos]
    unless token && token[0] == type
      raise ParseError, "expected #{type} got #{token.inspect} in `#{@raw}`"
    end
    @pos += 1
    token
  end

  # Parse one expression starting at the cursor.
  #
  # This is both the public entry point and the method used recursively
  # for predicates, parenthesised expressions and function arguments. The
  # outermost call additionally requires that the whole token stream was
  # consumed, so trailing input such as `a b` raises ParseError instead of
  # being silently dropped.
  def parse_expr
    @depth = (@depth || 0) + 1
    begin
      ast = parse_or
    ensure
      @depth -= 1
    end
    ensure_fully_consumed if @depth.zero?
    ast
  end

  def parse_or
    left = parse_and
    while peek[0] == :or_op
      consume
      left = { t: :or, l: left, r: parse_and }
    end
    left
  end

  def parse_and
    left = parse_equality
    while peek[0] == :and_op
      consume
      left = { t: :and, l: left, r: parse_equality }
    end
    left
  end

  def parse_equality
    left = parse_relational
    while %i[eq neq].include?(peek[0])
      op = consume[0]
      left = { t: :cmp, op: op, l: left, r: parse_relational }
    end
    left
  end

  def parse_relational
    left = parse_additive
    while %i[lt le gt ge].include?(peek[0])
      op = consume[0]
      left = { t: :cmp, op: op, l: left, r: parse_additive }
    end
    left
  end

  def parse_additive
    left = parse_multiplicative
    while %i[plus minus].include?(peek[0])
      op = consume[0]
      left = { t: :add, op: op, l: left, r: parse_multiplicative }
    end
    left
  end

  def parse_multiplicative
    left = parse_unary
    while (op = { star_mul: :mul, div_op: :div, mod_op: :mod }[peek[0]])
      consume
      left = { t: :mul, op: op, l: left, r: parse_unary }
    end
    left
  end

  def parse_unary
    return parse_union unless peek[0] == :minus

    consume
    { t: :neg, e: parse_unary }
  end

  # A single path is returned as-is; two or more `|`-joined paths become
  # one :union node.
  def parse_union
    first = parse_path
    return first unless peek[0] == :pipe

    ops = [first]
    while peek[0] == :pipe
      consume
      ops << parse_path
    end
    { t: :union, ops: ops }
  end

  # PathExpr := LocationPath | FilterExpr ('/' RelativeLocationPath | '//' RelativeLocationPath)?
  def parse_path
    return parse_location_path if location_path_start?

    primary = parse_primary
    preds = parse_predicates
    base = preds.empty? ? primary : { t: :filter, primary: primary, preds: preds }
    return base unless %i[slash slash_slash].include?(peek[0])

    steps = []
    consume_path_separator(steps)
    parse_relative_location_path_into(steps)
    { t: :filter_path, primary: base, steps: steps }
  end

  # Decide whether the cursor starts a location path (as opposed to a
  # primary expression such as a literal or function call).
  def location_path_start?
    type = peek[0]
    return true if %i[slash slash_slash dot dot_dot at star].include?(type)
    return false unless type == :name

    following = peek(1)[0]
    # `name::` is an axis specifier.
    return true if following == :axis_sep
    # `name(` is a node-type test for text/comment/node/
    # processing-instruction and a function call for anything else.
    return NODE_TYPE_TESTS.include?(peek[1]) if following == :lparen

    # A bare name is a child step.
    true
  end

  def parse_location_path
    steps = []
    absolute = false
    double = false
    if peek[0] == :slash_slash
      absolute = true
      double = true
      steps << { axis: :descendant_or_self, nt: :node, preds: [] }
      consume
    elsif peek[0] == :slash
      absolute = true
      consume
      # A lone `/` (nothing step-like after it) selects just the root.
      unless step_start?
        return { t: :path, steps: steps, absolute: absolute, double_slash: false }
      end
    end
    parse_relative_location_path_into(steps)
    { t: :path, steps: steps, absolute: absolute, double_slash: double }
  end

  # Append one or more `/`- or `//`-separated steps to +steps+.
  def parse_relative_location_path_into(steps)
    steps << parse_step
    while %i[slash slash_slash].include?(peek[0])
      consume_path_separator(steps)
      steps << parse_step
    end
  end

  # Consume `/` or `//`; `//` additionally contributes the implicit
  # descendant-or-self::node() step.
  def consume_path_separator(steps)
    separator = consume[0]
    steps << { axis: :descendant_or_self, nt: :node, preds: [] } if separator == :slash_slash
  end

  def step_start?
    %i[name star at dot dot_dot].include?(peek[0])
  end

  # Parse one step: `.`, `..`, or [axis] node-test predicate*.
  # Raises UnsupportedError for an axis name not in AXIS_SYMBOLS.
  def parse_step
    case peek[0]
    when :dot
      consume
      return { axis: :self, nt: :node, preds: [] }
    when :dot_dot
      consume
      return { axis: :parent, nt: :node, preds: [] }
    end

    axis = :child
    if peek[0] == :at
      consume
      axis = :attribute
    elsif peek[0] == :name && peek(1)[0] == :axis_sep
      axis_name = consume[1]
      axis = AXIS_SYMBOLS[axis_name]
      raise UnsupportedError, "unknown axis `#{axis_name}` in `#{@raw}`" unless axis

      expect(:axis_sep)
    end

    node_test = parse_node_test
    { axis: axis, nt: node_test, preds: parse_predicates }
  end

  # Parse `*`, a node-type test, or a (possibly prefixed) name.
  def parse_node_test
    if peek[0] == :star
      consume
      return :any_element
    end

    if peek[0] == :name && peek(1)[0] == :lparen
      type_name = consume[1]
      expect(:lparen)
      case type_name
      when "node", "text", "comment"
        expect(:rparen)
        return type_name.to_sym
      when "processing-instruction"
        target = peek[0] == :string ? consume[1] : nil
        expect(:rparen)
        return { pi: target }
      else
        raise ParseError, "unexpected `#{type_name}(` as node test in `#{@raw}`"
      end
    end

    if peek[0] == :name
      # QName support: for `prefix:local` the prefix is discarded.
      name = consume[1]
      if peek[0] == :colon && peek(1)[0] == :name
        consume
        name = consume[1]
      end
      return { name: name }
    end

    raise ParseError, "expected node test, got #{peek.inspect} in `#{@raw}`"
  end

  # Parse a literal, a parenthesised expression, or a function call.
  def parse_primary
    case peek[0]
    when :string
      { t: :str, v: consume[1] }
    when :number
      { t: :num, v: consume[1] }
    when :lparen
      consume
      inner = parse_expr
      expect(:rparen)
      inner
    when :name
      fname = consume[1]
      expect(:lparen)
      args = []
      unless peek[0] == :rparen
        args << parse_expr
        while peek[0] == :comma
          consume
          args << parse_expr
        end
      end
      expect(:rparen)
      { t: :func, name: fname, args: args }
    else
      raise ParseError, "unexpected `#{peek.inspect}` in `#{@raw}`"
    end
  end

  private

  # Parse zero or more `[ Expr ]` predicates at the cursor.
  def parse_predicates
    preds = []
    while peek[0] == :lbracket
      consume
      preds << parse_expr
      expect(:rbracket)
    end
    preds
  end

  # Raise ParseError unless the cursor sits at :eof (or past the end of a
  # token list that carries no :eof marker).
  def ensure_fully_consumed
    leftover = peek
    return if leftover.nil? || leftover[0] == :eof

    raise ParseError, "unexpected trailing #{leftover.inspect} in `#{@raw}`"
  end
end
550
+
551
+ # ============================================================
552
+ # CSS Translator: convert simple XPath AST shapes to CSS selectors
553
+ # so the heavily-optimised native CSS matcher answers them in one
554
+ # C call. Returns nil if the AST contains anything that doesn't
555
+ # round-trip cleanly to CSS (`or` predicates, last(), text() tests,
555
+ # ancestor/preceding axes, etc.) — caller then falls back
557
+ # to the full evaluator.
558
+ # ============================================================
559
+
560
# Translates a restricted subset of XPath ASTs into CSS selector strings.
# Every method returns nil as soon as it meets a construct whose CSS
# rendering would not select exactly what the XPath selects, so callers
# can fall back to the full evaluator.
module CssTranslator
  # Translate a :path AST.
  #
  # Returns `{ sel: String, kind: :nodes }`, or `{ sel: "...::attr(name)",
  # kind: :attr }` when the path ends in a plain `@name` step, or nil when
  # the path cannot be translated faithfully.
  #
  # Accepted shape: the first name/`*` step must be reached through a
  # `descendant-or-self::node()` step (i.e. `//`), because a CSS selector
  # matches at any depth; `/div` or a relative `div/p` would otherwise
  # widen to every matching descendant. Later steps may use the child
  # axis (`/` becomes ` > `, `//` becomes a space) or `following-sibling`
  # (` ~ `).
  #
  # NOTE(review): the descriptor does not record whether the path was
  # absolute, so whether `//x` on a non-document context is searched from
  # the document root is decided by whoever runs the selector — confirm
  # against the caller.
  def self.translate(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :path

    steps = ast[:steps]
    return nil if steps.empty?

    css_parts = []
    # True right after a `descendant-or-self::node()` combiner step.
    after_descendant = false

    steps.each_with_index do |st, idx|
      axis = st[:axis]
      nt = st[:nt]
      preds = st[:preds]

      # `descendant-or-self::node()` (what `//` expands to) only changes
      # the combinator used for the next step.
      if axis == :descendant_or_self && nt == :node && preds.empty?
        after_descendant = true
        next
      end

      # Tail extraction `.../@name`: only as the final step, directly on
      # the preceding element step, and without predicates of its own.
      if axis == :attribute && nt.is_a?(Hash) && nt[:name] && idx == steps.length - 1
        return nil if css_parts.empty? || after_descendant || !preds.empty?

        return { sel: "#{css_parts.join}::attr(#{nt[:name]})", kind: :attr }
      end

      # text() is never routed through CSS: XPath yields one node per text
      # segment, which a concatenating `::text` cannot reproduce.

      # following-sibling::name ≡ CSS `~ name`. Positional predicates are
      # rejected here: they count following siblings, which CSS
      # :nth-of-type (counting all same-tag children) cannot express.
      if axis == :following_sibling
        tag = sibling_axis_tag(nt)
        return nil unless tag
        return nil if css_parts.empty? || after_descendant

        pred_strs = collect_pred_strs(preds, allow_positional: false)
        return nil if pred_strs.nil?

        css_parts << " ~ " << tag
        css_parts.concat(pred_strs)
        next
      end

      return nil unless axis == :child

      tag =
        case nt
        when :any_element then "*"
        when Hash then nt[:name]
        end
      return nil unless tag

      # `*[N]` means "Nth element child", which :nth-of-type (per tag
      # name) does not express, so positional predicates need a name test.
      pred_strs = collect_pred_strs(preds, allow_positional: tag != "*")
      return nil if pred_strs.nil?

      if css_parts.empty?
        return nil unless after_descendant

        css_parts << tag
      elsif after_descendant
        css_parts << " " << tag
      else
        css_parts << " > " << tag
      end
      css_parts.concat(pred_strs)
      after_descendant = false
    end

    # A trailing descendant-or-self::node() selects nodes CSS cannot name.
    return nil if after_descendant

    sel = css_parts.join
    return nil if sel.empty?

    { sel: sel, kind: :nodes }
  end

  # Convert one predicate AST into a CSS fragment (`[attr]`,
  # `[attr='v']`, `[attr*='v']`, `[attr^='v']`, `:nth-of-type(...)`, or a
  # concatenation of those for `and`). Returns nil for anything else.
  def self.translate_predicate(ast)
    case ast[:t]
    when :path
      # `[@attr]`: a single predicate-free attribute step.
      steps = ast[:steps]
      return nil unless steps.length == 1

      st = steps[0]
      return nil unless st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name] && st[:preds].empty?

      "[#{st[:nt][:name]}]"
    when :num
      # `[N]` on child::tag is the Nth same-tag child, i.e.
      # :nth-of-type(N). Non-integral N can never equal a position.
      value = ast[:v]
      n = value.to_i
      return nil unless n >= 1 && value == n

      ":nth-of-type(#{n})"
    when :cmp
      # position() compared with an integer constant, either way round.
      if is_position_func?(ast[:l]) && (n = const_int(ast[:r]))
        return nth_for(ast[:op], n)
      elsif is_position_func?(ast[:r]) && (n = const_int(ast[:l]))
        return nth_for(flip_cmp(ast[:op]), n)
      end
      return nil unless ast[:op] == :eq

      # `@attr = 'literal'`, in either operand order.
      attr_name = extract_attr_name(ast[:l])
      val_lit = extract_string_literal(ast[:r])
      if attr_name.nil?
        attr_name = extract_attr_name(ast[:r])
        val_lit = extract_string_literal(ast[:l])
      end
      return nil if attr_name.nil? || val_lit.nil?

      "[#{attr_name}=#{quote_css(val_lit)}]"
    when :and
      # Inside `and` a bare number is converted to a boolean, not treated
      # as a position, so it must not become :nth-of-type.
      return nil if [ast[:l], ast[:r]].any? { |side| side.is_a?(Hash) && side[:t] == :num }

      left = translate_predicate(ast[:l])
      right = translate_predicate(ast[:r])
      return nil if left.nil? || right.nil?

      "#{left}#{right}"
    when :func
      operator = { "contains" => "*=", "starts-with" => "^=" }[ast[:name]]
      return nil unless operator && ast[:args].length == 2

      attr_name = extract_attr_name(ast[:args][0])
      needle = extract_string_literal(ast[:args][1])
      # An empty needle is always true in XPath, but CSS substring
      # attribute selectors with an empty value match nothing.
      return nil if attr_name.nil? || needle.nil? || needle.empty?

      "[#{attr_name}#{operator}#{quote_css(needle)}]"
    end
  end

  # Name of the attribute when +ast+ is a one-step attribute path with a
  # name test, otherwise nil.
  def self.extract_attr_name(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :path

    steps = ast[:steps]
    return nil unless steps.length == 1

    st = steps[0]
    return nil unless st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name]

    st[:nt][:name]
  end

  # The String value of a :str AST node, otherwise nil.
  def self.extract_string_literal(ast)
    return nil unless ast.is_a?(Hash)

    ast[:t] == :str ? ast[:v] : nil
  end

  # Quote +s+ as a CSS string. Plain values are single-quoted; values
  # containing whitespace, brackets, quotes, `=` or a backslash are
  # double-quoted with `"` and `\` backslash-escaped.
  def self.quote_css(s)
    return "'#{s}'" unless s.match?(/[\s\[\]'"=\\]/)

    # Block form keeps gsub from interpreting backslashes in a replacement.
    "\"#{s.gsub(/["\\]/) { |ch| "\\#{ch}" }}\""
  end

  # CSS tag for a sibling-axis node test: "*" or the element name; nil for
  # node tests that have no CSS equivalent.
  def self.sibling_axis_tag(nt)
    case nt
    when :any_element then "*"
    when Hash then nt[:name]
    end
  end

  # Translate every predicate of a step; nil if any one is untranslatable.
  #
  # A positional predicate is accepted only as the first predicate and
  # only when +allow_positional+ is true: after an earlier predicate has
  # filtered the set, XPath positions are counted within the filtered set,
  # which :nth-of-type cannot express.
  def self.collect_pred_strs(preds, allow_positional: true)
    fragments = []
    preds.each_with_index do |pred, index|
      return nil if positional_predicate?(pred) && !(allow_positional && index.zero?)

      fragment = translate_predicate(pred)
      return nil if fragment.nil?

      fragments << fragment
    end
    fragments
  end

  # True when +ast+ (or an operand of an `and`) is a bare number or a
  # comparison involving position().
  def self.positional_predicate?(ast)
    return false unless ast.is_a?(Hash)

    case ast[:t]
    when :num then true
    when :cmp then is_position_func?(ast[:l]) || is_position_func?(ast[:r])
    when :and then positional_predicate?(ast[:l]) || positional_predicate?(ast[:r])
    else false
    end
  end

  def self.is_position_func?(ast)
    ast.is_a?(Hash) && ast[:t] == :func && ast[:name] == "position" && ast[:args].empty?
  end

  # Integer value of a :num AST node, or nil when +ast+ is not a number
  # literal or the literal is not integral (e.g. 1.5).
  def self.const_int(ast)
    return nil unless ast.is_a?(Hash) && ast[:t] == :num

    value = ast[:v]
    return nil unless value.respond_to?(:to_i)

    n = value.to_i
    value == n ? n : nil
  end

  # Mirror a comparison operator for swapped operands (`3 < position()`
  # becomes `position() > 3`); :eq and :neq are symmetric.
  def self.flip_cmp(op)
    { lt: :gt, le: :ge, gt: :lt, ge: :le }.fetch(op, op)
  end

  # :nth-of-type formula for `position() <op> n`, or nil when no formula
  # exists (`!=`, or an upper bound below 1).
  def self.nth_for(op, n)
    case op
    when :eq then ":nth-of-type(#{n})"
    # Lower bounds are clamped at 0 so a negative n cannot yield the
    # invalid `n+-K`; `n+0` already matches every position.
    when :gt then ":nth-of-type(n+#{[n + 1, 0].max})"
    when :ge then ":nth-of-type(n+#{[n, 0].max})"
    when :lt
      # Positions strictly below n, i.e. 1..n-1.
      return nil if n <= 1

      ":nth-of-type(-n+#{n - 1})"
    when :le
      return nil if n < 1

      ":nth-of-type(-n+#{n})"
    end
  end
end
806
+
807
+ # ============================================================
808
+ # Evaluator
809
+ # ============================================================
810
+
811
+ class Evaluator
812
# context - either a Scrapetor::Document or an object that exposes its
#           owning document through `#document`.
#
# Remembers the owning document, the pair returned by native_handles_for
# (stored as @native_doc / @native_root_id), and the original input.
def initialize(context)
  @document =
    if context.is_a?(Scrapetor::Document)
      context
    else
      context.document
    end
  handles = native_handles_for(context)
  @native_doc, @native_root_id = handles
  @context_input = context
end
817
+
818
# Evaluate a compiled AST with the initial context node as the only member
# of the context set (no position info). The outcome is always an Array:
# array results are returned unchanged, any other value is wrapped in a
# one-element Array.
def eval_program(ast)
  outcome = eval_expr(ast, [initial_context_node], nil)
  return outcome if outcome.is_a?(Array)

  [outcome]
end
823
+
824
# Starting context node for evaluation. A Document input contributes its
# `backing` object (so traversal can descend into its children); any other
# input is used as the context node directly. The value is also stored in
# @initial.
def initial_context_node
  input = @context_input
  @initial = input.is_a?(Scrapetor::Document) ? input.backing : input
end
835
+
836
# Evaluate one AST node.
#
# ast           - Hash AST node; dispatch is on ast[:t].
# context_set   - Array of context nodes.
# position_info - nil, or a Hash such as { position:, last: } that is
#                 passed through to nested evaluations and function calls.
#
# Returns one of:
#   Array<Node|String>  (node-set or string-set for /@x and /text())
#   String / Numeric / TrueClass / FalseClass / NilClass  (scalar)
#
# Raises UnsupportedError for an AST node type it does not know.
def eval_expr(ast, context_set, position_info)
  case ast[:t]
  when :path
    eval_path(ast, context_set, position_info)
  when :filter
    base = eval_expr(ast[:primary], context_set, position_info)
    apply_predicates(base, ast[:preds])
  when :filter_path
    base = eval_expr(ast[:primary], context_set, position_info)
    eval_steps_against(base, ast[:steps])
  when :union
    # Concatenate operand results, keeping the first occurrence of each
    # node identity.
    merged = []
    seen = {}
    ast[:ops].each do |operand|
      result = eval_expr(operand, context_set, position_info)
      result = [result] unless result.is_a?(Array)
      result.each do |node|
        key = node_identity(node)
        next if seen[key]

        seen[key] = true
        merged << node
      end
    end
    merged
  when :or
    xpath_boolean(eval_expr(ast[:l], context_set, position_info)) ||
      xpath_boolean(eval_expr(ast[:r], context_set, position_info))
  when :and
    xpath_boolean(eval_expr(ast[:l], context_set, position_info)) &&
      xpath_boolean(eval_expr(ast[:r], context_set, position_info))
  when :cmp
    do_compare(ast[:op],
               eval_expr(ast[:l], context_set, position_info),
               eval_expr(ast[:r], context_set, position_info))
  when :add
    l = xpath_number(eval_expr(ast[:l], context_set, position_info))
    r = xpath_number(eval_expr(ast[:r], context_set, position_info))
    # No explicit NaN guard: Float arithmetic already propagates NaN, and
    # calling `nan?` on an Integer operand would raise NoMethodError.
    ast[:op] == :plus ? (l + r) : (l - r)
  when :mul
    l = xpath_number(eval_expr(ast[:l], context_set, position_info))
    r = xpath_number(eval_expr(ast[:r], context_set, position_info))
    case ast[:op]
    when :mul then l * r
    when :div
      # Float division yields NaN for 0/0 and NaN/0, and a signed
      # Infinity for any other x/0.
      l.to_f / r.to_f
    when :mod
      # XPath `mod` truncates toward zero (the result takes the sign of
      # the dividend), which is what Numeric#remainder computes.
      if l.is_a?(Integer) && r.is_a?(Integer)
        r.zero? ? Float::NAN : l.remainder(r)
      else
        lf = l.to_f
        rf = r.to_f
        if lf.nan? || rf.nan? || lf.infinite? || rf.zero?
          Float::NAN
        elsif rf.infinite?
          lf
        else
          lf.remainder(rf)
        end
      end
    end
  when :neg
    -xpath_number(eval_expr(ast[:e], context_set, position_info))
  when :num, :str
    ast[:v]
  when :func
    call_function(ast[:name], ast[:args], context_set, position_info)
  else
    raise UnsupportedError, "unknown AST node: #{ast[:t]}"
  end
end
900
+
901
# Evaluate a :path AST. Absolute paths start from the single node returned
# by root_for_context; relative paths start from +context_set+. The
# +position_info+ argument is accepted but not consulted here.
def eval_path(ast, context_set, position_info)
  start_nodes = ast[:absolute] ? [root_for_context] : context_set
  eval_steps_against(start_nodes, ast[:steps])
end
910
+
911
# Apply +steps+ in order, starting from +nodes+, and return the final set.
#
# A step's predicates are evaluated per context node: the step is walked
# from one context node, its predicates filter that node's candidates (so
# position() / last() / `[N]` are relative to those candidates), and the
# per-node results are then concatenated in context order and
# de-duplicated. Steps without predicates, and steps with at most one
# context node, are walked in a single pass.
def eval_steps_against(nodes, steps)
  steps.reduce(nodes) do |current, st|
    preds = st[:preds]
    if preds.empty?
      step_walk(current, st)
    elsif current.length < 2
      apply_step_predicates(step_walk(current, st), preds)
    else
      selected = current.flat_map do |context_node|
        apply_step_predicates(step_walk([context_node], st), preds)
      end
      dedupe_node_set(selected)
    end
  end
end
919
+
920
# Walk one step's axis from every node in +current+ and collect the nodes
# that satisfy the step's node test (st[:nt]). Predicates are not applied
# here.
#
# The namespace axis (and any axis symbol not listed) contributes nothing.
# Because overlapping context nodes can reach the same node more than once
# (e.g. following-sibling from several siblings), the collected list is
# passed through dedupe_node_set before being returned.
def step_walk(current, st)
  axis = st[:axis]
  test = st[:nt]
  matches = []
  keep_matching = ->(candidates) { candidates.each { |c| push_if_matches(c, test, matches) } }

  current.each do |node|
    case axis
    when :child then collect_children(node, test, matches)
    when :descendant then collect_descendants(node, test, matches)
    when :descendant_or_self
      collect_self(node, test, matches)
      collect_descendants(node, test, matches)
    when :parent
      up = parent_of(node)
      push_if_matches(up, test, matches) if up
    when :self then collect_self(node, test, matches)
    when :ancestor then keep_matching.call(ancestors_of(node))
    when :ancestor_or_self
      keep_matching.call(ancestors_of(node))
      collect_self(node, test, matches)
    when :following_sibling then keep_matching.call(following_siblings_of(node))
    when :preceding_sibling then keep_matching.call(preceding_siblings_of(node))
    when :following then keep_matching.call(following_of(node))
    when :preceding then keep_matching.call(preceding_of(node))
    when :attribute then collect_attributes(node, test, matches)
    when :namespace
      # Namespace nodes are not modelled; nothing to collect.
    end
  end

  dedupe_node_set(matches)
end
965
+
966
# Removes repeated nodes, keeping the first occurrence of each and the
# original relative order. Two entries are the same node when
# `node_identity` yields equal keys. Lists of fewer than two nodes are
# returned as-is.
def dedupe_node_set(nodes)
  return nodes if nodes.length < 2

  nodes.uniq { |node| node_identity(node) }
end
978
+
979
# Filters `nodes` through each predicate AST in `preds`, in order.
#
# Every predicate is evaluated once per node with that node as the
# sole context node and with `position`/`last` taken from the node
# list as it stands *after* the previous predicate ran. A numeric
# result is a positional test: the node is kept only when the number
# equals its 1-based position. Any other result is converted with
# `xpath_boolean`.
#
# Returns the surviving nodes (`nodes` itself when `preds` is empty).
def apply_step_predicates(nodes, preds)
  preds.reduce(nodes) do |survivors, pred_ast|
    total = survivors.length
    kept = []
    survivors.each_with_index do |node, index|
      position = index + 1
      result = eval_expr(pred_ast, [node], { position: position, last: total })
      keep =
        if result.is_a?(Numeric)
          # Compare numerically rather than via #to_i: NaN/Infinity would
          # raise FloatDomainError on conversion, and a fractional value
          # such as 1.5 must not be truncated into matching position 1.
          result == position
        else
          xpath_boolean(result)
        end
      kept << node if keep
    end
    kept
  end
end
999
+
1000
# Predicate filtering for an arbitrary base value: an Array is filtered
# as a node list, anything else is treated as a one-element list.
def apply_predicates(base, preds)
  return apply_step_predicates(base, preds) if base.is_a?(Array)

  apply_step_predicates([base], preds)
end
1004
+
1005
+ # ---- Node identity / wrapping ------------------------------------
1006
+
1007
# The node that absolute paths ("/...") start from: whatever object
# backs the document this evaluator was created for.
def root_for_context
  @document.backing
end
1010
+
1011
# Returns the parent of `n`, or nil when it has none.
#
# * raw native element: looks the parent id up in the element's own
#   document; when no parent id is reported the owning document object
#   is returned instead.
# * document-level objects: nil.
# * anything else exposing #parent: delegates to it.
def parent_of(n)
  return nil if n.nil?

  if native_node?(n)
    parent_id = n.doc.node_parent(n.id)
    return parent_id ? wrap_native(parent_id) : @document
  end

  return nil if n.is_a?(Scrapetor::Native::DocumentWrapper) || n.is_a?(Scrapetor::Document)

  n.parent if n.is_a?(Scrapetor::Node) || n.respond_to?(:parent)
end
1024
+
1025
# self axis: appends `n` to `out` when it satisfies node test `nt`.
def collect_self(n, nt, out)
  out << n if matches_node_test?(n, nt)
end
1028
+
1029
# child axis: appends every child of `n` that satisfies `nt` to `out`.
#
# Arena-backed nodes enumerate child ids through the native document
# and wrap each one according to its node type; children whose type
# cannot be wrapped are skipped. Other documents/nodes fall back to the
# backing object's #children, wrapping each child with `wrap_dom`.
def collect_children(n, nt, out)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    native_doc.node_children(node_id).each do |child_id|
      child_type = native_doc.node_type(child_id)
      child = wrap_native_typed_with(native_doc, child_id, child_type, wrapper: wrapper)
      push_if_matches(child, nt, out) if child
    end
    return
  end

  case n
  when Scrapetor::Document
    backing = n.backing
    return unless backing.respond_to?(:children)

    backing.children.each { |child| push_if_matches(wrap_dom(child), nt, out) }
  when Scrapetor::Node
    n.backing_node.children.each { |child| push_if_matches(wrap_dom(child), nt, out) }
  end
end
1045
+
1046
# Resolves `n` to its arena coordinates.
#
# Returns [native_doc, node_id, wrapper] when the node lives in the
# arena, nil otherwise. Three carriers are recognised:
#   * a Scrapetor::Node whose backing node is arena-backed,
#   * the Native::DocumentWrapper itself (root context, node id 0),
#   * a raw Native::Element.
# "Arena-backed" is detected by duck typing: the carrier exposes #id
# and #doc, and its doc answers #node_following_sibling_ids.
def arena_handle_for(n)
  wraps_backing = n.is_a?(Scrapetor::Node)
  return [n.native, 0, n] if !wraps_backing && n.is_a?(Scrapetor::Native::DocumentWrapper)

  carrier = wraps_backing ? n.backing_node : n
  arena_backed = carrier.respond_to?(:id) &&
                 carrier.respond_to?(:doc) &&
                 carrier.doc.respond_to?(:node_following_sibling_ids)
  return nil unless arena_backed

  [carrier.doc, carrier.id, (carrier.respond_to?(:wrapper) ? carrier.wrapper : nil)]
end
1065
+
1066
# descendant axis: appends the descendants of `n` that satisfy `nt` to
# `out`.
#
# Arena-backed nodes use the native document: comment() tests go
# through the dedicated comment-id primitive, every other test through
# `collect_descendant_ids`. Anything else that exposes #children is
# walked depth-first in pre-order with an explicit stack.
def collect_descendants(n, nt, out)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    if nt == :comment
      # Comment-specific fast path: dedicated C primitive.
      native_doc.node_descendant_comment_ids(node_id).each do |comment_id|
        out << wrap_native_typed_with(native_doc, comment_id, 8, wrapper: wrapper)
      end
    else
      collect_descendant_ids(native_doc, node_id, nt, out, wrapper)
    end
    return
  end
  return unless n.respond_to?(:children)

  # Children are pushed reversed so that #pop yields them first-to-last.
  pending = n.children.to_a.reverse
  while (node = pending.pop)
    push_if_matches(node.is_a?(Scrapetor::Node) ? node : wrap_dom(node), nt, out)
    pending.concat(node.children.to_a.reverse) if node.respond_to?(:children)
  end
end
1090
+
1091
# Appends the arena descendants of node `rid` that satisfy node test
# `nt` to `out`, wrapped via `wrap_native_typed_with`.
#
# nd      - native document (uses #size, #node_type, #node_name and,
#           for non-root contexts, #node_parent)
# rid     - arena id of the context node; 0 is the document root
# nt      - :any_element, :node, :text, :comment or { name: "tag" }
# out     - Array receiving the wrapped matches
# wrapper - passed through to `wrap_native_typed_with`
#
# Ids are walked in ascending order from rid + 1. For the document
# root every later id is a descendant, so the whole arena is scanned.
# For any other context node the walk must stay inside the subtree:
# previously it ran to the end of the arena and so also returned every
# node that merely follows `rid` in the document. Membership is now
# tracked through parent links.
#
# NOTE(review): assumes `node_parent(k)` returns the parent's arena id
# (or nil) for every node type, as `parent_of` uses it, and that a
# child's id is always greater than its parent's — confirm against the
# native extension.
def collect_descendant_ids(nd, rid, nt, out, wrapper)
  wanted_types, wanted_name =
    case nt
    when :any_element then [[1], nil]
    when :node then [[1, 3, 8], nil]
    when :text then [[3], nil]
    when :comment then [[8], nil]
    when Hash then [[1], nt[:name]]
    end
  return if wanted_types.nil?
  return if nt.is_a?(Hash) && !wanted_name

  # nil means "no subtree filtering needed" (root context).
  in_subtree = rid.zero? ? nil : { rid => true }

  ((rid + 1)..(nd.size - 1)).each do |k|
    if in_subtree
      parent_id = nd.node_parent(k)
      next unless parent_id && in_subtree[parent_id]

      in_subtree[k] = true
    end

    # node_type call avoids loading nodes we'll skip immediately.
    type = nd.node_type(k)
    next unless wanted_types.include?(type)
    next if wanted_name && !nd.node_name(k).casecmp(wanted_name).zero?

    out << wrap_native_typed_with(nd, k, type, wrapper: wrapper)
  end
end
1134
+
1135
# ancestor axis: returns the ancestors of `n`.
#
# Arena-backed nodes wrap the ids reported by the native document as
# elements, in the order it reports them. A plain Scrapetor::Node
# follows #parent links and returns the chain outermost-first. Any
# other value has no ancestors.
def ancestors_of(n)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    ids = native_doc.node_ancestor_ids(node_id)
    return ids.map { |id| wrap_native_typed_with(native_doc, id, 1, wrapper: wrapper) }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  chain = []
  ancestor = n.parent
  while ancestor
    chain.unshift(ancestor)
    ancestor = ancestor.parent
  end
  chain
end
1151
+
1152
# following-sibling axis: returns the siblings after `n`.
#
# Arena-backed nodes wrap the ids reported by the native document as
# elements. A plain Scrapetor::Node follows #next_sibling links and
# keeps only siblings that report #element?. Any other value yields [].
def following_siblings_of(n)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    ids = native_doc.node_following_sibling_ids(node_id)
    return ids.map { |id| wrap_native_typed_with(native_doc, id, 1, wrapper: wrapper) }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  siblings = []
  sibling = n.next_sibling
  while sibling
    siblings << sibling if sibling.respond_to?(:element?) && sibling.element?
    sibling = (sibling.next_sibling if sibling.respond_to?(:next_sibling))
  end
  siblings
end
1168
+
1169
# preceding-sibling axis: returns the siblings before `n`.
#
# Arena-backed nodes wrap the ids reported by the native document as
# elements. A plain Scrapetor::Node follows #previous_sibling links,
# keeps only siblings that report #element?, and returns them
# earliest-first. Any other value yields [].
def preceding_siblings_of(n)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    ids = native_doc.node_preceding_sibling_ids(node_id)
    return ids.map { |id| wrap_native_typed_with(native_doc, id, 1, wrapper: wrapper) }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  nearest_first = []
  sibling = n.previous_sibling
  while sibling
    nearest_first << sibling if sibling.respond_to?(:element?) && sibling.element?
    sibling = (sibling.previous_sibling if sibling.respond_to?(:previous_sibling))
  end
  nearest_first.reverse
end
1185
+
1186
# following axis: only arena-backed nodes are supported. The ids
# reported by the native document are wrapped as elements; any other
# node yields [].
def following_of(n)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    ids = native_doc.node_following_ids(node_id)
    ids.map { |id| wrap_native_typed_with(native_doc, id, 1, wrapper: wrapper) }
  else
    []
  end
end
1191
+
1192
# preceding axis: only arena-backed nodes are supported. The ids
# reported by the native document are wrapped as elements; any other
# node yields [].
def preceding_of(n)
  native_doc, node_id, wrapper = arena_handle_for(n)
  if native_doc
    ids = native_doc.node_preceding_ids(node_id)
    ids.map { |id| wrap_native_typed_with(native_doc, id, 1, wrapper: wrapper) }
  else
    []
  end
end
1197
+
1198
# attribute axis: appends attribute *values* of `n` to `out`.
#
# A Hash node test selects by name (case-insensitive); a missing name
# or "*" selects every attribute. For raw native elements the symbolic
# tests :any_element, :any_node and :node select every attribute and
# any other test selects none. For Scrapetor::Node every non-Hash test
# selects every attribute.
def collect_attributes(n, nt, out)
  by_name = nt.is_a?(Hash)
  name_selected = lambda do |attr_name|
    wanted = nt[:name]
    wanted.nil? || wanted == "*" || attr_name.casecmp(wanted).zero?
  end

  if native_node?(n)
    take_all = %i[any_element any_node node].include?(nt)
    n.doc.node_attributes(n.id).each do |attr_name, value|
      out << value if take_all || (by_name && name_selected.call(attr_name))
    end
  elsif n.is_a?(Scrapetor::Node)
    n.attributes.each do |attr_name, value|
      out << value if !by_name || name_selected.call(attr_name)
    end
  end
end
1225
+
1226
# Appends `n` to `out` when it satisfies node test `nt`; otherwise
# leaves `out` untouched.
def push_if_matches(n, nt, out)
  out << n if matches_node_test?(n, nt)
end
1230
+
1231
# Decides whether node `n` satisfies node test `nt`.
#
#   :any_element - `n.element?` when available, otherwise a name that
#                  does not start with "#"
#   :text        - `n.text?`
#   :comment     - `n.comment?`
#   :node        - always true
#   { name: x }  - `n` must expose #name; "*" matches any such node,
#                  otherwise names are compared case-insensitively and
#                  "#"-prefixed names never match
# Any other test is false.
def matches_node_test?(n, nt)
  if nt == :any_element
    return n.element? if n.respond_to?(:element?)

    n.respond_to?(:name) && !n.name.start_with?("#")
  elsif nt == :text
    n.respond_to?(:text?) && n.text?
  elsif nt == :comment
    n.respond_to?(:comment?) && n.comment?
  elsif nt == :node
    true
  elsif nt.is_a?(Hash)
    wanted = nt[:name]
    return false if !n.respond_to?(:name) || wanted.nil?
    return true if wanted == "*"

    actual = n.name
    !actual.nil? && !actual.start_with?("#") && actual.casecmp(wanted).zero?
  else
    false
  end
end
1253
+
1254
+ # ---- Native wrapping helpers --------------------------------------
1255
+
1256
# Resolves an evaluation context to [native_doc, node_id].
#
# A Scrapetor::Document backed by a Native::DocumentWrapper maps to the
# wrapper's native document and id 0. A Scrapetor::Node whose backing
# node is arena-backed maps to that node's document and id. Everything
# else yields [nil, nil].
def native_handles_for(context)
  case context
  when Scrapetor::Document
    backing = context.backing
    wrapped = defined?(Scrapetor::Native::DocumentWrapper) &&
              backing.is_a?(Scrapetor::Native::DocumentWrapper)
    return [backing.native, 0] if wrapped && backing.native.respond_to?(:node_following_sibling_ids)
  when Scrapetor::Node
    element = context.backing_node
    arena_backed = element.respond_to?(:id) &&
                   element.respond_to?(:doc) &&
                   element.doc.respond_to?(:node_following_sibling_ids)
    return [element.doc, element.id] if arena_backed
  end
  [nil, nil]
end
1271
+
1272
# True when `n` looks like a raw arena element: it exposes #id and
# #doc, and its doc answers #node_following_sibling_ids.
def native_node?(n)
  return false unless n

  %i[id doc].all? { |reader| n.respond_to?(reader) } &&
    n.doc.respond_to?(:node_following_sibling_ids)
end
1276
+
1277
# Returns the Native::DocumentWrapper associated with `n`: `n` itself
# when it is one, `n.wrapper` when it exposes that reader, else nil.
def native_wrapper_for(n)
  if n.is_a?(Scrapetor::Native::DocumentWrapper)
    n
  elsif n.respond_to?(:wrapper)
    n.wrapper
  end
end
1282
+
1283
# Wraps arena node `id` of this evaluator's native document, looking
# its node type up first. Returns nil when there is no native document
# or no id.
def wrap_native(id)
  if @native_doc.nil? || id.nil?
    nil
  else
    wrap_native_typed(id, @native_doc.node_type(id))
  end
end
1287
+
1288
# Wraps arena node `id` of known `type` from this evaluator's native
# document, reusing the initial context's wrapper when it has one.
def wrap_native_typed(id, type)
  owner = @initial.respond_to?(:wrapper) ? @initial.wrapper : nil
  wrap_native_typed_with(@native_doc, id, type, wrapper: owner)
end
1291
+
1292
# Builds the Ruby-side object for arena node `id` of `nd`, chosen by
# its numeric node type:
#   1 - element -> Scrapetor::Node around a Native::Element
#   8 - comment -> Scrapetor::CommentNode holding the comment text
#   3 - text    -> Scrapetor::TextNode (a String subclass that responds
#                  to text?, name, etc.) so XPath predicates against
#                  text-node sets behave like Nokogiri's
# Any other type (document / unknown) yields nil.
def wrap_native_typed_with(nd, id, type, wrapper: nil)
  if type == 1
    Scrapetor::Node.new(@document, Scrapetor::Native::Element.new(nd, id, wrapper))
  elsif type == 8
    Scrapetor::CommentNode.new(@document, nd.node_comment_text(id))
  elsif type == 3
    Scrapetor::TextNode.new(nd.node_text(id))
  end
end
1308
+
1309
# Normalises a node from the pure-Ruby DOM into the public types:
# already-wrapped nodes pass through, comments become CommentNode,
# text nodes become their plain string content, elements become
# Scrapetor::Node. Anything unrecognised is returned unchanged.
def wrap_dom(node)
  return node if node.is_a?(Scrapetor::Node) || node.is_a?(Scrapetor::CommentNode)

  content = -> { node.respond_to?(:content) ? node.content : node.to_s }
  return Scrapetor::CommentNode.new(@document, content.call) if node.respond_to?(:comment?) && node.comment?
  return content.call if node.respond_to?(:text?) && node.text?
  return Scrapetor::Node.new(@document, node) if node.respond_to?(:element?) && node.element?

  node
end
1321
+
1322
# Key used to recognise "the same node" during de-duplication.
#
# A Scrapetor::Node whose backing node exposes #id is keyed by
# [:nat, <object_id of its doc, or nil>, id], so separately created
# wrappers of one arena node compare equal. Other Scrapetor::Nodes are
# keyed by the backing node's object_id, and every other value by its
# own object_id.
def node_identity(n)
  return n.object_id unless n.is_a?(Scrapetor::Node)

  backing = n.backing_node
  return backing.object_id unless backing.respond_to?(:id)

  owner_key = backing.respond_to?(:doc) ? backing.doc.object_id : nil
  [:nat, owner_key, backing.id]
end
1330
+
1331
+ # ---- XPath type coercions ----------------------------------------
1332
+
1333
# XPath boolean() conversion: nil is false; booleans are themselves;
# numbers are true unless zero or NaN; strings and node lists (Arrays)
# are true unless empty; any other object is true.
def xpath_boolean(v)
  if v.nil?
    false
  elsif true.equal?(v) || false.equal?(v)
    v
  elsif v.is_a?(Numeric)
    !v.zero? && !(v.respond_to?(:nan?) && v.nan?)
  elsif v.is_a?(String) || v.is_a?(Array)
    !v.empty?
  else
    true
  end
end
1343
+
1344
# XPath string() conversion.
#
# nil becomes "", strings pass through, booleans become "true" or
# "false". Floats render as "NaN", "Infinity" or "-Infinity", as a bare
# integer when integral, and via #to_s otherwise; other numbers use
# #to_s. A node list takes the string value of its first node and any
# other object is treated as a single node.
def xpath_string(v)
  return "" if v.nil?
  return v if v.is_a?(String)
  return "true" if true.equal?(v)
  return "false" if false.equal?(v)

  if v.is_a?(Float)
    return "NaN" if v.nan?
    return(v.positive? ? "Infinity" : "-Infinity") if v.infinite?

    whole = v.to_i
    return v == whole ? whole.to_s : v.to_s
  end
  return v.to_s if v.is_a?(Numeric)

  xpath_string_for_node(v.is_a?(Array) ? v.first : v)
end
1364
+
1365
# String value of a single node: "" for nil, the string itself for
# string-like nodes, `text.to_s` for objects exposing #text, otherwise
# #to_s.
def xpath_string_for_node(n)
  if n.nil?
    ""
  elsif n.is_a?(String)
    n
  elsif n.respond_to?(:text)
    n.text.to_s
  elsif n.respond_to?(:to_s)
    n.to_s
  else
    ""
  end
end
1376
+
1377
# XPath number() conversion.
#
# nil becomes NaN, numbers pass through, true/false become 1/0.
# Strings are stripped and must consist of an optional minus sign and
# plain decimal digits with an optional fraction (no exponent, no
# leading plus); they parse to Float when they contain a "." and to
# Integer otherwise, and anything else is NaN. Every other value is
# first converted with `xpath_string`.
def xpath_number(v)
  return Float::NAN if v.nil?
  return v if v.is_a?(Numeric)
  return 1 if true.equal?(v)
  return 0 if false.equal?(v)
  return xpath_number(xpath_string(v)) unless v.is_a?(String)

  text = v.strip
  # An empty string matches neither pattern and therefore yields NaN.
  numeric = text.match?(/\A-?\d+\.?\d*\z/) || text.match?(/\A-?\.\d+\z/)
  return Float::NAN unless numeric

  text.include?(".") ? text.to_f : text.to_i
end
1397
+
1398
+ # ---- Comparison rules (XPath 1.0 §3.4) --------------------------
1399
+
1400
# Evaluates comparison `op` (:eq, :neq, :lt, :le, :gt, :ge) between two
# XPath values, following XPath 1.0 §3.4.
#
# When either operand is a node-set (Array) the work is handed to
# `compare_node_sets`, with a scalar operand wrapped in a one-element
# Array. For two scalars, equality coerces both to boolean if either is
# a boolean, else both to number if either is a number, else both to
# string. Relational operators always compare as numbers and are false
# whenever either side is NaN.
def do_compare(op, l, r)
  if l.is_a?(Array) || r.is_a?(Array)
    left_set = l.is_a?(Array) ? l : [l]
    right_set = r.is_a?(Array) ? r : [r]
    return compare_node_sets(op, left_set, right_set)
  end

  if %i[eq neq].include?(op)
    operands = [l, r]
    same =
      if operands.any? { |x| x.is_a?(TrueClass) || x.is_a?(FalseClass) }
        xpath_boolean(l) == xpath_boolean(r)
      elsif operands.any? { |x| x.is_a?(Numeric) }
        xpath_number(l) == xpath_number(r)
      else
        xpath_string(l) == xpath_string(r)
      end
    return op == :eq ? same : !same
  end

  lhs = xpath_number(l)
  rhs = xpath_number(r)
  return false if [lhs, rhs].any? { |x| x.is_a?(Float) && x.nan? }

  operator = { lt: :<, le: :<=, gt: :>, ge: :>= }[op]
  operator ? lhs.public_send(operator, rhs) : nil
end
1434
+
1435
# Comparison where at least one operand is a node-set (XPath 1.0 §3.4).
#
# op   - :eq, :neq, :lt, :le, :gt or :ge
# a, b - Arrays; a scalar operand arrives wrapped as a one-element
#        Array (see `do_compare`), a node-set as its list of nodes
#
# Rules applied, in order:
#   * node-set vs boolean: the node-set is converted with boolean()
#     (non-empty => true) and the two booleans are compared. For the
#     relational operators both are then converted to numbers.
#   * = / != with a number on either side: string-values are converted
#     to numbers, so "1.0" and "01" both equal 1.
#   * = / != otherwise: string-values are compared.
#   * relational operators: everything is compared as numbers; NaN
#     never satisfies them.
# In the last three cases the result is true when *some* pair of values
# (one from each side) satisfies the comparison, so an empty node-set
# never compares true. Returns false for an unknown operator.
def compare_node_sets(op, a, b)
  relation = { eq: :==, neq: :!=, lt: :<, le: :<=, gt: :>, ge: :>= }[op]
  return false unless relation

  equality = %i[eq neq].include?(op)

  # A node-set never contains a bare boolean, so a lone true/false can
  # only be the wrapped scalar operand.
  boolean_scalar = lambda do |set|
    set.length == 1 && (true.equal?(set.first) || false.equal?(set.first))
  end
  if boolean_scalar.call(a) || boolean_scalar.call(b)
    lhs = boolean_scalar.call(a) ? a.first : !a.empty?
    rhs = boolean_scalar.call(b) ? b.first : !b.empty?
    return lhs.public_send(relation, rhs) if equality

    return xpath_number(lhs).public_send(relation, xpath_number(rhs))
  end

  has_number = ->(set) { set.any? { |item| item.is_a?(Numeric) } }
  numeric = !equality || has_number.call(a) || has_number.call(b)
  convert = numeric ? method(:xpath_number) : method(:xpath_string)

  # Convert each side once instead of once per pair.
  left = a.map { |item| convert.call(item) }
  right = b.map { |item| convert.call(item) }
  left.any? { |x| right.any? { |y| x.public_send(relation, y) } }
end
1470
+
1471
+ # ---- Functions ----------------------------------------------------
1472
+
1473
# Evaluates a call to one of the XPath 1.0 core-library functions.
#
# name          - function name as written in the expression
# args          - Array of argument ASTs (evaluated lazily, in order)
# context_set   - current context nodes; functions with an optional
#                 argument fall back to its first node
# position_info - nil or { position:, last: } for the current predicate
#
# Returns the function's value (node list, String, number or boolean).
# Raises UnsupportedError for an unknown function name.
def call_function(name, args, context_set, position_info)
  arg = ->(index) { eval_expr(args[index], context_set, position_info) }
  str = ->(index) { xpath_string(arg.call(index)) }
  num = ->(index) { xpath_number(arg.call(index)) }
  # Value for functions whose single argument defaults to the context node.
  subject = -> { args.empty? ? context_set.first : arg.call(0) }

  case name
  # node-set
  when "last"
    (position_info && position_info[:last]) || context_set.length
  when "position"
    (position_info && position_info[:position]) || 1
  when "count"
    counted = arg.call(0)
    counted.is_a?(Array) ? counted.length : 0
  when "id"
    # id('foo') — return element with that id from the document.
    requested = arg.call(0)
    tokens =
      if requested.is_a?(Array)
        requested.flat_map { |item| xpath_string(item).split(/\s+/) }
      else
        xpath_string(requested).split(/\s+/)
      end
    tokens.each_with_object([]) do |token, found|
      hit =
        begin
          @document.at_css("##{token}")
        rescue StandardError
          nil # best effort: a token that is not a usable selector matches nothing
        end
      found << hit if hit
    end
  when "local-name"
    node = arg_first_node(args, context_set, position_info)
    node && node.respond_to?(:name) ? node.name.split(":").last.to_s : ""
  when "name"
    node = arg_first_node(args, context_set, position_info)
    node && node.respond_to?(:name) ? node.name.to_s : ""
  when "namespace-uri"
    "" # we don't model namespaces in HTML
  # string
  when "string"
    xpath_string(subject.call)
  when "concat"
    args.map { |a| xpath_string(eval_expr(a, context_set, position_info)) }.join
  when "starts-with"
    str.call(0).start_with?(str.call(1))
  when "contains"
    str.call(0).include?(str.call(1))
  when "substring-before"
    haystack = str.call(0)
    needle = str.call(1)
    at = haystack.index(needle)
    at ? haystack[0...at] : ""
  when "substring-after"
    haystack = str.call(0)
    needle = str.call(1)
    at = haystack.index(needle)
    at ? (haystack[(at + needle.length)..] || "") : ""
  when "substring"
    source = str.call(0)
    start = num.call(1)
    length = args.size > 2 ? num.call(2) : nil
    xpath_fn_substring(source, start, length)
  when "string-length"
    xpath_string(subject.call).length
  when "normalize-space"
    xpath_string(subject.call).strip.gsub(/\s+/, " ")
  when "translate"
    xpath_fn_translate(str.call(0), str.call(1), str.call(2))
  # boolean
  when "boolean" then xpath_boolean(arg.call(0))
  when "not" then !xpath_boolean(arg.call(0))
  when "true" then true
  when "false" then false
  when "lang"
    target = str.call(0).downcase
    start_node = context_set.first
    xpath_fn_lang?(target, start_node.is_a?(Array) ? start_node.first : start_node)
  # number
  when "number"
    xpath_number(subject.call)
  when "sum"
    summed = arg.call(0)
    summed = [summed] unless summed.is_a?(Array)
    summed.inject(0.0) { |acc, item| acc + xpath_number(item).to_f }
  when "floor"
    xpath_fn_unless_non_finite(num.call(0), &:floor)
  when "ceiling"
    xpath_fn_unless_non_finite(num.call(0), &:ceil)
  when "round"
    xpath_fn_round(num.call(0))
  else
    raise UnsupportedError, "unknown XPath function `#{name}()`"
  end
end

# Yields `number` for conversion unless it is NaN or ±Infinity, which
# are returned unchanged (Ruby raises FloatDomainError when asked to
# floor/ceil/round them).
def xpath_fn_unless_non_finite(number)
  return number if number.is_a?(Float) && (number.nan? || number.infinite?)

  yield number
end

# XPath round(): nearest integer, with halves going toward positive
# infinity (round(-2.5) is -2). NaN and ±Infinity are returned as-is.
def xpath_fn_round(number)
  xpath_fn_unless_non_finite(number) do |finite|
    whole = finite.floor
    finite - whole >= 0.5 ? whole + 1 : whole
  end
end

# XPath substring(): the characters at 1-based positions p with
# round(start) <= p < round(start) + round(length); without `length`
# the result runs to the end of the string. A NaN bound selects
# nothing, and an infinite one is handled without conversion errors.
def xpath_fn_substring(source, start, length = nil)
  first = xpath_fn_round(start)
  return "" if first.is_a?(Float) && first.nan?

  past_last = length.nil? ? Float::INFINITY : first + xpath_fn_round(length)
  return "" if past_last.is_a?(Float) && past_last.nan? # e.g. -Infinity + Infinity

  from = first < 1 ? 1 : first
  upto = [past_last, source.length + 1].min
  # Also covers a negative or infinite bound, which must select nothing
  # rather than be read as a from-the-end index.
  return "" if from >= upto

  source[(from.to_i - 1)...(upto.to_i - 1)] || ""
end

# XPath translate(): each character of `source` found in `from` is
# replaced by the character at the same index of `to`, or dropped when
# `to` is shorter. When a character occurs more than once in `from`,
# its first occurrence decides.
def xpath_fn_translate(source, from, to)
  replacements = {}
  from.each_char.with_index do |char, index|
    replacements[char] = to[index] unless replacements.key?(char)
  end
  source.each_char.map { |char| replacements.fetch(char, char) }.compact.join
end

# lang('en') — true if the nearest xml:lang (or HTML `lang`) value on
# `node` or one of its ancestors equals `target` or starts with
# "<target>-", ignoring case. `target` must already be lower-cased.
def xpath_fn_lang?(target, node)
  while node
    declared = nil
    if node.respond_to?(:[])
      declared =
        begin
          node["xml:lang"] || node["lang"]
        rescue StandardError
          nil
        end
    end
    if declared
      lowered = declared.downcase
      return true if lowered == target || lowered.start_with?("#{target}-")
    end
    node = parent_of(node)
  end
  false
end
1596
+
1597
# Node that a "node-set?" function argument refers to: the first node
# of the evaluated first argument, or of the context set when no
# argument was given. A non-list value is returned unchanged.
def arg_first_node(args, context_set, position_info)
  value = context_set
  value = eval_expr(args[0], context_set, position_info) unless args.empty?
  value.is_a?(Array) ? value.first : value
end
1601
+ end
1602
+ end
1603
+ end