scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,2258 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ module Native
5
+ # Wrapper module — `Scrapetor::Native::Document` is a TypedData class
6
+ # defined in C (see ext/scrapetor/native/scrapetor_dom.c). It exposes
7
+ # node-id based accessors. This module adds Ruby-level helpers and
8
+ # the Element wrapper that `Scrapetor::Node` can wrap and operate on
9
+ # the same way it does over a pure-Ruby `Dom::Element`.
10
+ AVAILABLE_DOM = defined?(Scrapetor::Native::Document)
11
+
12
+ # ----- pseudo-element handling at the css() boundary -----
13
+
14
+ PSEUDO_ELEMENT_RE = /(::(?:text|attr\([^)]+\)|first-letter|first-line|before|after))\s*\z/i.freeze
15
+
16
+ # Wrap each String entry in TextNode so Node-style `.text` /
17
+ # `.content` accessors and Parsel-style `.get` / `.getall` both work.
18
+ # Skips nil (`bulk_attr` returns nil for missing attributes) and any
19
+ # value that's already a TextNode. Mutates in place to avoid a second
20
+ # Array allocation on the result-collection hot path.
21
# Replace every plain String entry of +arr+ with a Scrapetor::TextNode,
# in place. Entries that are not Strings (nil included) and entries that
# are already TextNodes are left untouched. Non-Array arguments are
# returned unchanged.
#
# @param arr [Array, Object] result list to rewrite in place
# @return [Array, Object] the same object that was passed in
def self.wrap_text_nodes!(arr)
  return arr unless arr.is_a?(Array)

  arr.each_with_index do |entry, idx|
    next unless entry.is_a?(String) && !entry.is_a?(Scrapetor::TextNode)

    # Assign back into the same slot so no second Array is allocated.
    arr[idx] = Scrapetor::TextNode.new(entry)
  end
  arr
end
32
+
33
+ # `::text` and `::attr(name)` are Scrapy/Parsel-style pseudo-elements:
34
+ # they reshape the result of a selector into strings rather than
35
+ # affecting matching. Strip them before running the query and apply
36
+ # the transform on the way out.
37
+ #
38
+ # Returns [stripped_selector, transform_kind, arg]
39
+ # transform_kind = nil | :text | :attr | :text_approx | :direct_text | :direct_attr
40
+ #
41
+ # Fast-path skip when the selector has no `::` substring (the common
42
+ # case) — saves a regex match on every css() call.
43
# Split a trailing pseudo-element off a selector string.
#
# @param selector_str [String] raw selector, possibly ending in a
#   pseudo-element recognised by PSEUDO_ELEMENT_RE
# @return [Array(String, Symbol|nil, String|nil)]
#   [stripped_selector, kind, arg] where kind is one of
#   nil, :text, :attr, :text_approx, :direct_text, :direct_attr and arg is
#   the stripped attribute name for the attr kinds (nil otherwise)
def self.peel_pseudo_element(selector_str)
  # Cheap substring test first so the common case skips the regex.
  return [selector_str, nil, nil] unless selector_str.include?("::")

  hit = PSEUDO_ELEMENT_RE.match(selector_str)
  return [selector_str, nil, nil] if hit.nil?

  pseudo = hit[1]
  base = selector_str[0...hit.begin(0)].rstrip

  # `base > ::text` / `base > ::attr(x)`: drop the dangling child
  # combinator and report the direct-only variant of the kind.
  child_only = base.end_with?(">")
  base = base[0..-2].rstrip if child_only

  return [base, child_only ? :direct_text : :text, nil] if pseudo.casecmp("::text").zero?

  attr_hit = /::attr\(([^)]+)\)/i.match(pseudo)
  # Remaining pseudo-elements (first-letter, first-line, before, after)
  # are reported as :text_approx.
  return [base, :text_approx, nil] unless attr_hit

  [base, child_only ? :direct_attr : :attr, attr_hit[1].strip]
end
67
+
68
+ if AVAILABLE_DOM
69
+ # Lightweight wrapper: two slots, the native doc + node id.
70
+ # Walks like a Dom::Element so the rest of Scrapetor can treat
71
+ # it the same.
72
+ class Element
73
+ attr_reader :doc, :id
74
+
75
+ def initialize(doc, id, wrapper = nil)
76
+ @doc = doc
77
+ @id = id
78
+ @wrapper = wrapper
79
+ @dom_node = nil
80
+ end
81
+
82
+ # The DocumentWrapper governs the native arena and any lazy
83
+ # Dom view used for mutations / fallback selectors. Surface it
84
+ # so subclasses and nav helpers can stay coherent.
85
+ def wrapper
86
+ @wrapper ||= @doc.instance_variable_get(:@__scrapetor_wrapper)
87
+ end
88
+
89
+ def element?
90
+ @dom_node ? @dom_node.element? : @doc.node_is_element(@id)
91
+ end
92
+
93
+ def text?; @dom_node ? @dom_node.text? : @doc.node_type(@id) == 3; end
94
+ def comment?; @dom_node ? @dom_node.comment? : @doc.node_type(@id) == 8; end
95
+ def document?; @dom_node ? @dom_node.document? : @doc.node_type(@id) == 9; end
96
+
97
+ def name
98
+ dom_node? ? @dom_node.name : @doc.node_name(@id)
99
+ end
100
+ alias node_name name
101
+ alias tag_name name
102
+
103
+ def [](key)
104
+ dom_node? ? @dom_node[key.to_s] : @doc.node_attr(@id, key.to_s)
105
+ end
106
+ alias get_attribute []
107
+ alias attribute_value []
108
+
109
+ def attributes
110
+ if dom_node?
111
+ @dom_node.attributes
112
+ else
113
+ @doc.node_attributes(@id)
114
+ end
115
+ end
116
+
117
+ # Lightweight pair returned from `attribute_nodes` / `attribute`.
118
+ # The `.text` / `.content` / `.inner_text` accessors mirror what
119
+ # Nokogiri's Nokogiri::XML::Attr exposes — production parser code
120
+ # iterates `node.attribute_nodes` and reads `.text` on each.
121
# Name/value pair for a single attribute. `text`, `content` and
# `inner_text` all return the value coerced to a String; `to_s` renders
# the pair as name="value" (the value is interpolated as-is).
AttrNode = Struct.new(:name, :value) do
  def text
    value.to_s
  end
  alias_method :content, :text
  alias_method :inner_text, :text

  def to_s
    "#{name}=\"#{value}\""
  end
end
129
+
130
+ def attribute_nodes
131
+ if dom_node?
132
+ @dom_node.attribute_nodes
133
+ else
134
+ attributes.map { |k, v| AttrNode.new(k, v) }
135
+ end
136
+ end
137
+
138
+ def attribute(name)
139
+ if dom_node?
140
+ @dom_node.attribute(name)
141
+ else
142
+ v = self[name]
143
+ v && AttrNode.new(name.to_s, v)
144
+ end
145
+ end
146
+
147
+ def keys; dom_node? ? @dom_node.keys : attributes.keys; end
148
+ def values; dom_node? ? @dom_node.values : attributes.values; end
149
# True when the element carries attribute +k+.
# Dom-backed elements delegate to the Dom node; native elements treat any
# non-nil result of `node_attr` (looked up by the stringified key) as
# "present".
def has_attribute?(k)
  return @dom_node.has_attribute?(k) if dom_node?

  value = @doc.node_attr(@id, k.to_s)
  !value.nil?
end
156
+ alias key? has_attribute?
157
+
158
+ # Stable identity used to relocate this node inside a lazy Dom
159
+ # view after the document switches to dom-mode. Builds the same
160
+ # `/tag[idx]/.../tag[@id='x']` shape we already exposed publicly.
161
+ # Memoized per-id on the document wrapper so a fallback-heavy
162
+ # parser doesn't pay the O(depth*siblings) walk per at_css call.
163
# Build a `/tag[idx]/.../tag[idx]` locator for this element by walking up
# through `parent`, counting same-named preceding element siblings for
# each step. The walk stops early at the first ancestor-or-self that has
# a non-empty `id` attribute, which is emitted as `tag[@id='x']`.
#
# The result is memoized on the document wrapper, keyed by native node
# id. The memo is used ONLY for native-backed elements: dom-backed
# wrappers are created with the id of the element they were derived from,
# so their @id does not identify them, and reading or writing the memo
# under that id would return (or store) another node's path.
#
# @return [String] the computed or cached path
def path
  cache = dom_node? ? nil : wrapper
  if cache && (cached = cache.cached_path(@id))
    return cached
  end

  parts = []
  cur = self
  while cur && cur.element?
    id = cur["id"]
    if id && !id.empty?
      # An id anchors the path; nothing above it is needed.
      parts.unshift("#{cur.name}[@id='#{id}']")
      break
    end
    # 1-based position among preceding siblings with the same tag name.
    idx = 1
    sib = cur.previous_sibling
    while sib
      idx += 1 if sib.element? && sib.name == cur.name
      sib = sib.previous_sibling
    end
    parts.unshift("#{cur.name}[#{idx}]")
    cur = cur.parent
  end
  str = "/" + parts.join("/")
  cache.store_path(@id, str) if cache
  str
end
191
+
192
+ def fragment?; false; end
193
+ def cdata?; false; end
194
+ def processing_instruction?; false; end
195
+
196
+ def text
197
+ dom_node? ? @dom_node.text : @doc.node_text(@id)
198
+ end
199
+ alias content text
200
+ alias inner_text text
201
+
202
+ def parent
203
+ if dom_node?
204
+ p = @dom_node.parent
205
+ return nil if p.nil?
206
+ return nil unless p.respond_to?(:element?) && p.element?
207
+ wrap_dom(p)
208
+ else
209
+ pid = @doc.node_parent(@id)
210
+ pid ? Element.new(@doc, pid, wrapper) : nil
211
+ end
212
+ end
213
+
214
+ def children
215
+ if dom_node?
216
+ @dom_node.children.map { |c| wrap_dom(c) }
217
+ else
218
+ @doc.node_children(@id).map { |cid| Element.new(@doc, cid, wrapper) }
219
+ end
220
+ end
221
+
222
+ def element_children
223
+ if dom_node?
224
+ @dom_node.element_children.map { |c| wrap_dom(c) }
225
+ else
226
+ @doc.node_element_children(@id).map { |cid| Element.new(@doc, cid, wrapper) }
227
+ end
228
+ end
229
+ alias elements element_children
230
+
231
+ def first_element_child
232
+ if dom_node?
233
+ c = @dom_node.first_element_child
234
+ c && wrap_dom(c)
235
+ else
236
+ ids = @doc.node_element_children(@id)
237
+ ids.empty? ? nil : Element.new(@doc, ids.first, wrapper)
238
+ end
239
+ end
240
+
241
+ def last_element_child
242
+ if dom_node?
243
+ c = @dom_node.last_element_child
244
+ c && wrap_dom(c)
245
+ else
246
+ ids = @doc.node_element_children(@id)
247
+ ids.empty? ? nil : Element.new(@doc, ids.last, wrapper)
248
+ end
249
+ end
250
+
251
+ def next_sibling
252
+ if dom_node?
253
+ n = @dom_node.next_sibling
254
+ n && wrap_dom(n)
255
+ else
256
+ nid = @doc.node_next_sibling(@id)
257
+ nid ? Element.new(@doc, nid, wrapper) : nil
258
+ end
259
+ end
260
+
261
+ def previous_sibling
262
+ if dom_node?
263
+ n = @dom_node.previous_sibling
264
+ n && wrap_dom(n)
265
+ else
266
+ nid = @doc.node_prev_sibling(@id)
267
+ nid ? Element.new(@doc, nid, wrapper) : nil
268
+ end
269
+ end
270
+
271
+ def next_element_sibling
272
+ if dom_node?
273
+ n = @dom_node.next_element_sibling
274
+ n && wrap_dom(n)
275
+ else
276
+ cur = @doc.node_next_sibling(@id)
277
+ while cur && !@doc.node_is_element(cur)
278
+ cur = @doc.node_next_sibling(cur)
279
+ end
280
+ cur ? Element.new(@doc, cur, wrapper) : nil
281
+ end
282
+ end
283
+
284
+ def previous_element_sibling
285
+ if dom_node?
286
+ n = @dom_node.previous_element_sibling
287
+ n && wrap_dom(n)
288
+ else
289
+ cur = @doc.node_prev_sibling(@id)
290
+ while cur && !@doc.node_is_element(cur)
291
+ cur = @doc.node_prev_sibling(cur)
292
+ end
293
+ cur ? Element.new(@doc, cur, wrapper) : nil
294
+ end
295
+ end
296
+
297
+ def classes
298
+ dom_node? ? @dom_node.classes : @doc.node_classes(@id)
299
+ end
300
+
301
+ def has_class?(klass); classes.include?(klass.to_s); end
302
+
303
+ # ----- selectors -----
304
+
305
+ # Slow path for css(). Native fast path is installed as a C
306
+ # method (`native_css`) at module load and aliased to `css`,
307
+ # so the heavy Ruby dispatch only runs for shapes that the C
308
+ # path can't handle directly (heterogeneous pseudo groups,
309
+ # post-peel attr/text transforms, dom-mode mutated trees, etc.).
310
+ def css_slow(selector)
311
+ str = selector.is_a?(String) ? selector : selector.to_s
312
+ if str.include?(",") && str.include?("::") &&
313
+ Native.heterogeneous_pseudo_groups?(str)
314
+ return Native.split_selector_groups(str).flat_map { |g| css(g).to_a }
315
+ end
316
+ stripped, kind, arg = Native.peel_pseudo_element(str)
317
+ stripped = "*" if stripped.empty?
318
+ if kind && %i[text text_approx attr].include?(kind) && !dom_node?
319
+ w = wrapper
320
+ plan = w ? w.compiled_plan(stripped) : Native.compile_selector_chain(stripped)
321
+ if plan && !stripped.include?(",")
322
+ ids = @doc.run_chain(plan, @id)
323
+ return case kind
324
+ when :text, :text_approx
325
+ wire_text_parents!(@doc.bulk_text(ids), ids, w)
326
+ when :attr
327
+ wire_text_parents!(@doc.bulk_attr(ids, arg), ids, w)
328
+ end
329
+ end
330
+ end
331
+ nodes = css_native_or_fallback(stripped)
332
+ apply_pseudo_element(nodes, kind, arg)
333
+ end
334
+
335
+ def at_css_slow(selector)
336
+ str = selector.is_a?(String) ? selector : selector.to_s
337
+ # Shared memo also covers the comma/pseudo-element slow path
338
+ # — many SerpApi-style parsers call the same complex selector
339
+ # repeatedly, and across identical-HTML iterations we can
340
+ # short-circuit before even peeling.
341
+ if !@dom_node && @id.is_a?(Integer)
342
+ cached = @doc.cache_get(str, @id)
343
+ if cached
344
+ first = cached[0]
345
+ return first.nil? ? nil : Element.new(@doc, first, @wrapper)
346
+ end
347
+ end
348
+ if str.include?(",") && str.include?("::") &&
349
+ Native.heterogeneous_pseudo_groups?(str)
350
+ Native.split_selector_groups(str).each do |g|
351
+ hit = at_css(g)
352
+ return hit if hit
353
+ end
354
+ return nil
355
+ end
356
+ stripped, kind, arg = Native.peel_pseudo_element(str)
357
+ stripped = "*" if stripped.empty?
358
+ nodes = css_native_or_fallback(stripped, limit_one: true)
359
+ return nil if nodes.empty?
360
+ return nodes.first unless kind
361
+ apply_pseudo_element(nodes, kind, arg).first
362
+ end
363
+
364
+ def xpath(_expr); []; end
365
+ def at_xpath(_expr); nil; end
366
+
367
+ # Batch API at the Element level. Pass an array of selector
368
+ # strings; receive parallel results in one C round trip.
369
+ # Selectors ending in `::text` / `::attr(...)` come back as
370
+ # Arrays of strings; everything else as a NodeSet.
371
# Run several selectors against this element in one `batch_chain` call.
#
# Each selector is peeled for a trailing pseudo-element, and single-group
# selectors are compiled through the wrapper's plan cache. Selectors that
# contain a comma or fail to compile are answered by `css` instead.
#
# Result shape per selector (parallel to +selectors+):
#   * `::text` and the approximated pseudo-elements -> `bulk_text` values
#   * `::attr(x)`                                   -> `bulk_attr` values
#   * `> ::text` / `> ::attr(x)`                    -> `apply_pseudo_element`
#     output, the same transform `css_slow` applies to these kinds
#   * plain selector                                -> Array of Element
#   * fallback                                      -> whatever `css` returns
#
# @param selectors [Array<#to_s>, nil]
# @return [Array] one result per selector; [] for nil or empty input
def batch_css(selectors)
  return [] if selectors.nil? || selectors.empty?
  w = @wrapper
  # No plan cache available, or already dom-backed: answer one by one.
  return selectors.map { |s| css(s) } if w.nil? || @dom_node

  count = selectors.size
  plans = Array.new(count)
  kinds = Array.new(count)
  args = Array.new(count)
  selectors.each_with_index do |sel, i|
    str = sel.is_a?(String) ? sel : sel.to_s
    stripped, kinds[i], args[i] = Native.peel_pseudo_element(str)
    stripped = "*" if stripped.empty?
    # Multi-group selectors are never planned here; plans[i] stays nil
    # and the selector is handled by the fallback pass below.
    next if stripped.include?(",")

    plans[i] = w.compiled_plan(stripped)
  end

  # Unplanned slots are sent as empty plans so indexes stay aligned.
  id_lists = @doc.batch_chain(plans.map { |p| p || [] }, @id)
  out = Array.new(count)
  id_lists.each_with_index do |ids, i|
    next unless plans[i]

    kind = kinds[i]
    arg = args[i]
    out[i] =
      case kind
      when :text, :text_approx
        wire_text_parents!(@doc.bulk_text(ids), ids, w)
      when :attr
        wire_text_parents!(@doc.bulk_attr(ids, arg), ids, w)
      when :direct_text, :direct_attr
        # Direct-child variants need the per-node transform; without
        # this branch they were returned as bare Elements.
        apply_pseudo_element(ids.map { |nid| Element.new(@doc, nid, w) }, kind, arg)
      else
        ids.map { |nid| Element.new(@doc, nid, w) }
      end
  end

  # Fall back per-selector for the ones that did not get a native plan.
  plans.each_index { |i| out[i] = css(selectors[i]) unless plans[i] }
  out
end
419
+
420
+ # Hash-form batch: map of {key => selector} → {key => result}.
421
+ # The classic scrape pattern shaped as a single declarative call.
422
# Hash-form wrapper around `batch_css`: the values of +map+ are the
# selectors, and each key is paired with the result at the same position.
#
# @param map [Hash{Object => #to_s}] key => selector
# @return [Hash] key => batch_css result for that selector
def extract_css(map)
  fields = map.keys
  fields.zip(batch_css(map.values)).to_h
end
429
+
430
+ # Single-result extract — one C call, fields compiled in C,
431
+ # field iteration in C, result hash assembled in C. Falls
432
+ # back to the per-field Ruby loop only when a selector can't
433
+ # be compiled natively.
434
+ def extract(map)
435
+ return slow_extract(map) if @dom_node || @wrapper.nil?
436
+ r = @doc.extract_one_h(@id, map, @wrapper)
437
+ return slow_extract(map) if r.equal?(true)
438
+ r
439
+ end
440
+
441
+ # extract_each — one C call covers compile + outer plan run +
442
+ # every (match × field) tuple resolution. Outer selector is
443
+ # peeled inside C. Falls back to Ruby per-row only when any
444
+ # selector can't compile natively.
445
+ def extract_each(outer_selector, fields)
446
+ return slow_extract_each(outer_selector, fields) if @dom_node || @wrapper.nil?
447
+ outer_str = outer_selector.is_a?(String) ? outer_selector : outer_selector.to_s
448
+ r = @doc.extract_each_h(outer_str, @id, fields, @wrapper)
449
+ return slow_extract_each(outer_selector, fields) if r.equal?(true)
450
+ r
451
+ end
452
+
453
+ private
454
+
455
# Per-field Ruby fallback for `extract`: resolve each selector in +map+
# with `at_css` and collect the hits under the original keys.
#
# @param map [#each_pair] key => selector
# @return [Hash] key => at_css result (nil when nothing matched)
def slow_extract(map)
  map.each_pair.each_with_object({}) do |(field, sel), acc|
    acc[field] = at_css(sel)
  end
end
460
+
461
+ def slow_extract_each(outer_selector, fields)
462
+ css(outer_selector).to_a.map do |n|
463
+ elem = n.is_a?(Element) ? n : (n.respond_to?(:backing_node) ? n.backing_node : n)
464
+ elem.is_a?(Element) ? elem.extract(fields) : Node.new(@doc, elem).extract(fields)
465
+ end
466
+ end
467
+
468
+ public
469
+
470
# Does this element match +selector+?
#
# Implemented by running the selector through the document wrapper's
# `css` and checking whether any returned node `==` this element.
# Returns false when no wrapper is available.
#
# @param selector [String]
# @return [Boolean]
def matches?(selector)
  owner = wrapper
  return false unless owner

  owner.css(selector).any? { |candidate| candidate == self }
end
481
+
482
+ # ----- serialization -----
483
+
484
# Serialize this element's children.
#
# Dom-backed elements delegate to the Dom node. For native elements the
# children are walked in document order: element children contribute
# their `to_html`, text children contribute their text verbatim (no
# entity escaping is applied here, matching the previous behaviour), and
# any other node type is skipped.
#
# Previously all element children were emitted first and all text
# afterwards, so mixed content such as `Hello <b>x</b>!` came back as
# `<b>x</b>Hello !`.
#
# @return [String]
def inner_html
  return @dom_node.inner_html if dom_node?

  children.each_with_object(+"") do |child, html|
    if child.element?
      html << child.to_html
    elsif child.text?
      html << child.text.to_s
    end
  end
end
491
+
492
# Serialize this element including its own tag.
#
# Dom-backed elements delegate to the Dom node. Native elements render
# ` key="value"` for each attribute (value passed through
# `Dom.escape_attr`). A tag listed in `Dom::VOID` that has no children is
# emitted as a lone open tag; everything else gets open tag, `inner_html`
# and a closing tag.
#
# @return [String]
def outer_html
  return @dom_node.outer_html if dom_node?

  rendered_attrs = attributes.map { |key, val| " #{key}=\"#{Dom.escape_attr(val)}\"" }.join
  open_tag = "<#{name}#{rendered_attrs}>"
  return open_tag if Dom::VOID.include?(name) && @doc.node_children(@id).empty?

  "#{open_tag}#{inner_html}</#{name}>"
end
504
+ alias to_html outer_html
505
+ alias to_xml outer_html
506
+ alias to_s outer_html
507
+
508
+ def node_type
509
+ dom_node? ? @dom_node.node_type : @doc.node_type(@id)
510
+ end
511
+ alias type node_type
512
+
513
+ def ==(other)
514
+ return true if equal?(other)
515
+ return false unless other.is_a?(Element)
516
+ if dom_node? && other.dom_backed?
517
+ @dom_node.equal?(other.dom_node)
518
+ elsif !dom_node? && !other.dom_backed?
519
+ @doc.equal?(other.doc) && @id == other.id
520
+ else
521
+ false
522
+ end
523
+ end
524
+ alias eql? ==
525
+
526
+ def hash
527
+ if dom_node?
528
+ @dom_node.object_id
529
+ else
530
+ [@doc.object_id, @id].hash
531
+ end
532
+ end
533
+
534
+ def fingerprint
535
+ Scrapetor::Fingerprint.structural(self)
536
+ end
537
+
538
+ # ----- mutation API -----
539
+ #
540
+ # The native arena DOM is immutable by design (it gives us the
541
+ # zero-copy parse + 137x Lexbor lead). Mutations promote the
542
+ # document to a Ruby `Dom::Document` once, then operate on the
543
+ # equivalent Dom node. Reads continue to work on either side.
544
+
545
+ def []=(key, value)
546
+ ensure_dom!
547
+ @dom_node[key.to_s] = value.nil? ? nil : value.to_s
548
+ value
549
+ end
550
+ alias set_attribute []=
551
+
552
+ def remove_attribute(key)
553
+ ensure_dom!
554
+ @dom_node.remove_attribute(key.to_s)
555
+ self
556
+ end
557
+ alias delete_attribute remove_attribute
558
+
559
+ def add_class(klass)
560
+ ensure_dom!
561
+ @dom_node.add_class(klass.to_s)
562
+ self
563
+ end
564
+ alias append_class add_class
565
+
566
+ def remove_class(klass = nil)
567
+ ensure_dom!
568
+ @dom_node.remove_class(klass && klass.to_s)
569
+ self
570
+ end
571
+
572
+ def content=(text)
573
+ ensure_dom!
574
+ @dom_node.content = text.to_s
575
+ text
576
+ end
577
+ alias text= content=
578
+
579
+ def inner_html=(html)
580
+ # Native fast path: parse the fragment in C and graft it
581
+ # directly into the arena, no Ruby Dom round-trip. The
582
+ # selector engine continues to query the native arena on
583
+ # subsequent reads (with a parent-walk descendant fallback
584
+ # for the now-non-contiguous fragment subtree).
585
+ if !@dom_node && @wrapper && !@wrapper.dom_mode?
586
+ ok = @doc.node_set_inner_html(@id, html.to_s)
587
+ return html if ok == true
588
+ end
589
+ ensure_dom!
590
+ @dom_node.inner_html = html.to_s
591
+ html
592
+ end
593
+
594
+ def add_child(node_or_html)
595
+ ensure_dom!
596
+ @dom_node.add_child(unwrap_for_mutation(node_or_html))
597
+ end
598
+ alias << add_child
599
+
600
+ def add_previous_sibling(node_or_html)
601
+ ensure_dom!
602
+ @dom_node.add_previous_sibling(unwrap_for_mutation(node_or_html))
603
+ end
604
+ alias before add_previous_sibling
605
+
606
+ def add_next_sibling(node_or_html)
607
+ ensure_dom!
608
+ @dom_node.add_next_sibling(unwrap_for_mutation(node_or_html))
609
+ end
610
+ alias after add_next_sibling
611
+
612
+ def replace(node_or_html)
613
+ ensure_dom!
614
+ @dom_node.replace(unwrap_for_mutation(node_or_html))
615
+ end
616
+ alias swap replace
617
+ alias replace_with replace
618
+
619
+ # Detach this element from its parent. When we're still on the
620
+ # native arena, mutate it in place — that avoids the cross-DOM
621
+ # path lookup (which can't always pin down a node on HTML where
622
+ # the native vs Ruby SAX parsers disagree about whitespace or
623
+ # implicit close tags). Once the document has been promoted to
624
+ # Ruby Dom by some other mutation, delegate to that side.
625
+ def remove
626
+ if @dom_node
627
+ @dom_node.remove
628
+ else
629
+ @doc.node_remove(@id)
630
+ end
631
+ self
632
+ end
633
+ alias unlink remove
634
+ alias delete remove
635
+
636
+ # Wrap this element in a parsed HTML fragment whose deepest
637
+ # descendant becomes the new parent. Matches Nokogiri's
638
+ # Node#wrap semantics.
639
+ def wrap(html_or_node)
640
+ ensure_dom!
641
+ @dom_node.wrap(html_or_node) if @dom_node.respond_to?(:wrap)
642
+ self
643
+ end
644
+
645
# Pre-order walk over this element and its element descendants.
# The block is yielded this element first, then each element child is
# traversed recursively. Only `element_children` are visited.
#
# @yieldparam node [Element]
# @return [self] when a block is given
# @return [Enumerator] when no block is given
def traverse(&block)
  return enum_for(:traverse) unless block_given?

  yield self
  element_children.each { |child| child.traverse(&block) }
  self
end
654
+
655
+ # Internal: was this Element already promoted to a Dom::Element?
656
+ def dom_backed?
657
+ dom_node?
658
+ end
659
+
660
+ def dom_node
661
+ @dom_node
662
+ end
663
+
664
+ # Public version of the lazy dom-promotion step. NodeSet#remove
665
+ # uses it to resolve every node to its Dom equivalent BEFORE the
666
+ # first mutation, so subsequent removals don't shift the path
667
+ # index under their feet.
668
+ def promote_to_dom!
669
+ ensure_dom!
670
+ @dom_node
671
+ end
672
+
673
# Turn matched nodes into the values requested by a peeled pseudo-element.
#
#   nil                  -> nodes returned untouched
#   :text, :text_approx  -> one TextNode per node holding `node.text`
#                           (or `node.to_s` when it has no #text)
#   :direct_text         -> one TextNode per node holding `direct_text_of`
#   :attr                -> one entry per node: TextNode of `node[arg]`,
#                           or nil when the attribute is missing
#   :direct_attr         -> like :attr but missing attributes are dropped
#   anything else        -> nil
#
# Every TextNode gets `parent_node` set to its source node when that node
# reports `element?`.
def apply_pseudo_element(nodes, kind, arg)
  build = lambda do |str, owner|
    tn = Scrapetor::TextNode.new(str)
    tn.parent_node = owner if owner.respond_to?(:element?) && owner.element?
    tn
  end
  attr_of = ->(node) { node.respond_to?(:[]) ? node[arg] : nil }

  case kind
  when nil
    nodes
  when :text, :text_approx
    nodes.map do |node|
      build.call(node.respond_to?(:text) ? node.text.to_s : node.to_s, node)
    end
  when :direct_text
    collected = []
    nodes.each { |node| collected << build.call(direct_text_of(node), node) }
    collected
  when :attr
    nodes.map do |node|
      value = attr_of.call(node)
      value.nil? ? nil : build.call(value, node)
    end
  when :direct_attr
    collected = []
    nodes.each do |node|
      value = attr_of.call(node)
      collected << build.call(value, node) unless value.nil?
    end
    collected
  end
end
711
+
712
+ # Direct text-node children only — handles the convention
713
+ # `parent > ::text` (and `> ::attr(x)`) where descendant text
714
+ # inside child elements must NOT be included.
715
+ DOM_TYPE_TEXT = 3
716
# Concatenate the text of +n+'s immediate text children only; text nested
# inside child elements is not included.
#
# Native-backed Elements are walked through the arena's first-child /
# next-sibling links. Anything else that responds to #children is walked
# in Ruby: text children contribute their text, element children are
# skipped, and every other child contributes its `to_s`.
#
# @param n [Element, #children, Object]
# @return [String] a fresh mutable String ("" when nothing applies)
def direct_text_of(n)
  pieces = +""
  if n.is_a?(Element) && !n.send(:dom_node?)
    arena = @doc
    child_id = arena.node_first_child(n.id)
    while child_id
      pieces << arena.node_text(child_id).to_s if arena.node_type(child_id) == DOM_TYPE_TEXT
      child_id = arena.node_next_sibling(child_id)
    end
  elsif n.respond_to?(:children)
    n.children.each do |child|
      if child.respond_to?(:text?) && child.text?
        pieces << (child.respond_to?(:text) ? child.text.to_s : child.to_s)
      elsif !(child.respond_to?(:element?) && child.element?)
        pieces << child.to_s
      end
    end
  end
  pieces
end
738
+
739
+ # Helper for Element#css: take a bulk_text / bulk_attr result
740
+ # and wire each TextNode's parent to the matching Element wrapper.
741
# Point each TextNode in +values+ at an Element wrapper for the node id
# found at the same index in +ids+. Entries that are not TextNodes (nil
# included) are left alone.
#
# @param values [Array] bulk_text / bulk_attr result, modified in place
# @param ids [Array] node ids parallel to +values+
# @param w [Object, nil] wrapper handed to each new Element
# @return [Array] +values+
def wire_text_parents!(values, ids, w)
  values.each_with_index do |value, idx|
    next unless value.is_a?(Scrapetor::TextNode)

    value.parent_node = Element.new(@doc, ids[idx], w)
  end
  values
end
753
+
754
+ private
755
+
756
+ def dom_node?
757
+ !@dom_node.nil?
758
+ end
759
+
760
+ # Promote this Element (and the underlying document) to the
761
+ # Ruby DOM. After this, all reads and writes hit @dom_node and
762
+ # the wrapper's @dom_doc rather than the native arena.
763
+ #
764
+ # Three-stage lookup, each weaker than the last but always
765
+ # leaving the caller with a mutable Dom::Element to operate on:
766
+ # 1. Strict path-based locate (well-formed HTML where both
767
+ # parsers produce the same element tree).
768
+ # 2. DFS pre-order element-index lookup (handles parsers
769
+ # disagreeing on whitespace text nodes / implicit close
770
+ # tags — element-order is still stable).
771
+ # 3. Isolated subtree parse — feed our own outer_html through
772
+ # the Ruby Dom parser and use the top-level element as the
773
+ # promoted node. Mutations propagate to subsequent reads
774
+ # via @dom_node (Element#outer_html reads from there), so
775
+ # the user's `node.inner_html = ...` etc. always work even
776
+ # if we can't pin the node back into the document's Dom.
777
+ def ensure_dom!
778
+ return @dom_node if @dom_node
779
+ w = wrapper
780
+ raise NotImplementedError, "Mutation requires a DocumentWrapper" if w.nil?
781
+ w.switch_to_dom!
782
+ @dom_node = w.locate_in_dom(path) ||
783
+ w.locate_dom_by_native_id(@id) ||
784
+ isolated_dom_clone
785
+ raise NotImplementedError, "Cannot locate or clone equivalent node" if @dom_node.nil?
786
+ @dom_node
787
+ end
788
+
789
+ def isolated_dom_clone
790
+ html = to_html
791
+ return nil if html.nil? || html.empty?
792
+ frag = Scrapetor::Dom::Parser.fragment(html)
793
+ frag.find { |n| n.respond_to?(:element?) && n.element? }
794
+ end
795
+
796
+ def wrap_dom(node)
797
+ el = Element.new(@doc, @id, wrapper)
798
+ el.instance_variable_set(:@dom_node, node)
799
+ el
800
+ end
801
+
802
# Reduce a mutation argument to something a Dom node accepts.
#
#   Element                      -> its Dom node if it has one, else its HTML
#   Scrapetor::Node              -> same treatment applied to its
#                                   backing_node when that is an Element,
#                                   otherwise the backing_node itself
#   anything else (e.g. String)  -> returned unchanged
def unwrap_for_mutation(input)
  target =
    case input
    when Element then input
    when Scrapetor::Node then input.backing_node
    else return input
    end
  return target unless target.is_a?(Element)

  target.dom_node || target.to_html
end
816
+
817
# Concatenated text of this node's immediate text children (node type 3),
# read from the native document.
#
# @return [String]
def text_only_children
  @doc.node_children(@id)
      .select { |cid| @doc.node_type(cid) == 3 }
      .map { |cid| @doc.node_text(cid) }
      .join
end
823
+
824
+ # Try native first; fall back to the lazy Dom view on the
825
+ # wrapper. Returns an Array of Element wrappers (native or
826
+ # dom-backed).
827
# Resolve +selector_str+ relative to this element, preferring the native
# engine and falling back to the wrapper's Dom view.
#
# Order of attempts:
#   1. Dom-backed element: delegate to the Dom node's `css` (or return []
#      when that node has no `css`, e.g. text/comment nodes).
#   2. Single-group selector with a compiled plan: run it natively.
#   3. Selector groups (comma-split and/or `:is(...)`-expanded): run every
#      group natively and merge the ids, de-duplicated in first-seen order.
#      A single-group selector only takes this route when expansion
#      produced more than one group.
#   4. Any group failing to compile: query the Dom scope located via
#      `path`, or the wrapper's `fallback_dom`.
#   5. No wrapper: [].
#
# @param selector_str [String]
# @param limit_one [Boolean] stop after the first native hit; in the Dom
#   fallback it is applied only for single-group selectors
# @return [Array] Element wrappers (native or dom-backed)
def css_native_or_fallback(selector_str, limit_one: false)
  if dom_node?
    return [] unless @dom_node.respond_to?(:css)
    return @dom_node.css(selector_str).map { |n| wrap_dom(n) }
  end

  w = wrapper
  single_group = !selector_str.include?(",")

  if single_group
    # Fast path: single-group selector with cached plan.
    plan = w ? w.compiled_plan(selector_str) : Native.compile_selector_chain(selector_str)
    if plan
      ids = @doc.run_chain(plan, @id)
      ids = ids.first(1) if limit_one
      return ids.map { |nid| Element.new(@doc, nid, w) }
    end
    # Did not compile as-is: try distributing `:is(...)` alternatives.
    groups = Native.expand_is_groups(selector_str)
    hits = groups.size > 1 ? run_native_groups(groups, w, limit_one) : nil
  else
    groups = Native.split_selector_groups(selector_str)
                   .flat_map { |g| Native.expand_is_groups(g) }
    hits = run_native_groups(groups, w, limit_one)
  end
  return hits if hits
  return [] unless w

  dom_scope = w.locate_in_dom(path) || w.fallback_dom
  if single_group
    list = dom_scope.css(selector_str).to_a
    list = list.first(1) if limit_one
    list.map { |n| wrap_dom(n) }
  else
    dom_scope.css(selector_str).map { |n| wrap_dom(n) }
  end
end

# Run each selector group natively and merge the matches in first-seen
# order without duplicates. Returns nil as soon as a group fails to
# compile; with +limit_one+ it stops after the first hit, before later
# groups are compiled.
def run_native_groups(groups, w, limit_one)
  found = []
  seen = nil
  groups.each do |g|
    plan = w ? w.compiled_plan(g) : Native.compile_selector_chain(g)
    return nil if plan.nil?

    @doc.run_chain(plan, @id).each do |nid|
      seen ||= {}
      next if seen[nid]

      seen[nid] = true
      found << Element.new(@doc, nid, w)
      break if limit_one
    end
    break if limit_one && !found.empty?
  end
  found
end
909
+ end
910
+
911
+ # Install Element#at_css / Element#css as C methods. The C versions
912
+ # do the shape check, plan-cache lookup, run-with-limit, and Element
913
+ # allocation — all without re-entering Ruby method dispatch — and
914
+ # fall through to at_css_slow / css_slow only when the selector
915
+ # shape isn't supported by the fast path.
916
+ if Native.respond_to?(:_register_element_methods)
917
+ Native._register_element_methods(Element)
918
+ Element.class_eval do
919
+ alias_method :at_css, :native_at_css
920
+ alias_method :css, :native_css
921
+ alias at at_css
922
+ alias search css
923
+ end
924
+ end
925
+ if Native.respond_to?(:_register_node_methods) && defined?(Scrapetor::Node)
926
+ Native._register_node_methods(Scrapetor::Node)
927
+ Scrapetor::Node.class_eval do
928
+ alias_method :at, :native_at
929
+ alias_method :at_css, :native_at
930
+ alias_method :css, :native_css
931
+ alias_method :search, :native_css
932
+ end
933
+ end
934
+
935
+ # Document wrapper — wraps Native::Document and provides Dom-like
936
+ # methods so `Scrapetor::Document#backing` can return one of these
937
+ # interchangeably with `Dom::Document`.
938
+ class DocumentWrapper
939
+ attr_reader :native
940
+
941
+ # The compile cache lives on the wrapper so repeated queries
942
+ # (the common case in scraping pipelines, where the same set of
943
+ # selectors run against thousands of pages) skip the parse +
944
+ # native-plan build entirely. Sized to cover typical templates;
945
+ # the oldest-inserted entry is evicted (FIFO) once the cap is reached.
946
+ COMPILE_CACHE_CAP = 1024
947
+
948
# Wrap a native document handle.
#
# Starts outside dom-mode with no Ruby DOM mirror, an empty selector
# compile cache and an empty per-node path cache.
def initialize(native)
  @native = native
  @dom_doc = nil
  @dom_mode = false
  @compile_cache = {}
  # Keyed by native node id; reset by switch_to_dom!.
  @path_cache = {}
  return unless native.respond_to?(:instance_variable_set)

  # Back-pointer: stash this wrapper on the native handle itself.
  native.instance_variable_set(:@__scrapetor_wrapper, self)
end
961
+
962
# Path string previously stored for native node `id`, or nil if none.
def cached_path(id)
  @path_cache.fetch(id, nil)
end
965
+
966
# Remember `str` as the path for native node `id`; returns `str`.
def store_path(id, str)
  @path_cache.store(id, str)
end
969
+
970
+ # Look up (or compile) the native plan for a single selector group.
971
+ # `nil` means "this group uses a feature the native engine
972
+ # doesn't accept" — callers route those to the Ruby fallback.
973
# Look up (or compile and memoise) the native plan for one selector group.
#
# Returns the plan, or nil when Native.compile_selector_chain cannot
# produce one (callers then use the Ruby fallback).
#
# Failed compiles are memoised as `false`. The lookup therefore tests key
# presence: a truthiness check on the stored value would treat every cached
# failure as a miss, recompiling (and re-warning, and possibly evicting a
# live entry) on each call.
def compiled_plan(group_str)
  if @compile_cache.key?(group_str)
    cached = @compile_cache[group_str]
    return cached == false ? nil : cached
  end

  plan = Native.compile_selector_chain(group_str)
  # Hash#shift removes the oldest-inserted pair, which bounds the cache.
  @compile_cache.shift if @compile_cache.size >= COMPILE_CACHE_CAP
  @compile_cache[group_str] = plan.nil? ? false : plan
  warn "[scrap-fallback] #{group_str}" if plan.nil? && ENV["SCRAP_TRACE_FALLBACK"]
  plan
end
985
+
986
# Node-kind predicates and name: the wrapper reports itself as a document.
def element?
  false
end

def document?
  true
end

def name
  "#document"
end
989
+
990
# Element wrapper for the node the native document reports as its root.
def root
  Element.new(@native, @native.root_id, self)
end

# Same as #root.
def root_element
  root
end
996
+
997
# Text of the whole document, read from the Ruby DOM view.
def text
  fallback_dom.text
end

# Same as #text.
def content
  text
end
999
+
1000
+ # ----- selector entry points -----
1001
+
1002
+ # `lazy_css` is the fast path that Document#css uses: it returns
1003
+ # raw ids when the native engine can handle the whole selector,
1004
+ # so the Element-wrap happens once-per-iteration instead of
1005
+ # once-per-result. Falls back to the eager `css` when native
1006
+ # can't handle the selector (kind, fallback dom, etc.).
1007
+ #
1008
+ # Returns a `LazyIds` struct OR an Array of strings (for
1009
+ # ::text/::attr) OR an Array of Element wrappers (when the
1010
+ # selector needs the Dom fallback).
1011
+ LazyIds = Struct.new(:wrapper, :native, :ids)
1012
+
1013
# Selector query that avoids wrapping results when it can.
#
# Returns one of:
# * a LazyIds struct (plain selector handled natively),
# * the values from bulk_text / bulk_attr (`::text` / `::attr` handled
#   natively), with parents wired by wire_parent_nodes!,
# * an Array built by css_native_or_fallback + apply_transform otherwise.
#
# Comma lists whose groups carry different pseudo-elements are resolved
# group by group and flattened into one Array.
def lazy_css(selector)
  source = selector.to_s
  mixed_groups = source.include?(",") && source.include?("::") &&
                 Native.heterogeneous_pseudo_groups?(source)
  if mixed_groups
    return Native.split_selector_groups(source).flat_map do |group|
      partial = lazy_css(group)
      next partial.to_a unless partial.is_a?(LazyIds)

      partial.ids.map { |nid| Element.new(@native, nid, self) }
    end
  end

  stripped, kind, arg = Native.peel_pseudo_element(source)
  stripped = "*" if stripped.empty?

  unless @dom_mode
    if %i[text text_approx attr].include?(kind)
      ids = native_ids(stripped)
      if ids
        values = kind == :attr ? @native.bulk_attr(ids, arg) : @native.bulk_text(ids)
        return wire_parent_nodes!(values, ids)
      end
    elsif kind.nil?
      ids = native_ids(stripped)
      return LazyIds.new(self, @native, ids) if ids
    end
  end

  apply_transform(css_native_or_fallback(stripped), kind, arg)
end
1045
+
1046
+ # Set each TextNode's parent_node to the matching element it
1047
+ # came from. Production parser code (Google Light's organic
1048
+ # results, Yahoo's knowledge graph) chains `result.parent.css(...)`
1049
+ # to walk into siblings of a `::text` match — without a parent
1050
+ # ref the `.parent` returns nil and the next call crashes.
1051
# For every Scrapetor::TextNode in `values`, set its parent_node to an
# Element built from the id at the same index of `ids`. Other entries are
# left alone. Returns `values`.
def wire_parent_nodes!(values, ids)
  values.length.times do |idx|
    item = values[idx]
    next unless item.is_a?(Scrapetor::TextNode)

    item.parent_node = Element.new(@native, ids[idx], self)
  end
  values
end
1063
+
1064
# Eager selector query.
#
# `selector` may end in a pseudo-element (peeled off by
# Native.peel_pseudo_element). For `::text` / `::attr` kinds outside
# dom-mode the values come straight from bulk_text / bulk_attr when the
# native engine can run the base selector. Every other case goes through
# css_native_or_fallback and apply_transform.
#
# The bulk shortcut is restricted to the kinds it implements. Previously
# any other kind (e.g. :direct_text, :direct_attr, which apply_transform
# supports) entered the shortcut, matched no `when` arm, and made this
# method return nil whenever the native engine could run the base selector.
def css(selector)
  stripped, kind, arg = Native.peel_pseudo_element(selector.to_s)
  stripped = "*" if stripped.empty?

  if !@dom_mode && %i[text text_approx attr].include?(kind)
    ids = native_ids(stripped)
    if ids
      values = kind == :attr ? @native.bulk_attr(ids, arg) : @native.bulk_text(ids)
      return wire_parent_nodes!(values, ids)
    end
  end

  apply_transform(css_native_or_fallback(stripped), kind, arg)
end
1082
+
1083
# First result for `selector`, or nil when nothing matches. With a
# pseudo-element the single matched node is passed through apply_transform
# and the first transformed value is returned.
def at_css(selector)
  stripped, kind, arg = Native.peel_pseudo_element(selector.to_s)
  stripped = "*" if stripped.empty?
  matches = css_native_or_fallback(stripped, limit_one: true)

  if matches.empty?
    nil
  elsif kind
    apply_transform(matches, kind, arg).first
  else
    matches.first
  end
end
alias at at_css
1093
+
1094
+ # Run N selectors in ONE C call, returning an Array of results
1095
+ # parallel to `selectors`. Each result is either a `LazyIds`
1096
+ # (wrapped by Document#css as a lazy NodeSet) or an Array of
1097
+ # strings (for `::text` / `::attr` pseudo-elements). Selectors
1098
+ # the native engine can't compile fall through to the per-query
1099
+ # Ruby path; the rest amortise to one Ruby dispatch.
1100
# Run several selectors with a single @native.batch_chain call.
#
# Returns an Array parallel to `selectors`. Each slot holds either a
# Scrapetor::NodeSet built from (doc, LazyIds), the bulk text/attr values,
# or whatever lazy_css returns for selectors handled one at a time.
#
# A selector is batched only when all of these hold: not in dom-mode, no
# comma in the base selector, the pseudo-element kind is nil / :text /
# :text_approx / :attr, and compiled_plan yields a plan. Everything else
# goes through lazy_css. The kind check matters: the batched branch only
# knows how to post-process text/attr, so any other kind used to come back
# as untransformed element ids.
def batch_css(doc, selectors)
  kinds = Array.new(selectors.size)
  args = Array.new(selectors.size)
  natives = []
  native_to_orig = []
  fallback_indices = []

  selectors.each_with_index do |sel, i|
    stripped, kind, arg = Native.peel_pseudo_element(sel.to_s)
    stripped = "*" if stripped.empty?
    kinds[i] = kind
    args[i] = arg

    batchable = !@dom_mode && !stripped.include?(",") &&
                (kind.nil? || %i[text text_approx attr].include?(kind))
    plan = batchable ? compiled_plan(stripped) : nil
    if plan
      natives << plan
      native_to_orig << i
    else
      fallback_indices << i
    end
  end

  out = Array.new(selectors.size)

  # One native call across all batched plans; result j belongs to the
  # selector at native_to_orig[j].
  unless natives.empty?
    @native.batch_chain(natives, nil).each_with_index do |ids, j|
      orig = native_to_orig[j]
      out[orig] =
        case kinds[orig]
        when :text, :text_approx
          wire_parent_nodes!(@native.bulk_text(ids), ids)
        when :attr
          wire_parent_nodes!(@native.bulk_attr(ids, args[orig]), ids)
        else
          LazyIds.new(self, @native, ids)
        end
    end
  end

  # Per-selector path for everything that could not be batched.
  fallback_indices.each { |i| out[i] = lazy_css(selectors[i]) }

  # LazyIds become NodeSets; other results pass through unchanged.
  out.map! { |r| r.is_a?(LazyIds) ? Scrapetor::NodeSet.new(doc, r) : r }
  out
end
1162
+
1163
# XPath entry points that ignore the expression: always an empty Array.
def xpath(_expr)
  []
end

# Always nil, whatever the expression.
def at_xpath(_expr)
  nil
end
1165
+
1166
# Delegate traversal to the root element. Without a block an Enumerator
# over #traverse is returned; with one, the wrapper itself is returned.
def traverse(&block)
  if block_given?
    root.traverse(&block)
    self
  else
    enum_for(:traverse)
  end
end
1171
+
1172
# Serialised markup: from the Ruby DOM once in dom-mode, otherwise from
# the native document.
def to_html
  return @dom_doc.to_html if @dom_mode

  @native.html
end
alias to_s to_html
1176
+
1177
# Convenience accessors: #html is the root element; #body and #head are
# the first matches for the corresponding tag selector.
def html
  root
end

def body
  at_css("body")
end

def head
  at_css("head")
end
1188
+
1189
+ # ----- internals for the mutation fallback -----
1190
+
1191
# Current value of the dom-mode flag.
def dom_mode?
  @dom_mode
end
1192
+
1193
+ # Build (once) and return the Ruby DOM view of this document.
1194
+ # Used by Element#css fallback when the selector exceeds the
1195
+ # native engine's grammar, and by Element mutations.
1196
+ #
1197
+ # The previous implementation re-tokenised the entire HTML
1198
+ # through the Ruby SAX parser — for a 400 KB page that's
1199
+ # 50–100 ms on the first mutating call. The native arena is
1200
+ # already parsed; we can build the Dom tree by walking it
1201
+ # node-by-node in O(N) instead of O(bytes). That drops to
1202
+ # ~5–10 ms on the same page.
1203
# Ruby DOM view of the document, built on first use by
# build_dom_from_native and memoised in @dom_doc.
def fallback_dom
  return @dom_doc if @dom_doc

  @dom_doc = build_dom_from_native
end
1206
+
1207
+ # O(N nodes) tree-walk that materialises a Scrapetor::Dom
1208
+ # mirror of the native arena. Used for the mutation fallback
1209
+ # path so node mutations have a Ruby-side handle to operate
1210
+ # on, without re-tokenising the source HTML.
1211
# Build a Scrapetor::Dom::Document mirroring the native nodes.
#
# Walks ids 1...@native.size in order. Nodes whose type is not 1 (element),
# 3 (text) or 8 (comment) are skipped. Each kept node is attached to the
# mirror of its native parent, or to the document when the parent id is
# nil or has no mirror.
def build_dom_from_native
  dom = Scrapetor::Dom::Document.new
  total = @native.size
  return dom if total <= 1

  mirror = Array.new(total)
  mirror[0] = dom
  (1...total).each do |nid|
    kind = @native.node_type(nid)
    next unless [1, 3, 8].include?(kind)

    owner = mirror[@native.node_parent(nid) || 0] || dom
    built =
      if kind == 1
        Scrapetor::Dom::Element.new(@native.node_name(nid), @native.node_attributes(nid))
      elsif kind == 3
        Scrapetor::Dom::Text.new(@native.node_text(nid))
      else
        Scrapetor::Dom::Comment.new(@native.node_text(nid))
      end
    owner.add_child(built)
    mirror[nid] = built
  end
  dom
end
1245
+
1246
+ # Promote the document to dom-mode. After this, css() runs only
1247
+ # against the Dom view (it is the source of truth for mutations
1248
+ # the user has already made).
1249
# Enter dom-mode: make sure the Ruby DOM view exists, raise the flag, and
# drop every cached node path.
def switch_to_dom!
  fallback_dom # built before the flag flips
  @dom_mode = true
  @path_cache = {}
end
1256
+
1257
+ # Walk a `/tag[idx]/.../tag[@id='x']` path inside the lazy Dom
1258
+ # view. Used by Element#ensure_dom! to relocate itself after
1259
+ # promotion.
1260
# Resolve a `/tag[n]/.../tag[@id='x']` style path against the Ruby DOM view.
#
# Segments are applied left to right starting at the document:
# * `tag[@id='x']` jumps to the first element in the whole document with
#   that tag name and id;
# * `tag[n]` selects the n-th (1-based) element child of the current node
#   with that tag name.
# Returns the node reached, or nil when a segment has another shape or
# matches nothing. An empty path resolves to the document.
def locate_in_dom(path_str)
  dom = fallback_dom
  segments = path_str.to_s.split("/").reject(&:empty?)

  segments.reduce(dom) do |cursor, segment|
    if (m = /\A([\w-]+)\[@id='([^']+)'\]\z/.match(segment))
      wanted_tag = m[1]
      wanted_id = m[2]
      hit = nil
      walk_elements(dom) do |el|
        next unless el.name == wanted_tag && el["id"] == wanted_id

        hit = el
        break
      end
      return nil if hit.nil?

      hit
    elsif (m = /\A([\w-]+)\[(\d+)\]\z/.match(segment))
      wanted_tag = m[1]
      position = m[2].to_i
      kids = cursor.respond_to?(:children) ? cursor.children : []
      siblings = kids.select { |c| c.respond_to?(:element?) && c.element? && c.name == wanted_tag }
      # An empty sibling list is covered by the range check.
      return nil if position < 1 || position > siblings.length

      siblings[position - 1]
    else
      return nil
    end
  end
end
1288
+
1289
+ # Robust cross-DOM lookup. Native ids enumerate every node in
1290
+ # the arena (text, comments, elements). Both parsers visit
1291
+ # ELEMENT nodes in document order, so the N-th element on the
1292
+ # native side is the N-th element on the Ruby side — even when
1293
+ # the two parsers disagree on whitespace text nodes or implicit
1294
+ # close-tag handling. Used as a fallback when the path-based
1295
+ # locator can't find a match.
1296
# Map a native node id to the Ruby DOM element at the same element
# ordinal. Both lookup tables are built lazily and memoised. Returns nil
# when the id has no ordinal (or the ordinal has no DOM element).
def locate_dom_by_native_id(native_id)
  offsets = (@native_element_offset_map ||= build_native_element_offset_map)
  position = offsets[native_id]
  return nil if position.nil?

  (@dom_element_index ||= build_dom_element_index)[position]
end
1303
+
1304
+ private
1305
+
1306
# Hash from native node id to its 0-based ordinal among the ids for which
# @native.node_is_element is truthy, scanning ids 0...@native.size.
def build_native_element_offset_map
  offsets = {}
  @native.size.times do |nid|
    # offsets.size is read before the insert, so it is the next ordinal.
    offsets[nid] = offsets.size if @native.node_is_element(nid)
  end
  offsets
end
1320
+
1321
# Every element of the Ruby DOM view, in the order walk_elements visits them.
def build_dom_element_index
  [].tap do |collected|
    walk_elements(fallback_dom) { |el| collected << el }
  end
end
1326
+
1327
+ public
1328
+
1329
+ # Run the cached plan(s) for a selector and return the raw id
1330
+ # Array, or nil if any group needs the Ruby fallback. Used by
1331
+ # css() to feed bulk_text / bulk_attr without intermediate
1332
+ # Element allocations.
1333
# Raw native ids matching `selector_str`, or nil when any part of it has
# no native plan.
#
# Without a comma the whole string is tried as one plan first; failing
# that, it is expanded with Native.expand_is_groups and only accepted when
# the expansion yields more than one group. With a comma the string is
# split into groups and each group is expanded. Ids from all groups are
# merged in order with duplicates removed.
def native_ids(selector_str)
  if selector_str.include?(",")
    groups = Native.split_selector_groups(selector_str)
                   .flat_map { |g| Native.expand_is_groups(g) }
  else
    direct = compiled_plan(selector_str)
    return @native.run_chain(direct, nil) if direct

    groups = Native.expand_is_groups(selector_str)
    return nil if groups.size <= 1
  end

  merged = []
  groups.each do |group|
    plan = compiled_plan(group)
    return nil unless plan

    @native.run_chain(plan, nil).each { |nid| merged << nid }
  end
  merged.uniq
end
1369
+
1370
# Turn matched nodes into pseudo-element results.
#
# kind nil                -> nodes unchanged
# :text / :text_approx    -> one TextNode per node (node text, or to_s)
# :attr                   -> one entry per node: TextNode of node[arg], or
#                            nil when the attribute is missing
# :direct_text            -> one TextNode per node from direct_text_of_any
# :direct_attr            -> TextNodes of node[arg], missing ones skipped
# Any other kind yields nil. Each TextNode gets parent_node set when the
# source node reports element?.
def apply_transform(nodes, kind, arg)
  wrap = lambda do |value, origin|
    text_node = Scrapetor::TextNode.new(value)
    text_node.parent_node = origin if origin.respond_to?(:element?) && origin.element?
    text_node
  end
  read_attr = ->(node) { node.respond_to?(:[]) ? node[arg] : nil }

  if kind.nil?
    nodes
  elsif kind == :text || kind == :text_approx
    nodes.map { |n| wrap.call(n.respond_to?(:text) ? n.text.to_s : n.to_s, n) }
  elsif kind == :attr
    nodes.map do |n|
      value = read_attr.call(n)
      value.nil? ? nil : wrap.call(value, n)
    end
  elsif kind == :direct_text
    nodes.map { |n| wrap.call(direct_text_of_any(n), n) }
  elsif kind == :direct_attr
    kept = []
    nodes.each do |n|
      value = read_attr.call(n)
      kept << wrap.call(value, n) unless value.nil?
    end
    kept
  end
end
1405
+
1406
+ # Direct text-node children of an element. Used at the
1407
+ # Document/wrapper level — accepts either a native Element or a
1408
+ # Dom-fallback node and pulls only the immediate text children.
1409
# Concatenated text of a node's immediate children.
#
# For a native-backed Element (one whose private dom_node? is false) the
# native child list is walked and only type-3 children contribute. For
# anything else that has #children: children reporting text? contribute
# their text, and children that are not elements contribute their to_s.
def direct_text_of_any(n)
  out = +""
  if n.is_a?(Element) && !n.send(:dom_node?)
    child_id = @native.node_first_child(n.id)
    while child_id
      out << @native.node_text(child_id).to_s if @native.node_type(child_id) == 3
      child_id = @native.node_next_sibling(child_id)
    end
  elsif n.respond_to?(:children)
    n.children.each do |kid|
      if kid.respond_to?(:text?) && kid.text?
        out << (kid.respond_to?(:text) ? kid.text.to_s : kid.to_s)
      else
        is_element = kid.respond_to?(:element?) && kid.element?
        out << kid.to_s unless is_element
      end
    end
  end
  out
end
1430
+
1431
+ private
1432
+
1433
# Depth-first, pre-order visit of every element below `scope`. A child
# counts as an element when it responds to element? and answers true.
# Non-element children are neither yielded nor descended into.
def walk_elements(scope, &block)
  return [] unless scope.respond_to?(:children)

  scope.children.each do |child|
    if child.respond_to?(:element?) && child.element?
      yield child
      walk_elements(child, &block)
    end
  end
end
1441
+
1442
# Resolve `selector_str` to an Array of Element wrappers.
#
# In dom-mode the Ruby DOM view answers directly. Otherwise the native
# engine is tried: a comma-free selector as one plan, then as its
# Native.expand_is_groups expansion (only when that yields several
# groups); a comma list as its split-and-expanded groups. If any group has
# no plan, the whole selector is answered by the Ruby DOM view instead.
#
# limit_one: stop after the first match (at most one element returned).
def css_native_or_fallback(selector_str, limit_one: false)
  via_dom = lambda do
    found = fallback_dom.css(selector_str).to_a
    found = found.first(1) if limit_one
    found.map { |n| wrap_dom_node(n) }
  end
  # The native side does not see edits made through the DOM view.
  return via_dom.call if @dom_mode

  if selector_str.include?(",")
    groups = Native.split_selector_groups(selector_str)
                   .flat_map { |g| Native.expand_is_groups(g) }
  else
    plan = compiled_plan(selector_str)
    if plan
      ids = @native.run_chain(plan, nil)
      ids = ids.first(1) if limit_one
      return ids.map { |nid| Element.new(@native, nid, self) }
    end
    groups = Native.expand_is_groups(selector_str)
    return via_dom.call if groups.size <= 1
  end

  collected = []
  visited = {}
  groups.each do |group|
    plan = compiled_plan(group)
    return via_dom.call if plan.nil?

    @native.run_chain(plan, nil).each do |nid|
      next if visited.key?(nid)

      visited[nid] = true
      collected << Element.new(@native, nid, self)
      break if limit_one
    end
    break if limit_one && !collected.empty?
  end
  collected
end
1514
+
1515
# Element wrapper (native id 0) whose @dom_node is set to the given Ruby
# DOM node.
def wrap_dom_node(dom_el)
  Element.new(@native, 0, self).tap do |shell|
    shell.instance_variable_set(:@dom_node, dom_el)
  end
end
1520
+ end
1521
+ end # if AVAILABLE_DOM
1522
+
1523
+ # ----- selector compilation: CSS string -> chain of native plans -----
1524
+
1525
+ # Each plan entry is `[selector_atom, combinator_or_nil]`.
1526
+ # selector_atom = [tag, classes, id, attrs]
1527
+ # selector_atom = [tag, classes, id, attrs, pseudo_data] # extended
1528
+ # pseudo_data = nil | [flags, nth_a, nth_b, nth_type_a, nth_type_b,
1529
+ # not_inner, is_inner, has_inner, not_has_inner, has_child_inner, not_has_child_inner, has_chain_inner, not_has_chain_inner]
1530
+ # combinator = nil | "descendant" | "child" | "adjacent" | "sibling"
1531
+ #
1532
+ # Returns nil (never raises) when the selector contains a feature
1533
+ # the native engine doesn't accept (sibling combinator, comma at
1534
+ # top level, pseudo with a non-simple inner selector). Callers route
1535
+ # those to the Ruby DOM fallback in this same gem.
1536
+
1537
+ # Mirrors C_PS_* in ext/scrapetor/native/scrapetor_dom.c. Keep in sync.
1538
+ NATIVE_PSEUDO_FLAGS = {
1539
+ "first-child" => 1 << 0,
1540
+ "last-child" => 1 << 1,
1541
+ "only-child" => 1 << 2,
1542
+ "first-of-type" => 1 << 3,
1543
+ "last-of-type" => 1 << 4,
1544
+ "only-of-type" => 1 << 5,
1545
+ "empty" => 1 << 6,
1546
+ "root" => 1 << 7,
1547
+ "checked" => 1 << 8,
1548
+ "disabled" => 1 << 9,
1549
+ "enabled" => 1 << 10,
1550
+ "required" => 1 << 11,
1551
+ "optional" => 1 << 12,
1552
+ "read-only" => 1 << 13,
1553
+ "read-write" => 1 << 14,
1554
+ "any-link" => 1 << 15,
1555
+ "link" => 1 << 15,
1556
+ "scope" => 1 << 23
1557
+ }.freeze
1558
+
1559
+ NATIVE_NTH_BITS = {
1560
+ "nth-child" => 1 << 16,
1561
+ "nth-last-child" => 1 << 17,
1562
+ "nth-of-type" => 1 << 18,
1563
+ "nth-last-of-type" => 1 << 19
1564
+ }.freeze
1565
+
1566
+ NATIVE_PSEUDO_FALLBACK = :__scrapetor_native_fallback__
1567
+
1568
# Translate one selector group into the native plan format: an Array of
# `[[tag, classes, id, attrs, pseudo_data], combinator]` pairs, one per atom
# produced by Scrapetor::Selector.compile.
#
# Returns nil when an atom's pseudos cannot be expressed natively, or when
# compilation raises ArgumentError.
def self.compile_selector_chain(selector_str)
  combinator_names = { descendant: "descendant", child: "child", adj: "adjacent", gen: "sibling" }

  Scrapetor::Selector.compile(selector_str).map do |atom|
    pseudo_data = nil
    if atom.pseudos && !atom.pseudos.empty?
      pseudo_data = native_pseudo_data(atom.pseudos)
      return nil if pseudo_data == NATIVE_PSEUDO_FALLBACK
    end

    tag = atom.tag ? atom.tag.to_s : nil
    # Any combinator not in the table maps to nil.
    [[tag, atom.classes, atom.id, atom.attrs, pseudo_data], combinator_names[atom.combinator]]
  end
rescue ArgumentError
  nil
end
1598
+
1599
+ # Compile the Atom#pseudos list into the thirteen-element Array the C
1600
+ # side reads. Returns NATIVE_PSEUDO_FALLBACK if any pseudo is
1601
+ # outside the native subset (in which case the whole chain falls
1602
+ # back to Ruby).
1603
# Compile an atom's pseudo list into the 13-slot Array handed to the
# native engine:
#
#   [flags, nth_a, nth_b, nth_type_a, nth_type_b, not_inner, is_inner,
#    has_inner, not_has_inner, has_child_inner, not_has_child_inner,
#    has_chain_inner, not_has_chain_inner]
#
# pseudos - Array of [name, arg, double_colon] triples.
#
# Returns NATIVE_PSEUDO_FALLBACK when a pseudo is outside the native subset
# (any pseudo-element, an unknown name, an unparsable nth argument, an
# inner selector no helper accepts).
#
# It also falls back whenever a second pseudo would need a slot that an
# earlier pseudo on the same atom already filled, because the layout has
# one slot per family and cannot keep the two apart:
# * nth-child / nth-last-child share (nth_a, nth_b); nth-of-type /
#   nth-last-of-type share (nth_type_a, nth_type_b);
# * has_inner is shared by `:has(X)`, `:has(+ X)` and `:has(~ X)`;
# * a flat list built from `:is(a):is(b)`, `:has(a):has(b)` or
#   `:has(> a):has(> b)` is indistinguishable from the list for the
#   single comma form `(a, b)`;
# * has_chain_inner / not_has_chain_inner hold one pseudo's chains.
# The `:not` lists other than the chain slot keep accumulating.
def self.native_pseudo_data(pseudos)
  flags = 0
  nth_a = nth_b = 0
  nth_type_a = nth_type_b = 0
  not_inner = []
  is_inner = []
  has_inner = []
  not_has_inner = []
  has_child_inner = []
  not_has_child_inner = []
  has_chain_inner = []
  not_has_chain_inner = []

  nth_child_slot_taken = false
  nth_type_slot_taken = false
  # Every flag whose payload lives in has_inner.
  has_inner_bits = (1 << 22) | (1 << 30) | (1 << 31)

  pseudos.each do |name, arg, double_colon|
    return NATIVE_PSEUDO_FALLBACK if double_colon

    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return NATIVE_PSEUDO_FALLBACK unless a

      if name == "nth-of-type" || name == "nth-last-of-type"
        return NATIVE_PSEUDO_FALLBACK if nth_type_slot_taken

        nth_type_slot_taken = true
        nth_type_a, nth_type_b = a, b
      else
        return NATIVE_PSEUDO_FALLBACK if nth_child_slot_taken

        nth_child_slot_taken = true
        nth_a, nth_b = a, b
      end
      flags |= bit
    elsif name == "not"
      # `:not(:has(X, Y))` with simple inner atoms.
      if (nh = parse_not_has_form(arg))
        not_has_inner.concat(nh)
        flags |= (1 << 24)
        next
      end
      # `:not(:has(> X))` direct-child variant.
      if (nhc = parse_not_has_child_form(arg))
        not_has_child_inner.concat(nhc)
        flags |= (1 << 26)
        next
      end
      # `:not(:has(X Y, A B, ...))` chain variant.
      if (nchains = parse_not_has_chains_form(arg))
        return NATIVE_PSEUDO_FALLBACK if (flags & (1 << 29)) != 0

        not_has_chain_inner = nchains
        flags |= (1 << 29)
        next
      end
      inner = native_inner_simples(arg)
      return NATIVE_PSEUDO_FALLBACK if inner == NATIVE_PSEUDO_FALLBACK

      not_inner.concat(inner)
      flags |= (1 << 20)
    elsif name == "is" || name == "matches" || name == "where"
      return NATIVE_PSEUDO_FALLBACK if (flags & (1 << 21)) != 0

      inner = native_inner_simples(arg)
      return NATIVE_PSEUDO_FALLBACK if inner == NATIVE_PSEUDO_FALLBACK

      is_inner.concat(inner)
      flags |= (1 << 21)
    elsif name == "has"
      # `:has(>::text)` / `:has(::text)` carries no payload, only a flag.
      if has_text_child_form?(arg)
        flags |= (1 << 28)
        next
      end
      # `:has(> X, > Y)`: the atoms right of the `>` are lifted out.
      if (hc = parse_has_child_form(arg))
        return NATIVE_PSEUDO_FALLBACK if (flags & (1 << 25)) != 0

        has_child_inner.concat(hc)
        flags |= (1 << 25)
        next
      end
      # `:has(+ X, + Y)` / `:has(~ X, ~ Y)` sibling-from-scope variants.
      if (hs = parse_has_sib_form(arg, "+"))
        return NATIVE_PSEUDO_FALLBACK if (flags & has_inner_bits) != 0

        has_inner.concat(hs)
        flags |= (1 << 30)
        next
      end
      if (hs = parse_has_sib_form(arg, "~"))
        return NATIVE_PSEUDO_FALLBACK if (flags & has_inner_bits) != 0

        has_inner.concat(hs)
        flags |= (1 << 31)
        next
      end
      # Distribute `:is(...)` alternatives inside the argument (forced even
      # for single-atom alternatives) so each comma group is one atom.
      arg_expanded = Native.split_selector_groups(arg)
                           .flat_map { |g| Native.expand_is_groups(g, force: true) }
                           .join(", ")
      inner = native_inner_simples(arg_expanded)
      if inner != NATIVE_PSEUDO_FALLBACK
        return NATIVE_PSEUDO_FALLBACK if (flags & has_inner_bits) != 0

        has_inner.concat(inner)
        flags |= (1 << 22)
        next
      end
      # `:has(X Y, A B, ...)`: each comma alternative is its own chain.
      if (chains = parse_has_chains_form(arg))
        return NATIVE_PSEUDO_FALLBACK if (flags & (1 << 27)) != 0

        has_chain_inner = chains
        flags |= (1 << 27)
        next
      end
      return NATIVE_PSEUDO_FALLBACK
    else
      return NATIVE_PSEUDO_FALLBACK
    end
  end

  [flags, nth_a, nth_b, nth_type_a, nth_type_b, not_inner, is_inner, has_inner,
   not_has_inner, has_child_inner, not_has_child_inner, has_chain_inner,
   not_has_chain_inner]
end
1732
+
1733
+ # `:not(:has(X Y))` — :not wrapping a single :has with a multi-atom
1734
+ # chain. Returns the chain shape (same as parse_has_chain_form) or
1735
+ # nil. The matching is the negated descendant-chain check.
1736
# Single-chain view of parse_not_has_chains_form: the one chain when
# exactly one was parsed, otherwise nil.
def self.parse_not_has_chain_form(arg)
  chains = parse_not_has_chains_form(arg)
  chains.first unless chains.nil? || chains.size != 1
end
1741
+
1742
# Recognise a `:not(...)` argument that is exactly one bare `:has(...)`
# (single group, single atom, no tag/class/id/attribute, one pseudo) and
# return parse_has_chains_form of the `:has` argument. Returns nil for any
# other shape, or when compilation raises ArgumentError.
def self.parse_not_has_chains_form(arg)
  return nil if arg.nil? || arg.empty?

  groups = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless groups.size == 1

  compiled = Scrapetor::Selector.compile(groups.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil unless !pseudo_element && pseudo_name == "has"

  bare = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless bare

  parse_has_chains_form(has_arg)
rescue ArgumentError
  nil
end
1757
+
1758
+ # `:has(+ X, + Y)` / `:has(~ X, ~ Y)` — every group of the argument
1759
+ # must start with the given sibling combinator. Returns the list of
1760
+ # leaf simple-atom entries (right of the combinator) on success.
1761
# Parse a `:has(...)` argument in which every comma group starts with
# `combinator_char`. What follows the combinator must compile to a single
# atom. Returns one `[tag, classes, id, attrs]` descriptor per group (with
# leaf pseudo data appended when the atom has pseudos), or nil when any
# group has another shape, its pseudos have no leaf encoding, or
# compilation raises ArgumentError.
def self.parse_has_sib_form(arg, combinator_char)
  return nil if arg.nil? || arg.empty?

  Scrapetor::Dom::Selectors.selector_groups(arg).map do |group|
    trimmed = group.strip
    return nil unless trimmed.start_with?(combinator_char)

    compiled = Scrapetor::Selector.compile(trimmed[1..].lstrip)
    return nil unless compiled.size == 1

    leaf = compiled.first
    extra = nil
    if leaf.pseudos && !leaf.pseudos.empty?
      extra = native_leaf_pseudo_data(leaf.pseudos)
      return nil if extra.nil?
    end

    descriptor = [leaf.tag ? leaf.tag.to_s : nil, leaf.classes, leaf.id, leaf.attrs]
    descriptor << extra if extra
    descriptor
  end
rescue ArgumentError
  nil
end
1785
+
1786
+ # `:has(>::text)` / `:has(::text)` — "node has at least one direct
1787
+ # text-node child". The compile would otherwise reject the bare
1788
+ # pseudo-element inside :has, forcing the whole selector to the
1789
+ # Ruby Dom fallback. Cheap-as-shrimp shape detector — just trims
1790
+ # whitespace and an optional leading `>`.
1791
# True when `arg`, ignoring surrounding whitespace and one optional leading
# `>`, is exactly `::text`. nil is never a match.
def self.has_text_child_form?(arg)
  return false if arg.nil?

  # After strip there is no leading whitespace, so the lstrip only matters
  # when a `>` was actually removed.
  arg.strip.delete_prefix(">").lstrip == "::text"
end
1797
+
1798
+ # `:has(X Y)` — single chain (no commas, no leading combinator). The
1799
+ # arg's compile output is multiple atoms joined by descendant/child
1800
+ # combinators. Returns an Array of [simple_atom_entry, combo_str]
1801
+ # pairs (combo_str is "descendant" / "child" / nil). Rejects forms
1802
+ # native_inner_simples already handles (single atom) and forms that
1803
+ # need recursive pseudos.
1804
# Single-chain view of parse_has_chains_form: the one chain when exactly
# one was parsed, otherwise nil.
def self.parse_has_chain_form(arg)
  chains = parse_has_chains_form(arg)
  chains.first unless chains.nil? || chains.size != 1
end
1809
+
1810
+ # `:has(X Y, A B, ...)` — multi-chain. Returns an Array of chains.
1811
+ # Each chain is an Array of [atom_entry, combinator_string] pairs.
1812
+ # The first entry's combinator is nil; subsequent entries carry
1813
+ # descendant/child/adjacent/sibling. Returns nil when any group's
1814
+ # shape isn't a supported chain form (no recursive pseudos beyond
1815
+ # leaf, etc.). Single-atom alternatives are also lifted as 1-long
1816
+ # chains so the caller doesn't have to distinguish.
1817
# Parse a `:has(...)` argument into a list of chains, one per comma group
# (1 to 8 groups accepted). A chain is an Array of `[descriptor, combinator]`
# pairs, where descriptor is `[tag, classes, id, attrs]` plus pseudo data
# when the atom has pseudos. Combinator is one of "descendant", "child",
# "adjacent", "sibling" or nil; an atom with a nil combinator that is not
# first in its chain is treated as "descendant".
#
# Returns nil for an empty argument, an unsupported group count, an empty
# compiled group, pseudos neither helper can encode, or ArgumentError
# during compilation.
def self.parse_has_chains_form(arg)
  return nil if arg.nil? || arg.empty?

  groups = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless (1..8).cover?(groups.size)

  combinator_names = { descendant: "descendant", child: "child", adj: "adjacent", gen: "sibling" }

  groups.map do |group|
    atoms = Scrapetor::Selector.compile(group)
    return nil if atoms.empty?

    atoms.each_with_index.map do |atom, position|
      extra = nil
      if atom.pseudos && !atom.pseudos.empty?
        extra = native_inner_simple_pseudo(atom.pseudos) ||
                native_leaf_pseudo_data(atom.pseudos)
        return nil if extra.nil?
      end

      descriptor = [atom.tag ? atom.tag.to_s : nil, atom.classes, atom.id, atom.attrs]
      descriptor << extra if extra

      link = combinator_names[atom.combinator]
      link = "descendant" if atom.combinator.nil? && !position.zero?
      [descriptor, link]
    end
  end
rescue ArgumentError
  nil
end
1852
+
1853
+ # `:has(> X, > Y)` — every group of the argument must be of shape
1854
+ # `:scope > simple`. Returns the simple atoms (each is the right
1855
+ # side of the `>`) if so, nil otherwise.
1856
# Parse a `:has(> X, > Y)` argument: every comma group must begin with `>`
# and the remainder must compile to one atom. Returns the descriptors
# `[tag, classes, id, attrs]` (plus leaf pseudo data when the atom has
# pseudos), or nil when a group has another shape, its pseudos have no leaf
# encoding, or compilation raises ArgumentError.
def self.parse_has_child_form(arg)
  return nil if arg.nil? || arg.empty?

  descriptors = []
  Scrapetor::Dom::Selectors.selector_groups(arg).each do |group|
    body = group.strip
    return nil unless body.start_with?(">")

    atoms = Scrapetor::Selector.compile(body[1..].lstrip)
    return nil if atoms.size != 1

    child = atoms.first
    pseudo_payload = nil
    unless !child.pseudos || child.pseudos.empty?
      pseudo_payload = native_leaf_pseudo_data(child.pseudos)
      return nil if pseudo_payload.nil?
    end

    descriptor = [child.tag ? child.tag.to_s : nil, child.classes, child.id, child.attrs]
    descriptor.push(pseudo_payload) if pseudo_payload
    descriptors.push(descriptor)
  end
  descriptors
rescue ArgumentError
  nil
end
1880
+
1881
+ # `:not(:has(> X))` — direct-child negative form.
1882
# Recognise a `:not(...)` argument that is exactly one bare `:has(...)`
# (single group, single atom, no tag/class/id/attribute, one pseudo) and
# return parse_has_child_form of the `:has` argument. Returns nil for any
# other shape, or when compilation raises ArgumentError.
def self.parse_not_has_child_form(arg)
  return nil if arg.nil? || arg.empty?

  groups = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless groups.size == 1

  compiled = Scrapetor::Selector.compile(groups.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil unless !pseudo_element && pseudo_name == "has"

  bare = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless bare

  parse_has_child_form(has_arg)
rescue ArgumentError
  nil
end
1897
+
1898
# Examine the payload of a `:not(...)`. When it compiles to exactly
# one atom consisting solely of `:has(simple, simple, ...)` (no tag,
# class, id or attribute constraint next to the `:has`), return the
# inner simple-atom descriptors produced by native_inner_simples so
# the caller can lift them into the C_PS_NOT_HAS path.
#
# Returns nil for every other shape, when native_inner_simples
# reports NATIVE_PSEUDO_FALLBACK, or when an ArgumentError is raised.
def self.parse_not_has_form(arg)
  return nil if arg.nil? || arg.empty?

  groups = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless groups.size == 1

  atoms = Scrapetor::Selector.compile(groups.first)
  return nil unless atoms.size == 1

  candidate = atoms.first
  return nil unless candidate.pseudos && candidate.pseudos.size == 1

  pseudo_name, has_payload, pseudo_element = candidate.pseudos.first
  return nil if pseudo_element || pseudo_name != "has"

  unconstrained = !candidate.tag && candidate.classes.empty? &&
                  !candidate.id && candidate.attrs.empty?
  return nil unless unconstrained

  pool = native_inner_simples(has_payload)
  pool == NATIVE_PSEUDO_FALLBACK ? nil : pool
rescue ArgumentError
  nil
end
1920
+
1921
# Turn an inner-selector argument (e.g. the payload of
# `:not(.x, :empty, .y[z])`) into an array of simple-atom descriptors
# for the C engine. Each descriptor is `[tag, classes, id, attrs]`,
# with a fifth pseudo-data element appended when the atom has pseudos.
#
# Per group:
# * the group must compile to a single atom (no combinators);
# * a pure `:is(...)` / `:matches(...)` / `:where(...)` atom is
#   unwrapped recursively and its alternatives are spliced into the
#   result (recursion is cut off once +depth+ exceeds 4);
# * other pseudos are encoded with native_inner_simple_pseudo, or
#   with native_leaf_pseudo_data when that returns nil.
#
# Returns NATIVE_PSEUDO_FALLBACK when +arg+ is nil/empty, the depth
# limit is hit, any group cannot be represented, or an ArgumentError
# is raised.
def self.native_inner_simples(arg, depth = 0)
  return NATIVE_PSEUDO_FALLBACK if arg.nil? || arg.empty? || depth > 4

  pool = []
  Scrapetor::Dom::Selectors.selector_groups(arg).each do |group|
    compiled = Scrapetor::Selector.compile(group)
    return NATIVE_PSEUDO_FALLBACK unless compiled.size == 1

    simple = compiled.first
    if pure_is_atom?(simple)
      # `:has(:is(X, Y))` / `:not(:is(X, Y))`: feed the alternatives
      # straight into this pool instead of keeping the `:is` wrapper.
      nested = native_inner_simples(simple.pseudos.first[1], depth + 1)
      return NATIVE_PSEUDO_FALLBACK if nested == NATIVE_PSEUDO_FALLBACK
      pool.concat(nested)
    else
      pseudo_data = nil
      if simple.pseudos && !simple.pseudos.empty?
        # Prefer the extended shape (nested :not / :has pools on the
        # inner atom); leaf-only encoding is the second attempt.
        pseudo_data = native_inner_simple_pseudo(simple.pseudos) ||
                      native_leaf_pseudo_data(simple.pseudos)
        return NATIVE_PSEUDO_FALLBACK if pseudo_data.nil?
      end
      descriptor = [simple.tag ? simple.tag.to_s : nil, simple.classes, simple.id, simple.attrs]
      descriptor << pseudo_data if pseudo_data
      pool << descriptor
    end
  end
  pool
rescue ArgumentError
  NATIVE_PSEUDO_FALLBACK
end
1964
+
1965
# True when +atom+ is nothing but a single `:is(...)`,
# `:matches(...)` or `:where(...)` pseudo-class: no tag, no classes,
# no id, no attribute tests and no second pseudo. Only such an atom
# can be replaced by its list of alternatives; with anything else
# attached (e.g. `.x:is(...)`) unwrapping would change the meaning.
# A double-colon pseudo never qualifies.
def self.pure_is_atom?(atom)
  constrained = atom.tag || !atom.classes.empty? || atom.id || !atom.attrs.empty?
  return false if constrained

  pseudo_list = atom.pseudos
  return false unless pseudo_list && pseudo_list.size == 1

  pseudo_name, _payload, pseudo_element = pseudo_list.first
  return false if pseudo_element

  %w[is matches where].include?(pseudo_name)
end
1976
+
1977
# Build the extended pseudo_data slot for a c_simple_atom that itself
# carries `:not(simple)` / `:has(simple)` / `:not(:has(simple))`
# constraints.
#
# pseudos - array of `[name, arg, double_colon]` triples.
#
# Returns `[flags, nth_a, nth_b, nth_type_a, nth_type_b]`, optionally
# followed (in this order, padded with empty arrays so positions stay
# fixed) by:
#   index 5: inner_not       - pool for `:not(simple, ...)`
#   index 6: inner_has       - pool for `:has(simple, ...)`
#   index 7: inner_not_has   - pool for `:not(:has(simple, ...))`
#   index 8: inner_has_chain - chain alternatives for `:has(a b, ...)`
#
# Returns nil when the shape cannot be encoded; the caller then tries
# native_leaf_pseudo_data. Unsupported shapes include:
# * any double-colon pseudo or unknown pseudo name;
# * an nth argument that Selector.parse_nth cannot parse;
# * a second nth pseudo competing for the same coefficient slot
#   (`nth-of-type`/`nth-last-of-type` share one slot, the remaining
#   nth pseudos share the other) - only one (a, b) pair fits, so
#   keeping both flag bits would evaluate one pseudo with the other's
#   coefficients;
# * more than one `:has(...)` on the atom - a pool is a list of
#   alternatives (`:has(a, b)` is "a OR b"), so merging the pools of
#   `:has(a):has(b)` would silently turn AND into OR, and a second
#   chain-form `:has` would overwrite the first.
#
# Repeated `:not(...)` (and `:not(:has(...))`) stay supported: the
# negation of a merged alternatives list is the conjunction of the
# individual negations.
def self.native_inner_simple_pseudo(pseudos)
  flags = 0
  nth_a = nth_b = 0
  nth_type_a = nth_type_b = 0
  nth_taken = false
  nth_type_taken = false
  has_taken = false
  inner_not = []
  inner_has = []
  inner_not_has = []
  inner_has_chain = nil
  pseudos.each do |name, arg, double_colon|
    return nil if double_colon
    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return nil unless a
      if name == "nth-of-type" || name == "nth-last-of-type"
        return nil if nth_type_taken
        nth_type_taken = true
        nth_type_a, nth_type_b = a, b
      else
        return nil if nth_taken
        nth_taken = true
        nth_a, nth_b = a, b
      end
      flags |= bit
    elsif name == "not"
      # `:not(:has(simple))` goes to inner_not_has ...
      if (nh = parse_inner_not_has_form(arg))
        inner_not_has.concat(nh)
        next
      end
      # ... a plain `:not(simple, ...)` goes to inner_not.
      sub = inner_pool_for(arg)
      return nil if sub.nil?
      inner_not.concat(sub)
    elsif name == "has"
      # Only a single `:has` can be expressed (see method comment).
      return nil if has_taken
      has_taken = true
      # Try simple-atom alternatives first, then multi-atom chains.
      sub = inner_pool_for(arg)
      if sub
        inner_has.concat(sub)
      elsif (chain = parse_has_chains_form(arg))
        inner_has_chain = chain
      else
        return nil
      end
    else
      return nil
    end
  end
  out = [flags, nth_a, nth_b, nth_type_a, nth_type_b]
  # A later slot forces every earlier optional slot to be present
  # (possibly empty) so the positional indexes line up.
  need_8 = inner_has_chain && !inner_has_chain.empty?
  need_7 = need_8 || !inner_not_has.empty?
  need_6 = need_7 || !inner_has.empty?
  need_5 = need_6 || !inner_not.empty?
  out << inner_not if need_5
  out << inner_has if need_6
  out << inner_not_has if need_7
  out << inner_has_chain if need_8
  out
end
2043
+
2044
# Compile the payload of a `:not(arg)` / `:has(arg)` into a list of
# leaf simple-atom descriptors used to fill an inner pool on a
# c_simple_atom.
#
# Each group of +arg+ must compile to a single atom. A pure
# `:is(...)`-style atom is replaced by its own alternatives; any
# other pseudos must be encodable by native_leaf_pseudo_data (no
# nested sub-selectors). Descriptors are `[tag, classes, id, attrs]`
# with the leaf pseudo data appended when present.
#
# Returns nil when +arg+ is nil/empty, a group does not fit, or an
# ArgumentError is raised.
def self.inner_pool_for(arg)
  return nil if arg.nil? || arg.empty?

  pool = []
  Scrapetor::Dom::Selectors.selector_groups(arg).each do |group|
    compiled = Scrapetor::Selector.compile(group)
    return nil unless compiled.size == 1

    simple = compiled.first
    if pure_is_atom?(simple)
      alternatives = inner_pool_for(simple.pseudos.first[1])
      return nil if alternatives.nil?
      pool.concat(alternatives)
    else
      leaf = nil
      if simple.pseudos && !simple.pseudos.empty?
        leaf = native_leaf_pseudo_data(simple.pseudos)
        return nil if leaf.nil?
      end
      descriptor = [simple.tag ? simple.tag.to_s : nil, simple.classes, simple.id, simple.attrs]
      descriptor << leaf if leaf
      pool << descriptor
    end
  end
  pool
rescue ArgumentError
  nil
end
2074
+
2075
# Payload check for `:not(:has(simple))`, used by
# native_inner_simple_pseudo to lift the nested negation into
# inner_not_has on the simple atom.
#
# +arg+ (the `:not` payload) must be one group that compiles to one
# atom made up of a single single-colon `:has(...)` and nothing else.
# Returns inner_pool_for applied to the `:has` payload, or nil when
# the shape differs or an ArgumentError is raised.
def self.parse_inner_not_has_form(arg)
  return nil if arg.nil? || arg.empty?

  parts = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless parts.size == 1

  sequence = Scrapetor::Selector.compile(parts.first)
  return nil unless sequence.size == 1

  only = sequence.first
  return nil unless only.pseudos && only.pseudos.size == 1

  pseudo_name, has_payload, pseudo_element = only.pseudos.first
  return nil if pseudo_element
  return nil unless pseudo_name == "has"
  return nil if only.tag || !only.classes.empty? || only.id || !only.attrs.empty?

  inner_pool_for(has_payload)
rescue ArgumentError
  nil
end
2092
+
2093
# Like native_pseudo_data, but rejects any pseudo that requires a
# nested sub-selector (`:not`/`:is`/`:has`). The C `c_simple_atom`
# only has the leaf pseudo fields; the recursive ones would need
# their own inner pool which we don't allocate.
#
# pseudos - array of `[name, arg, double_colon]` triples.
#
# Returns `[flags, nth_a, nth_b, nth_type_a, nth_type_b]`, or nil when
# * a pseudo is double-colon or is in neither NATIVE_PSEUDO_FLAGS nor
#   NATIVE_NTH_BITS;
# * Selector.parse_nth cannot parse an nth argument;
# * two nth pseudos compete for the same coefficient slot
#   (`nth-of-type` and `nth-last-of-type` share one, the remaining nth
#   pseudos share the other). Only one (a, b) pair per slot can be
#   returned, so accepting both would keep both flag bits while one
#   pseudo is evaluated with the other's coefficients.
def self.native_leaf_pseudo_data(pseudos)
  flags = 0
  nth_a = nth_b = 0
  nth_type_a = nth_type_b = 0
  nth_taken = false
  nth_type_taken = false
  pseudos.each do |name, arg, double_colon|
    return nil if double_colon
    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return nil unless a
      if name == "nth-of-type" || name == "nth-last-of-type"
        return nil if nth_type_taken
        nth_type_taken = true
        nth_type_a, nth_type_b = a, b
      else
        return nil if nth_taken
        nth_taken = true
        nth_a, nth_b = a, b
      end
      flags |= bit
    else
      return nil
    end
  end
  [flags, nth_a, nth_b, nth_type_a, nth_type_b]
end
2120
+
2121
# Split a CSS selector on top-level commas, i.e. commas that are not
# inside `[...]` or `(...)`.
#
# Quoted attribute values are honoured: while inside `[...]`, text
# between matching `"` or `'` is copied verbatim (a backslash escapes
# the next character), so a `]` or `,` inside the value - as in
# `[title="a],b"]` - neither closes the bracket nor splits the
# selector. Quote characters outside of brackets are ordinary text,
# which keeps inputs such as `:contains(it's)` working.
#
# s - the selector string.
#
# Returns an array of stripped, non-empty group strings.
def self.split_selector_groups(s)
  groups = []
  buf = +""
  depth = 0     # nesting level of [...]
  paren = 0     # nesting level of (...)
  quote = nil   # active quote character inside [...], if any
  escaped = false
  s.each_char do |ch|
    if quote
      if escaped
        escaped = false
      elsif ch == "\\"
        escaped = true
      elsif ch == quote
        quote = nil
      end
      buf << ch
      next
    end

    case ch
    when '"', "'"
      quote = ch if depth.positive?
      buf << ch
    when "["
      depth += 1
      buf << ch
    when "]"
      depth -= 1 if depth.positive?
      buf << ch
    when "("
      paren += 1
      buf << ch
    when ")"
      paren -= 1 if paren.positive?
      buf << ch
    when ","
      if depth.zero? && paren.zero?
        groups << buf.strip
        buf = +""
      else
        buf << ch
      end
    else
      buf << ch
    end
  end
  groups << buf.strip
  groups.reject(&:empty?)
end
2147
+
2148
+ # Returns true if the comma-separated selector has groups with
2149
+ # different pseudo-element shapes — e.g. `.a > ::text, .b` — so
2150
+ # callers can split + peel per-group instead of one shared peel.
2151
# Compile a {key => selector_string} fields map into the parallel
# (keys, plans, kinds, args) arrays consumed by the C
# extract_one_native / extract_each_native entry points.
#
# fields  - responds to each_pair; values that are not Strings are
#           converted with to_s.
# wrapper - object whose compiled_plan(selector) returns a native
#           plan, or a falsy value when it cannot compile one.
#
# Returns the 4-tuple `[keys, plans, kinds, args]`, or nil as soon as
# one field cannot be handled natively (a comma left in the selector
# after peeling, no plan, or an unsupported pseudo-element kind), in
# which case the caller is expected to use its slower per-row path.
#
# kinds:
#   0 = Element (C side allocates the wrapper)
#   1 = ::text  (TextNode of subtree text)
#   2 = ::attr  (TextNode of attribute value)
#
# A nil plan with kind 2 stands for a bare `::attr(name)` evaluated
# against the scope element itself, so no plan has to run. A selector
# that is empty and has no pseudo-element is treated as `*`.
def self.compile_extract_fields(fields, wrapper)
  keys = []
  plans = []
  kinds = []
  args = []
  fields.each_pair do |key, sel|
    keys << key
    selector_text = sel.is_a?(String) ? sel : sel.to_s
    bare, pseudo_kind, pseudo_arg = peel_pseudo_element(selector_text)
    bare = "*" if bare.empty? && pseudo_kind.nil?

    if bare.empty? && (pseudo_kind == :attr || pseudo_kind == :direct_attr)
      # Bare `::attr(name)`: read straight from the scope element.
      plan = nil
      kind_code = 2
      payload = pseudo_arg.to_s
    else
      return nil if bare.include?(",")
      plan = wrapper.compiled_plan(bare)
      return nil unless plan

      case pseudo_kind
      when :text, :text_approx
        kind_code = 1
        payload = ""
      when :attr
        kind_code = 2
        payload = pseudo_arg.to_s
      when nil
        kind_code = 0
        payload = ""
      else
        return nil # :direct_text / :direct_attr / unsupported
      end
    end

    plans << plan
    kinds << kind_code
    args << payload
  end
  [keys, plans, kinds, args]
end
2194
+
2195
# Memo for heterogeneous_pseudo_groups?, keyed by selector string.
# Once HET_PSEUDO_CACHE_CAP entries are stored, one entry is shifted
# out before each insert.
HET_PSEUDO_CACHE = {}
HET_PSEUDO_CACHE_CAP = 1024

# True when the comma-separated selector +s+ has groups with
# different pseudo-element shapes - e.g. `.a > ::text, .b` - so
# callers can split and peel per group instead of using one shared
# peel. The shape of a group is the second element returned by
# peel_pseudo_element. Results are memoised in HET_PSEUDO_CACHE.
def self.heterogeneous_pseudo_groups?(s)
  memo = HET_PSEUDO_CACHE[s]
  return memo unless memo.nil?

  shapes = split_selector_groups(s).map { |group| peel_pseudo_element(group)[1] }
  mixed = shapes.uniq.size > 1

  HET_PSEUDO_CACHE.shift unless HET_PSEUDO_CACHE.size < HET_PSEUDO_CACHE_CAP
  HET_PSEUDO_CACHE[s] = mixed
end
2207
+
2208
# `:is(A, B C)`-distribution. Finds a `:is(...)` / `:matches(...)` /
# `:where(...)` token that starts at an atom boundary (start of the
# string, or right after whitespace, a combinator or a comma) and
# returns one group string per alternative, with the alternative
# substituted in. Without this rewrite a selector like
# `:is(aside, main .x) .y` falls back to the Ruby DOM parser because
# the native engine can't represent multi-atom alternatives inside
# `:is`.
IS_AT_BOUNDARY_RE = /
  (?:\A|(?<=[\s>+~,]))
  :(?:is|matches|where)\(
/x.freeze

# Functional pseudos an `:is(...)` may be distributed through: each
# one matches when ANY of its arguments matches, so
# `:has(> :is(a, b c))` equals `:has(> a), :has(> b c)`. Everything
# else is excluded - inside `:not(...)` the alternatives are ANDed
# (`:not(.a, :is(b, c))` is not `:not(.a, b), :not(.a, c)`), and
# other functional arguments (nth-child `of S`, text payloads, ...)
# are not selector disjunctions either.
IS_DISTRIBUTIVE_WRAPPERS = %w[has is matches where].freeze

# Internal helper for expand_is_groups: true when every functional
# pseudo still open at index +pos+ of +str+ is one of
# IS_DISTRIBUTIVE_WRAPPERS (vacuously true at top level).
def self.distributable_is_at?(str, pos)
  open_wrappers = []
  idx = 0
  while idx < pos
    ch = str[idx]
    if ch == "("
      # Name of the function this paren belongs to, e.g. "not".
      open_wrappers << str[0...idx][/[\w-]+\z/]&.downcase
    elsif ch == ")"
      open_wrappers.pop
    end
    idx += 1
  end
  open_wrappers.all? { |name| IS_DISTRIBUTIVE_WRAPPERS.include?(name) }
end

# Expand the first distributable `:is(...)` of +group_str+.
#
# group_str - one selector group (no top-level comma expected).
# force     - by default the rewrite only happens when at least one
#             alternative contains a combinator or whitespace
#             (single-atom alternatives compile natively as
#             is_inner). With `force: true` (used for `:has`
#             arguments) it happens for any list of two or more
#             alternatives, so the inner pool sees plain atoms.
#
# Occurrences nested inside `:not(...)` or any other
# non-distributive functional pseudo are skipped, because
# substituting the alternatives there would change the selector's
# meaning.
#
# Returns an array of selector strings; `[group_str]` (single
# element) when no rewrite applies - callers treat that as a no-op.
# Each produced string is expanded again recursively.
def self.expand_is_groups(group_str, force: false)
  m = nil
  pos = 0
  while (candidate = IS_AT_BOUNDARY_RE.match(group_str, pos))
    if distributable_is_at?(group_str, candidate.begin(0))
      m = candidate
      break
    end
    pos = candidate.end(0)
  end
  return [group_str] unless m

  # Locate the `)` that closes this `:is(`.
  paren_start = m.end(0) - 1 # position of '('
  depth = 1
  i = paren_start + 1
  len = group_str.length
  while i < len && depth > 0
    ch = group_str[i]
    if ch == "("
      depth += 1
    elsif ch == ")"
      depth -= 1
    end
    i += 1
  end
  return [group_str] if depth != 0 # unbalanced parentheses

  paren_end = i - 1 # position of matching ')'
  inner = group_str[(paren_start + 1)...paren_end]
  alts = split_selector_groups(inner)
  return [group_str] if alts.size < 2

  multi = alts.any? { |alt| alt.match?(/[\s>+~]/) }
  return [group_str] unless multi || force

  prefix = group_str[0...m.begin(0)]
  suffix = group_str[(paren_end + 1)..]
  alts.flat_map do |alt|
    merged = "#{prefix}#{alt}#{suffix}".strip
    expand_is_groups(merged, force: force)
  end
end
2257
+ end
2258
+ end