scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,2258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
module Native
|
|
5
|
+
# Wrapper module — `Scrapetor::Native::Document` is a TypedData class
|
|
6
|
+
# defined in C (see ext/scrapetor/native/scrapetor_dom.c). It exposes
|
|
7
|
+
# node-id based accessors. This module adds Ruby-level helpers and
|
|
8
|
+
# the Element wrapper that `Scrapetor::Node` can wrap and operate on
|
|
9
|
+
# the same way it does over a pure-Ruby `Dom::Element`.
|
|
10
|
+
AVAILABLE_DOM = defined?(Scrapetor::Native::Document)
|
|
11
|
+
|
|
12
|
+
# ----- pseudo-element handling at the css() boundary -----
|
|
13
|
+
|
|
14
|
+
PSEUDO_ELEMENT_RE = /(::(?:text|attr\([^)]+\)|first-letter|first-line|before|after))\s*\z/i.freeze
|
|
15
|
+
|
|
16
|
+
# Wrap each String entry in TextNode so Node-style `.text` /
|
|
17
|
+
# `.content` accessors and Parsel-style `.get` / `.getall` both work.
|
|
18
|
+
# Skips nil (`bulk_attr` returns nil for missing attributes) and any
|
|
19
|
+
# value that's already a TextNode. Mutates in place to avoid a second
|
|
20
|
+
# Array allocation on the result-collection hot path.
|
|
21
|
+
# @param arr [Array, Object] result list; anything that is not an Array is
#   returned untouched
# @return [Array, Object] the same object that was passed in
def self.wrap_text_nodes!(arr)
  return arr unless arr.is_a?(Array)

  # Overwrite slots in place so no second Array is allocated.
  arr.each_with_index do |item, idx|
    next unless item.is_a?(String)
    next if item.is_a?(Scrapetor::TextNode)

    arr[idx] = Scrapetor::TextNode.new(item)
  end
  arr
end
|
|
32
|
+
|
|
33
|
+
# `::text` and `::attr(name)` are Scrapy/Parsel-style pseudo-elements:
|
|
34
|
+
# they reshape the result of a selector into strings rather than
|
|
35
|
+
# affecting matching. Strip them before running the query and apply
|
|
36
|
+
# the transform on the way out.
|
|
37
|
+
#
|
|
38
|
+
# Returns [stripped_selector, transform_kind, arg]
|
|
39
|
+
# transform_kind = nil | :text | :attr | :text_approx | :direct_text | :direct_attr
|
|
40
|
+
#
|
|
41
|
+
# Fast-path skip when the selector has no `::` substring (the common
|
|
42
|
+
# case) — saves a regex match on every css() call.
|
|
43
|
+
# @param selector_str [String] raw CSS selector, possibly ending in a
#   pseudo-element recognised by PSEUDO_ELEMENT_RE
# @return [Array(String, Symbol|nil, String|nil)] stripped selector, transform
#   kind (nil, :text, :attr, :text_approx, :direct_text or :direct_attr) and
#   the attribute name for the attr kinds
def self.peel_pseudo_element(selector_str)
  untouched = [selector_str, nil, nil]
  # Cheap substring probe first: skips the regex when there is no `::` at all.
  return untouched unless selector_str.include?("::")

  found = selector_str.match(PSEUDO_ELEMENT_RE)
  return untouched unless found

  pseudo = found[1]
  base = selector_str[0...found.begin(0)].rstrip

  # A dangling `>` before the pseudo-element selects the direct-only variant;
  # the combinator itself is dropped from the selector that gets compiled.
  child_only = base.end_with?(">")
  base = base[0..-2].rstrip if child_only

  return [base, child_only ? :direct_text : :text, nil] if pseudo.casecmp("::text").zero?

  attr_match = pseudo.match(/::attr\(([^)]+)\)/i)
  return [base, child_only ? :direct_attr : :attr, attr_match[1].strip] if attr_match

  # Remaining pseudo-elements (first-letter, first-line, before, after).
  [base, :text_approx, nil]
end
|
|
67
|
+
|
|
68
|
+
if AVAILABLE_DOM
|
|
69
|
+
# Lightweight wrapper: two slots, the native doc + node id.
|
|
70
|
+
# Walks like a Dom::Element so the rest of Scrapetor can treat
|
|
71
|
+
# it the same.
|
|
72
|
+
class Element
|
|
73
|
+
attr_reader :doc, :id
|
|
74
|
+
|
|
75
|
+
# @param doc the native document this element belongs to
# @param id the node id inside that document
# @param wrapper optional DocumentWrapper; looked up lazily by #wrapper when nil
def initialize(doc, id, wrapper = nil)
  @doc, @id, @wrapper = doc, id, wrapper
  # Stays nil until the element is promoted to a Ruby Dom node.
  @dom_node = nil
end
|
|
81
|
+
|
|
82
|
+
# The DocumentWrapper governs the native arena and any lazy
|
|
83
|
+
# Dom view used for mutations / fallback selectors. Surface it
|
|
84
|
+
# so subclasses and nav helpers can stay coherent.
|
|
85
|
+
# @return the wrapper given at construction, or — memoized on first use — the
#   one stored on the document under @__scrapetor_wrapper (may be nil)
def wrapper
  return @wrapper if @wrapper

  @wrapper = @doc.instance_variable_get(:@__scrapetor_wrapper)
end
|
|
88
|
+
|
|
89
|
+
# @return whether this node is an element, as reported by the promoted Dom
#   node when there is one, otherwise by the native document
def element?
  return @doc.node_is_element(@id) unless @dom_node

  @dom_node.element?
end
|
|
92
|
+
|
|
93
|
+
# Node-kind predicates. A promoted element asks its Dom node; otherwise the
# native document's numeric node type is compared against 3 (text),
# 8 (comment) and 9 (document).
def text?
  return @dom_node.text? if @dom_node

  @doc.node_type(@id) == 3
end

def comment?
  return @dom_node.comment? if @dom_node

  @doc.node_type(@id) == 8
end

def document?
  return @dom_node.document? if @dom_node

  @doc.node_type(@id) == 9
end
|
|
96
|
+
|
|
97
|
+
# @return the node name from the promoted Dom node when present, otherwise
#   from the native document
def name
  return @dom_node.name if dom_node?

  @doc.node_name(@id)
end
alias node_name name
alias tag_name name
|
|
102
|
+
|
|
103
|
+
# Attribute lookup; the key is stringified before it is passed down.
# @param key attribute name (anything responding to #to_s)
# @return the value reported by the promoted Dom node or the native document
def [](key)
  attr_name = key.to_s
  return @dom_node[attr_name] if dom_node?

  @doc.node_attr(@id, attr_name)
end
alias get_attribute []
alias attribute_value []
|
|
108
|
+
|
|
109
|
+
# @return the attribute collection from the promoted Dom node when present,
#   otherwise whatever the native document returns for this node id
def attributes
  return @dom_node.attributes if dom_node?

  @doc.node_attributes(@id)
end
|
|
116
|
+
|
|
117
|
+
# Lightweight pair returned from `attribute_nodes` / `attribute`.
|
|
118
|
+
# The `.text` / `.content` / `.inner_text` accessors mirror what
|
|
119
|
+
# Nokogiri's Nokogiri::XML::Attr exposes — production parser code
|
|
120
|
+
# iterates `node.attribute_nodes` and reads `.text` on each.
|
|
121
|
+
AttrNode = Struct.new(:name, :value) do
  # @return [String] the value as a String ("" for nil)
  def text
    value.to_s
  end
  alias content text
  alias inner_text text

  # @return [String] `name="value"`; the value is interpolated as-is
  def to_s
    "#{name}=\"#{value}\""
  end
end
|
|
129
|
+
|
|
130
|
+
# @return the Dom node's attribute nodes when promoted; otherwise one AttrNode
#   per entry of #attributes
def attribute_nodes
  return @dom_node.attribute_nodes if dom_node?

  attributes.map { |attr_name, attr_value| AttrNode.new(attr_name, attr_value) }
end
|
|
137
|
+
|
|
138
|
+
# @param name attribute name
# @return the Dom node's answer when promoted; otherwise an AttrNode for the
#   attribute, or the falsy lookup result when `self[name]` finds nothing
def attribute(name)
  return @dom_node.attribute(name) if dom_node?

  found = self[name]
  found && AttrNode.new(name.to_s, found)
end
|
|
146
|
+
|
|
147
|
+
# @return attribute names, from the Dom node when promoted, else from #attributes
def keys
  return @dom_node.keys if dom_node?

  attributes.keys
end

# @return attribute values, from the Dom node when promoted, else from #attributes
def values
  return @dom_node.values if dom_node?

  attributes.values
end
|
|
149
|
+
# @param k attribute name
# @return the Dom node's answer when promoted; otherwise true when the native
#   attribute lookup yields anything other than nil
def has_attribute?(k)
  return @dom_node.has_attribute?(k) if dom_node?

  !@doc.node_attr(@id, k.to_s).nil?
end
alias key? has_attribute?
|
|
157
|
+
|
|
158
|
+
# Stable identity used to relocate this node inside a lazy Dom
|
|
159
|
+
# view after the document switches to dom-mode. Builds the same
|
|
160
|
+
# `/tag[idx]/.../tag[@id='x']` shape we already exposed publicly.
|
|
161
|
+
# Memoized per-id on the document wrapper so a fallback-heavy
|
|
162
|
+
# parser doesn't pay the O(depth*siblings) walk per at_css call.
|
|
163
|
+
# @return [String] a `/tag[idx]/.../tag[@id='x']` style locator built by
#   walking up through element ancestors; the walk stops at the first node
#   that carries a non-empty `id` attribute
def path
  w = wrapper
  # The per-id memo is only trustworthy for arena-backed elements: wrap_dom
  # builds dom-backed wrappers that reuse the wrapping element's @id, so for
  # those @id does not identify the node whose path is being computed.
  cacheable = w && !dom_node?
  if cacheable && (cached = w.cached_path(@id))
    return cached
  end

  parts = []
  cur = self
  while cur && cur.element?
    id = cur["id"]
    if id && !id.empty?
      parts.unshift("#{cur.name}[@id='#{id}']")
      break
    end
    # 1-based position among preceding element siblings with the same name.
    idx = 1
    sib = cur.previous_sibling
    while sib
      idx += 1 if sib.element? && sib.name == cur.name
      sib = sib.previous_sibling
    end
    parts.unshift("#{cur.name}[#{idx}]")
    cur = cur.parent
  end

  str = "/" + parts.join("/")
  w.store_path(@id, str) if cacheable
  str
end
|
|
191
|
+
|
|
192
|
+
# These node kinds are never reported by this wrapper; each predicate is a
# constant false.
def fragment?
  false
end

def cdata?
  false
end

def processing_instruction?
  false
end
|
|
195
|
+
|
|
196
|
+
# @return the node's text from the promoted Dom node when present, otherwise
#   from the native document
def text
  return @dom_node.text if dom_node?

  @doc.node_text(@id)
end
alias content text
alias inner_text text
|
|
201
|
+
|
|
202
|
+
# @return [Element, nil] the parent wrapper. In dom mode a parent that is
#   missing or is not an element yields nil; in native mode nil is returned
#   only when the document reports no parent id.
def parent
  unless dom_node?
    parent_id = @doc.node_parent(@id)
    return parent_id ? Element.new(@doc, parent_id, wrapper) : nil
  end

  up = @dom_node.parent
  return nil if up.nil?
  return nil unless up.respond_to?(:element?) && up.element?

  wrap_dom(up)
end
|
|
213
|
+
|
|
214
|
+
# @return [Array<Element>] one wrapper per child reported by the Dom node
#   (when promoted) or by the native document
def children
  return @dom_node.children.map { |kid| wrap_dom(kid) } if dom_node?

  @doc.node_children(@id).map { |kid_id| Element.new(@doc, kid_id, wrapper) }
end
|
|
221
|
+
|
|
222
|
+
# @return [Array<Element>] wrappers for the element children only
def element_children
  return @dom_node.element_children.map { |kid| wrap_dom(kid) } if dom_node?

  @doc.node_element_children(@id).map { |kid_id| Element.new(@doc, kid_id, wrapper) }
end
alias elements element_children
|
|
230
|
+
|
|
231
|
+
# @return [Element, nil] wrapper for the first element child, nil when none
def first_element_child
  if dom_node?
    kid = @dom_node.first_element_child
    return kid && wrap_dom(kid)
  end

  kid_ids = @doc.node_element_children(@id)
  return nil if kid_ids.empty?

  Element.new(@doc, kid_ids.first, wrapper)
end
|
|
240
|
+
|
|
241
|
+
# @return [Element, nil] wrapper for the last element child, nil when none
def last_element_child
  if dom_node?
    kid = @dom_node.last_element_child
    return kid && wrap_dom(kid)
  end

  kid_ids = @doc.node_element_children(@id)
  return nil if kid_ids.empty?

  Element.new(@doc, kid_ids.last, wrapper)
end
|
|
250
|
+
|
|
251
|
+
# @return [Element, nil] wrapper for the following sibling node (any node
#   type), or a falsy value when there is none
def next_sibling
  if dom_node?
    neighbour = @dom_node.next_sibling
    return neighbour && wrap_dom(neighbour)
  end

  neighbour_id = @doc.node_next_sibling(@id)
  neighbour_id ? Element.new(@doc, neighbour_id, wrapper) : nil
end
|
|
260
|
+
|
|
261
|
+
# @return [Element, nil] wrapper for the preceding sibling node (any node
#   type), or a falsy value when there is none
def previous_sibling
  if dom_node?
    neighbour = @dom_node.previous_sibling
    return neighbour && wrap_dom(neighbour)
  end

  neighbour_id = @doc.node_prev_sibling(@id)
  neighbour_id ? Element.new(@doc, neighbour_id, wrapper) : nil
end
|
|
270
|
+
|
|
271
|
+
# @return [Element, nil] wrapper for the nearest following sibling that is an
#   element; non-element siblings are skipped
def next_element_sibling
  if dom_node?
    neighbour = @dom_node.next_element_sibling
    return neighbour && wrap_dom(neighbour)
  end

  candidate = @doc.node_next_sibling(@id)
  candidate = @doc.node_next_sibling(candidate) until !candidate || @doc.node_is_element(candidate)
  candidate ? Element.new(@doc, candidate, wrapper) : nil
end
|
|
283
|
+
|
|
284
|
+
# @return [Element, nil] wrapper for the nearest preceding sibling that is an
#   element; non-element siblings are skipped
def previous_element_sibling
  if dom_node?
    neighbour = @dom_node.previous_element_sibling
    return neighbour && wrap_dom(neighbour)
  end

  candidate = @doc.node_prev_sibling(@id)
  candidate = @doc.node_prev_sibling(candidate) until !candidate || @doc.node_is_element(candidate)
  candidate ? Element.new(@doc, candidate, wrapper) : nil
end
|
|
296
|
+
|
|
297
|
+
# @return the class list from the promoted Dom node when present, otherwise
#   from the native document
def classes
  return @dom_node.classes if dom_node?

  @doc.node_classes(@id)
end
|
|
300
|
+
|
|
301
|
+
# @param klass class name (stringified before the lookup)
# @return [Boolean] whether #classes includes it
def has_class?(klass)
  wanted = klass.to_s
  classes.include?(wanted)
end
|
|
302
|
+
|
|
303
|
+
# ----- selectors -----
|
|
304
|
+
|
|
305
|
+
# Slow path for css(). Native fast path is installed as a C
|
|
306
|
+
# method (`native_css`) at module load and aliased to `css`,
|
|
307
|
+
# so the heavy Ruby dispatch only runs for shapes that the C
|
|
308
|
+
# path can't handle directly (heterogeneous pseudo groups,
|
|
309
|
+
# post-peel attr/text transforms, dom-mode mutated trees, etc.).
|
|
310
|
+
# @param selector [String, #to_s] CSS selector, optionally with a trailing
#   pseudo-element
# @return matched nodes, or the text / attribute values produced by the
#   pseudo-element transform
def css_slow(selector)
  text = selector.is_a?(String) ? selector : selector.to_s

  # Comma lists whose groups differ in their pseudo-elements are run one
  # group at a time through css and concatenated.
  mixed = text.include?(",") && text.include?("::") &&
          Native.heterogeneous_pseudo_groups?(text)
  return Native.split_selector_groups(text).flat_map { |group| css(group).to_a } if mixed

  base, kind, arg = Native.peel_pseudo_element(text)
  base = "*" if base.empty?

  # Bulk path: only for the non-direct transforms and only while this
  # element is still arena-backed.
  if !dom_node? && %i[text text_approx attr].include?(kind)
    w = wrapper
    plan = w ? w.compiled_plan(base) : Native.compile_selector_chain(base)
    if plan && !base.include?(",")
      ids = @doc.run_chain(plan, @id)
      values = kind == :attr ? @doc.bulk_attr(ids, arg) : @doc.bulk_text(ids)
      return wire_text_parents!(values, ids, w)
    end
  end

  apply_pseudo_element(css_native_or_fallback(base), kind, arg)
end
|
|
334
|
+
|
|
335
|
+
# @param selector [String, #to_s] CSS selector, optionally with a trailing
#   pseudo-element
# @return the first match (or its transformed value), nil when nothing matches
def at_css_slow(selector)
  text = selector.is_a?(String) ? selector : selector.to_s

  # Consult the document-level memo before doing any selector work; only
  # arena-backed elements with an Integer id take part in it.
  if @id.is_a?(Integer) && !@dom_node
    remembered = @doc.cache_get(text, @id)
    if remembered
      head_id = remembered[0]
      return head_id.nil? ? nil : Element.new(@doc, head_id, @wrapper)
    end
  end

  if text.include?(",") && text.include?("::") && Native.heterogeneous_pseudo_groups?(text)
    # Try each group in order and stop at the first hit.
    Native.split_selector_groups(text).each do |group|
      found = at_css(group)
      return found if found
    end
    return nil
  end

  base, kind, arg = Native.peel_pseudo_element(text)
  base = "*" if base.empty?
  matches = css_native_or_fallback(base, limit_one: true)
  return nil if matches.empty?

  kind ? apply_pseudo_element(matches, kind, arg).first : matches.first
end
|
|
363
|
+
|
|
364
|
+
# XPath is not evaluated by this wrapper: the expression is ignored and an
# empty result is returned.
def xpath(_expr)
  []
end

# Single-result counterpart of #xpath; always nil.
def at_xpath(_expr)
  nil
end
|
|
366
|
+
|
|
367
|
+
# Batch API at the Element level. Pass an array of selector
|
|
368
|
+
# strings; receive parallel results in one C round trip.
|
|
369
|
+
# Selectors ending in `::text` / `::attr(...)` come back as
|
|
370
|
+
# Arrays of strings; everything else as an Array of Elements (selectors
# that fall back return whatever `css` returns).
|
|
371
|
+
# @param selectors [Array<String, #to_s>, nil] selectors to evaluate
# @return [Array] results parallel to `selectors`; [] for nil or empty input
def batch_css(selectors)
  return [] if selectors.nil? || selectors.empty?

  w = @wrapper
  # Without a wrapper, or once promoted to Dom, run each selector on its own.
  return selectors.map { |s| css(s) } if w.nil? || @dom_node

  count = selectors.size
  plans = Array.new(count)
  kinds = Array.new(count)
  args = Array.new(count)
  fallback = []

  selectors.each_with_index do |sel, i|
    str = sel.is_a?(String) ? sel : sel.to_s
    stripped, kind, arg = Native.peel_pseudo_element(str)
    stripped = "*" if stripped.empty?
    kinds[i] = kind
    args[i] = arg
    # Comma lists are never compiled here; they go through css below.
    plan = stripped.include?(",") ? nil : w.compiled_plan(stripped)
    if plan
      plans[i] = plan
    else
      fallback << i
    end
  end

  id_lists = @doc.batch_chain(plans.map { |p| p || [] }, @id)
  out = Array.new(count)
  id_lists.each_with_index do |ids, i|
    # A nil plan slot means this selector was routed to the fallback list.
    next if plans[i].nil?

    out[i] =
      case kinds[i]
      when :text, :text_approx
        wire_text_parents!(@doc.bulk_text(ids), ids, w)
      when :attr
        wire_text_parents!(@doc.bulk_attr(ids, args[i]), ids, w)
      when :direct_text, :direct_attr
        # `x > ::text` / `x > ::attr(n)`: apply the same direct-children
        # transform css_slow uses, instead of handing back the elements.
        apply_pseudo_element(ids.map { |nid| Element.new(@doc, nid, w) }, kinds[i], args[i])
      else
        # Plain selector: wrap ids as Elements in a plain Array.
        ids.map { |nid| Element.new(@doc, nid, w) }
      end
  end

  # Selectors that did not compile are evaluated one at a time.
  fallback.each { |i| out[i] = css(selectors[i]) }
  out
end
|
|
419
|
+
|
|
420
|
+
# Hash-form batch: map of {key => selector} → {key => result}.
|
|
421
|
+
# The classic scrape pattern shaped as a single declarative call.
|
|
422
|
+
# @param map [Hash] key => selector
# @return [Hash] key => the matching batch_css result, in the map's key order
def extract_css(map)
  labels = map.keys
  labels.zip(batch_css(map.values)).to_h
end
|
|
429
|
+
|
|
430
|
+
# Single-result extract — one C call, fields compiled in C,
|
|
431
|
+
# field iteration in C, result hash assembled in C. Falls
|
|
432
|
+
# back to the per-field Ruby loop only when a selector can't
|
|
433
|
+
# be compiled natively.
|
|
434
|
+
# @param map field map handed to the native extractor
# @return the native result, or the slow_extract result when the element is
#   dom-backed, has no wrapper, or the native call answers with literal `true`
def extract(map)
  native_ok = !@dom_node && !@wrapper.nil?
  return slow_extract(map) unless native_ok

  result = @doc.extract_one_h(@id, map, @wrapper)
  # Literal `true` is treated as the signal to use the Ruby loop instead.
  result.equal?(true) ? slow_extract(map) : result
end
|
|
440
|
+
|
|
441
|
+
# extract_each — one C call covers compile + outer plan run +
|
|
442
|
+
# every (match × field) tuple resolution. Outer selector is
|
|
443
|
+
# peeled inside C. Falls back to Ruby per-row only when any
|
|
444
|
+
# selector can't compile natively.
|
|
445
|
+
# @param outer_selector [String, #to_s] selector for the repeated rows
# @param fields field map resolved against every row
# @return the native result, or the slow_extract_each result when the element
#   is dom-backed, has no wrapper, or the native call answers with literal `true`
def extract_each(outer_selector, fields)
  native_ok = !@dom_node && !@wrapper.nil?
  return slow_extract_each(outer_selector, fields) unless native_ok

  outer_text = outer_selector.is_a?(String) ? outer_selector : outer_selector.to_s
  result = @doc.extract_each_h(outer_text, @id, fields, @wrapper)
  result.equal?(true) ? slow_extract_each(outer_selector, fields) : result
end
|
|
452
|
+
|
|
453
|
+
private
|
|
454
|
+
|
|
455
|
+
# Ruby-side counterpart of #extract: one at_css call per field.
# @param map [Hash] key => selector
# @return [Hash] key => at_css(selector)
def slow_extract(map)
  map.each_pair.each_with_object({}) do |(label, sel), acc|
    acc[label] = at_css(sel)
  end
end
|
|
460
|
+
|
|
461
|
+
# Ruby-side counterpart of #extract_each: runs the outer selector, then
# extracts `fields` from every match.
# @return [Array] one extraction result per outer match
def slow_extract_each(outer_selector, fields)
  css(outer_selector).to_a.map do |match|
    # Unwrap to the backing node when the match is not an Element itself.
    target =
      if match.is_a?(Element)
        match
      elsif match.respond_to?(:backing_node)
        match.backing_node
      else
        match
      end
    target.is_a?(Element) ? target.extract(fields) : Node.new(@doc, target).extract(fields)
  end
end
|
|
467
|
+
|
|
468
|
+
public
|
|
469
|
+
|
|
470
|
+
# Runs the selector through the document wrapper and reports whether any
# result compares equal to this element.
# @param selector CSS selector
# @return [Boolean] always false when no wrapper is available
def matches?(selector)
  w = wrapper
  return false unless w

  w.css(selector).any? { |candidate| candidate == self }
end
|
|
481
|
+
|
|
482
|
+
# ----- serialization -----
|
|
483
|
+
|
|
484
|
+
# @return [String] the serialized children of this node. In native mode the
#   children are walked in document order so that text interleaved with
#   elements keeps its position. Children that are neither text nor element
#   (for example comments) are omitted, as before.
def inner_html
  return @dom_node.inner_html if dom_node?

  w = wrapper
  @doc.node_children(@id).each_with_object(+"") do |cid, buf|
    if @doc.node_type(cid) == 3
      # Text nodes are appended as returned by the native document.
      buf << @doc.node_text(cid).to_s
    elsif @doc.node_is_element(cid)
      buf << Element.new(@doc, cid, w).to_html
    end
  end
end
|
|
491
|
+
|
|
492
|
+
# @return [String] this node serialized with its own tag. In native mode a
#   tag listed in Dom::VOID that has no children is emitted without a closing
#   tag; attribute values go through Dom.escape_attr.
def outer_html
  return @dom_node.outer_html if dom_node?

  tag = name
  attr_text = attributes.map { |key, val| %( #{key}="#{Dom.escape_attr(val)}") }.join
  opening = "<#{tag}#{attr_text}>"
  return opening if Dom::VOID.include?(tag) && @doc.node_children(@id).empty?

  "#{opening}#{inner_html}</#{tag}>"
end
alias to_html outer_html
alias to_xml outer_html
alias to_s outer_html
|
|
507
|
+
|
|
508
|
+
# @return the node type from the promoted Dom node when present, otherwise
#   the native document's type code for this id
def node_type
  return @dom_node.node_type if dom_node?

  @doc.node_type(@id)
end
alias type node_type
|
|
512
|
+
|
|
513
|
+
# Two wrappers are equal when they are the same object, when both are
# dom-backed and share one Dom node, or when both are arena-backed with the
# same document object and id. A dom-backed wrapper never equals an
# arena-backed one.
def ==(other)
  return true if equal?(other)
  return false unless other.is_a?(Element)

  mine = dom_node?
  theirs = other.dom_backed?
  return false unless mine == theirs

  if mine
    @dom_node.equal?(other.dom_node)
  else
    @doc.equal?(other.doc) && @id == other.id
  end
end
alias eql? ==
|
|
525
|
+
|
|
526
|
+
# Hash key consistent with #==: the Dom node's identity when promoted,
# otherwise the (document identity, node id) pair.
def hash
  return @dom_node.object_id if dom_node?

  [@doc.object_id, @id].hash
end
|
|
533
|
+
|
|
534
|
+
# @return whatever Scrapetor::Fingerprint.structural computes for this element
def fingerprint
  Scrapetor::Fingerprint.structural(self)
end
|
|
537
|
+
|
|
538
|
+
# ----- mutation API -----
|
|
539
|
+
#
|
|
540
|
+
# The native arena DOM is immutable by design (it gives us the
|
|
541
|
+
# zero-copy parse + 137x Lexbor lead). Mutations promote the
|
|
542
|
+
# document to a Ruby `Dom::Document` once, then operate on the
|
|
543
|
+
# equivalent Dom node. Reads continue to work on either side.
|
|
544
|
+
|
|
545
|
+
# Sets an attribute on the promoted Dom node (promoting first if needed).
# Key and value are stringified; a nil value is passed through as nil.
# @return the value as given
def []=(key, value)
  target = ensure_dom!
  target[key.to_s] = value.nil? ? nil : value.to_s
  value
end
alias set_attribute []=
|
|
551
|
+
|
|
552
|
+
# Removes an attribute on the promoted Dom node (promoting first if needed).
# @return [self]
def remove_attribute(key)
  target = ensure_dom!
  target.remove_attribute(key.to_s)
  self
end
alias delete_attribute remove_attribute
|
|
558
|
+
|
|
559
|
+
# Adds a class on the promoted Dom node (promoting first if needed).
# @return [self]
def add_class(klass)
  target = ensure_dom!
  target.add_class(klass.to_s)
  self
end
alias append_class add_class
|
|
565
|
+
|
|
566
|
+
# Removes a class on the promoted Dom node (promoting first if needed). A
# nil/false argument is forwarded unchanged; anything else is stringified.
# @return [self]
def remove_class(klass = nil)
  target = ensure_dom!
  target.remove_class(klass && klass.to_s)
  self
end
|
|
571
|
+
|
|
572
|
+
# Replaces the content of the promoted Dom node (promoting first if needed)
# with the stringified argument.
def content=(text)
  target = ensure_dom!
  target.content = text.to_s
  text
end
alias text= content=
|
|
578
|
+
|
|
579
|
+
# Replaces this node's children with the given markup.
# While the element is arena-backed and its wrapper is not in dom mode, the
# native document is asked to do the replacement; only an explicit `true`
# from it counts as success. Otherwise the element is promoted and the Dom
# node receives the markup.
def inner_html=(html)
  if @wrapper && !@dom_node && !@wrapper.dom_mode?
    return html if @doc.node_set_inner_html(@id, html.to_s) == true
  end

  ensure_dom!
  @dom_node.inner_html = html.to_s
  html
end
|
|
593
|
+
|
|
594
|
+
# Appends a child via the promoted Dom node; the argument is first normalized
# by unwrap_for_mutation.
# @return whatever the Dom node's add_child returns
def add_child(node_or_html)
  target = ensure_dom!
  target.add_child(unwrap_for_mutation(node_or_html))
end
alias << add_child
|
|
599
|
+
|
|
600
|
+
# Inserts a sibling before this node via the promoted Dom node; the argument
# is first normalized by unwrap_for_mutation.
# @return whatever the Dom node's add_previous_sibling returns
def add_previous_sibling(node_or_html)
  target = ensure_dom!
  target.add_previous_sibling(unwrap_for_mutation(node_or_html))
end
alias before add_previous_sibling
|
|
605
|
+
|
|
606
|
+
# Inserts a sibling after this node via the promoted Dom node; the argument
# is first normalized by unwrap_for_mutation.
# @return whatever the Dom node's add_next_sibling returns
def add_next_sibling(node_or_html)
  target = ensure_dom!
  target.add_next_sibling(unwrap_for_mutation(node_or_html))
end
alias after add_next_sibling
|
|
611
|
+
|
|
612
|
+
# Replaces this node via the promoted Dom node; the argument is first
# normalized by unwrap_for_mutation.
# @return whatever the Dom node's replace returns
def replace(node_or_html)
  target = ensure_dom!
  target.replace(unwrap_for_mutation(node_or_html))
end
alias swap replace
alias replace_with replace
|
|
618
|
+
|
|
619
|
+
# Detach this element from its parent. When we're still on the
|
|
620
|
+
# native arena, mutate it in place — that avoids the cross-DOM
|
|
621
|
+
# path lookup (which can't always pin down a node on HTML where
|
|
622
|
+
# the native vs Ruby SAX parsers disagree about whitespace or
|
|
623
|
+
# implicit close tags). Once the document has been promoted to
|
|
624
|
+
# Ruby Dom by some other mutation, delegate to that side.
|
|
625
|
+
# @return [self]
def remove
  # If some other mutation has already switched the document to dom mode,
  # resolve this element's Dom node first so the removal does not land on
  # the native arena instead.
  if @dom_node.nil?
    w = wrapper
    ensure_dom! if w && w.dom_mode?
  end

  if @dom_node
    @dom_node.remove
  else
    @doc.node_remove(@id)
  end
  self
end
alias unlink remove
alias delete remove
|
|
635
|
+
|
|
636
|
+
# Wrap this element in a parsed HTML fragment whose deepest
|
|
637
|
+
# descendant becomes the new parent. Matches Nokogiri's
|
|
638
|
+
# Node#wrap semantics.
|
|
639
|
+
# @param html_or_node markup String, Element, or Scrapetor::Node to wrap with
# @return [self]
def wrap(html_or_node)
  ensure_dom!
  # Normalize wrappers to a Dom node / markup first, like the other mutators,
  # so the Dom layer is never handed a native Element wrapper.
  # The call is skipped when the promoted node has no #wrap.
  @dom_node.wrap(unwrap_for_mutation(html_or_node)) if @dom_node.respond_to?(:wrap)
  self
end
|
|
644
|
+
|
|
645
|
+
# Pre-order walk over this element and its element descendants (other child
# nodes are not visited).
# @yield [Element] each visited element, parents before children
# @return [self, Enumerator] an Enumerator when no block is given
def traverse(&block)
  return enum_for(:traverse) unless block_given?

  yield self
  element_children.each { |child| child.traverse(&block) }
  self
end
|
|
654
|
+
|
|
655
|
+
# Internal: was this Element already promoted to a Dom::Element?
|
|
656
|
+
# Public view of the private dom_node? check.
# @return [Boolean]
def dom_backed?
  dom_node?
end
|
|
659
|
+
|
|
660
|
+
# @return the promoted Dom node, or nil while the element is arena-backed
def dom_node
  @dom_node
end
|
|
663
|
+
|
|
664
|
+
# Public version of the lazy dom-promotion step. NodeSet#remove
|
|
665
|
+
# uses it to resolve every node to its Dom equivalent BEFORE the
|
|
666
|
+
# first mutation, so subsequent removals don't shift the path
|
|
667
|
+
# index under their feet.
|
|
668
|
+
# @return the Dom node this element resolves to after promotion
def promote_to_dom!
  ensure_dom!
  @dom_node
end
|
|
672
|
+
|
|
673
|
+
# Turns matched nodes into the values requested by a peeled pseudo-element.
# @param nodes matched nodes
# @param kind [Symbol, nil] nil, :text, :text_approx, :direct_text, :attr or
#   :direct_attr
# @param arg [String, nil] attribute name for the attr kinds
# @return `nodes` itself for nil; TextNodes otherwise. :attr keeps a nil slot
#   for nodes lacking the attribute, :direct_attr drops them. Any other kind
#   yields nil.
def apply_pseudo_element(nodes, kind, arg)
  # Builds a TextNode and links it back to its source when that is an element.
  build = lambda do |value, owner|
    node = Scrapetor::TextNode.new(value)
    node.parent_node = owner if owner.respond_to?(:element?) && owner.element?
    node
  end
  attr_of = ->(owner) { owner.respond_to?(:[]) ? owner[arg] : nil }

  case kind
  when nil
    nodes
  when :text, :text_approx
    nodes.map { |n| build.call(n.respond_to?(:text) ? n.text.to_s : n.to_s, n) }
  when :direct_text
    collected = []
    nodes.each { |n| collected << build.call(direct_text_of(n), n) }
    collected
  when :attr
    nodes.map do |n|
      value = attr_of.call(n)
      value.nil? ? nil : build.call(value, n)
    end
  when :direct_attr
    collected = []
    nodes.each do |n|
      value = attr_of.call(n)
      collected << build.call(value, n) unless value.nil?
    end
    collected
  end
end
|
|
711
|
+
|
|
712
|
+
# Direct text-node children only — handles the convention
|
|
713
|
+
# `parent > ::text` (and `> ::attr(x)`) where descendant text
|
|
714
|
+
# inside child elements must NOT be included.
|
|
715
|
+
DOM_TYPE_TEXT = 3
|
|
716
|
+
# @param n a matched node
# @return [String] concatenated text of n's immediate children only. For an
#   arena-backed Element the child chain is walked through this element's
#   document; for anything else responding to #children, text children
#   contribute their text and other non-element children their #to_s.
def direct_text_of(n)
  collected = +""

  if n.is_a?(Element) && !n.send(:dom_node?)
    source = @doc
    child = source.node_first_child(n.id)
    until child.nil? || child == false
      collected << source.node_text(child).to_s if source.node_type(child) == DOM_TYPE_TEXT
      child = source.node_next_sibling(child)
    end
  elsif n.respond_to?(:children)
    n.children.each do |kid|
      if kid.respond_to?(:text?) && kid.text?
        collected << (kid.respond_to?(:text) ? kid.text.to_s : kid.to_s)
      elsif !(kid.respond_to?(:element?) && kid.element?)
        collected << kid.to_s
      end
    end
  end

  collected
end
|
|
738
|
+
|
|
739
|
+
# Helper for Element#css: take a bulk_text / bulk_attr result
|
|
740
|
+
# and wire each TextNode's parent to the matching Element wrapper.
|
|
741
|
+
# @param values list parallel to `ids`; only entries that are TextNodes are touched
# @param ids node ids that produced `values`
# @param w wrapper handed to each created Element
# @return `values`, with every TextNode's parent_node set to a fresh Element
def wire_text_parents!(values, ids, w)
  values.length.times do |idx|
    entry = values[idx]
    next unless entry.is_a?(Scrapetor::TextNode)

    entry.parent_node = Element.new(@doc, ids[idx], w)
  end
  values
end
|
|
753
|
+
|
|
754
|
+
private
|
|
755
|
+
|
|
756
|
+
# @return [Boolean] true once a Dom node has been attached to this wrapper
def dom_node?
  @dom_node.nil? ? false : true
end
|
|
759
|
+
|
|
760
|
+
# Promote this Element (and the underlying document) to the
|
|
761
|
+
# Ruby DOM. After this, all reads and writes hit @dom_node and
|
|
762
|
+
# the wrapper's @dom_doc rather than the native arena.
|
|
763
|
+
#
|
|
764
|
+
# Three-stage lookup, each weaker than the last but always
|
|
765
|
+
# leaving the caller with a mutable Dom::Element to operate on:
|
|
766
|
+
# 1. Strict path-based locate (well-formed HTML where both
|
|
767
|
+
# parsers produce the same element tree).
|
|
768
|
+
# 2. DFS pre-order element-index lookup (handles parsers
|
|
769
|
+
# disagreeing on whitespace text nodes / implicit close
|
|
770
|
+
# tags — element-order is still stable).
|
|
771
|
+
# 3. Isolated subtree parse — feed our own outer_html through
|
|
772
|
+
# the Ruby Dom parser and use the top-level element as the
|
|
773
|
+
# promoted node. Mutations propagate to subsequent reads
|
|
774
|
+
# via @dom_node (Element#outer_html reads from there), so
|
|
775
|
+
# the user's `node.inner_html = ...` etc. always work even
|
|
776
|
+
# if we can't pin the node back into the document's Dom.
|
|
777
|
+
# @return the Dom node now backing this element
# @raise [NotImplementedError] when there is no wrapper, or when none of the
#   three lookups produces a node
def ensure_dom!
  return @dom_node if @dom_node

  w = wrapper
  raise NotImplementedError, "Mutation requires a DocumentWrapper" if w.nil?

  w.switch_to_dom!
  # Each later lookup is attempted only when the earlier ones found nothing.
  located = w.locate_in_dom(path)
  located ||= w.locate_dom_by_native_id(@id)
  located ||= isolated_dom_clone
  @dom_node = located
  raise NotImplementedError, "Cannot locate or clone equivalent node" if @dom_node.nil?

  @dom_node
end
|
|
788
|
+
|
|
789
|
+
# Re-parses this element's own markup with the Ruby Dom fragment parser.
# @return the first element node of the parsed fragment, or nil when there is
#   no markup or no element in the result
def isolated_dom_clone
  markup = to_html
  return nil if markup.nil? || markup.empty?

  parsed = Scrapetor::Dom::Parser.fragment(markup)
  parsed.find { |candidate| candidate.respond_to?(:element?) && candidate.element? }
end
|
|
795
|
+
|
|
796
|
+
# Wraps a Dom node in a new Element that shares this element's @doc, @id and
# wrapper, with @dom_node pre-set to the given node.
#
# @param node [Object] Dom node to expose through the Element API
# @return [Element]
def wrap_dom(node)
  Element.new(@doc, @id, wrapper).tap do |wrapped|
    wrapped.instance_variable_set(:@dom_node, node)
  end
end
|
|
801
|
+
|
|
802
|
+
# Normalises a mutation argument into something the Dom layer can consume.
#
# A Scrapetor::Node (that is not itself an Element) is first replaced by its
# backing_node. If the resulting object is an Element, its Dom node is
# returned, or its HTML string when it has no Dom node. Anything else is
# passed through untouched.
#
# @param input [Object]
# @return [Object]
def unwrap_for_mutation(input)
  target = input
  target = input.backing_node if !input.is_a?(Element) && input.is_a?(Scrapetor::Node)
  return target unless target.is_a?(Element)

  target.dom_node || target.to_html
end
|
|
816
|
+
|
|
817
|
+
# Concatenates the text of this element's direct children whose node type is
# 3 (text), skipping every other child and any falsy text value.
#
# @return [String]
def text_only_children
  pieces = []
  @doc.node_children(@id).each do |child_id|
    next unless @doc.node_type(child_id) == 3

    chunk = @doc.node_text(child_id)
    pieces << chunk if chunk
  end
  pieces.join
end
|
|
823
|
+
|
|
824
|
+
# Try native first; fall back to the lazy Dom view on the
|
|
825
|
+
# wrapper. Returns an Array of Element wrappers (native or
|
|
826
|
+
# dom-backed).
|
|
827
|
+
# Resolves `selector_str` relative to this element.
#
# Resolution order:
#   1. Already dom-backed: query @dom_node directly.
#   2. Single-group selector that compiles natively: run the plan.
#   3. Selector that splits into several groups (commas and/or `:is(...)`
#      expansion): run every group natively and merge ids, first hit wins.
#   4. Otherwise query the wrapper's Ruby Dom view (empty Array without a
#      wrapper).
#
# `limit_one` is honoured on every path, so callers always get at most one
# result when they ask for one.
#
# @param selector_str [String] selector with any pseudo-element already peeled
# @param limit_one [Boolean] stop after the first match
# @return [Array<Element>] native or dom-backed Element wrappers
def css_native_or_fallback(selector_str, limit_one: false)
  if dom_node?
    # Text / comment / doctype dom nodes don't support .css; NodeSet#children
    # can still pump them through a subsequent `.css` call, so answer with an
    # empty Array instead of raising "undefined method `css`".
    return [] unless @dom_node.respond_to?(:css)

    found = @dom_node.css(selector_str).to_a
    found = found.first(1) if limit_one
    return found.map { |n| wrap_dom(n) }
  end

  w = wrapper
  groups = nil
  if selector_str.include?(",")
    groups = Native.split_selector_groups(selector_str)
                   .flat_map { |g| Native.expand_is_groups(g) }
  else
    # Fast path: single-group selector with a (possibly cached) plan.
    plan = w ? w.compiled_plan(selector_str) : Native.compile_selector_chain(selector_str)
    if plan
      ids = @doc.run_chain(plan, @id)
      ids = ids.first(1) if limit_one
      return ids.map { |nid| Element.new(@doc, nid, w) }
    end
    # Failed to compile as-is: try distributing `:is(...)` alternatives into
    # separate groups before giving up on the native engine.
    expanded = Native.expand_is_groups(selector_str)
    groups = expanded if expanded.size > 1
  end

  if groups
    ids = native_group_ids(groups, w, limit_one)
    return ids.map { |nid| Element.new(@doc, nid, w) } if ids
  end

  return [] unless w

  dom_scope = w.locate_in_dom(path) || w.fallback_dom
  found = dom_scope.css(selector_str).to_a
  found = found.first(1) if limit_one
  found.map { |n| wrap_dom(n) }
end

# Runs each selector group natively (scoped to this element) and merges the
# resulting ids in first-seen order without duplicates.
#
# @return [Array, nil] merged ids, or nil as soon as a group fails to compile
def native_group_ids(groups, w, limit_one)
  seen = {}
  groups.each do |group|
    plan = w ? w.compiled_plan(group) : Native.compile_selector_chain(group)
    return nil if plan.nil?

    @doc.run_chain(plan, @id).each do |nid|
      next if seen[nid]

      seen[nid] = true
      # With limit_one we stop at the first hit and never look at later groups.
      return seen.keys if limit_one
    end
  end
  seen.keys
end
|
|
909
|
+
end
|
|
910
|
+
|
|
911
|
+
# Install Element#at_css / Element#css as C methods. The C versions
|
|
912
|
+
# do the shape check, plan-cache lookup, run-with-limit, and Element
|
|
913
|
+
# allocation — all without re-entering Ruby method dispatch — and
|
|
914
|
+
# fall through to at_css_slow / css_slow only when the selector
|
|
915
|
+
# shape isn't supported by the fast path.
|
|
916
|
+
if Native.respond_to?(:_register_element_methods)
  Native._register_element_methods(Element)
  Element.class_eval do
    # Order matters: `at` / `search` must be aliased after `at_css` / `css`
    # have been pointed at the native implementations.
    {
      at_css: :native_at_css,
      css: :native_css,
      at: :at_css,
      search: :css
    }.each { |public_name, target| alias_method public_name, target }
  end
end
|
|
925
|
+
# Same installation step for Scrapetor::Node, when that class is loaded and
# the native extension exposes the registration hook.
if Native.respond_to?(:_register_node_methods) && defined?(Scrapetor::Node)
  Native._register_node_methods(Scrapetor::Node)
  Scrapetor::Node.class_eval do
    {
      at: :native_at,
      at_css: :native_at,
      css: :native_css,
      search: :native_css
    }.each { |public_name, target| alias_method public_name, target }
  end
end
|
|
934
|
+
|
|
935
|
+
# Document wrapper — wraps Native::Document and provides Dom-like
|
|
936
|
+
# methods so `Scrapetor::Document#backing` can return one of these
|
|
937
|
+
# interchangeably with `Dom::Document`.
|
|
938
|
+
class DocumentWrapper
|
|
939
|
+
attr_reader :native
|
|
940
|
+
|
|
941
|
+
# The compile cache lives on the wrapper so repeated queries
|
|
942
|
+
# (the common case in scraping pipelines, where the same set of
|
|
943
|
+
# selectors run against thousands of pages) skip the parse +
|
|
944
|
+
# native-plan build entirely. Sized to cover typical templates;
|
|
945
|
+
# the oldest-inserted entries are evicted first once the cap is reached.
|
|
946
|
+
COMPILE_CACHE_CAP = 1024
|
|
947
|
+
|
|
948
|
+
# @param native [Object] the native document this wrapper fronts
def initialize(native)
  @native = native
  # Back-pointer so Elements created from this wrapper can find their way
  # back without threading `wrapper:` through every navigation method.
  if native.respond_to?(:instance_variable_set)
    native.instance_variable_set(:@__scrapetor_wrapper, self)
  end
  # Lazy Ruby Dom view; stays nil until something needs the fallback.
  @dom_doc = nil
  @dom_mode = false
  @compile_cache = {}
  # Path cache keyed by native node id; reset when the wrapper flips to
  # dom-mode.
  @path_cache = {}
end
|
|
961
|
+
|
|
962
|
+
# Reads a previously stored path string for a native node id.
#
# @param id [Object] native node id
# @return [String, nil] nil when nothing is cached for that id
def cached_path(id)
  @path_cache.fetch(id, nil)
end

# Remembers the path string for a native node id.
#
# @param id [Object] native node id
# @param str [String] path to cache
# @return [String] the stored value
def store_path(id, str)
  @path_cache.store(id, str)
end
|
|
969
|
+
|
|
970
|
+
# Look up (or compile) the native plan for a single selector group.
|
|
971
|
+
# `nil` means "this group uses a feature the native engine
|
|
972
|
+
# doesn't accept" — callers route those to the Ruby fallback.
|
|
973
|
+
# @param group_str [String] one selector group (no top-level comma)
# @return [Object, nil] the native plan, or nil when the group cannot be
#   compiled natively
#
# Failed compilations are cached too (stored as `false`), so the lookup must
# test for key presence rather than truthiness: otherwise every negative
# entry is treated as a miss, recompiled on each call, and its re-insertion
# evicts a live entry once the cache is full.
def compiled_plan(group_str)
  if @compile_cache.key?(group_str)
    cached = @compile_cache[group_str]
    return cached == false ? nil : cached
  end

  plan = Native.compile_selector_chain(group_str)
  # Hash#shift drops the oldest-inserted entry, keeping the cache bounded.
  @compile_cache.shift if @compile_cache.size >= COMPILE_CACHE_CAP
  @compile_cache[group_str] = plan.nil? ? false : plan
  # Opt-in trace of selectors that will be routed to the Ruby fallback;
  # emitted once per cached miss.
  warn "[scrap-fallback] #{group_str}" if plan.nil? && ENV["SCRAP_TRACE_FALLBACK"]
  plan
end
|
|
985
|
+
|
|
986
|
+
# The wrapper stands in for a document node, never an element.
def element?
  false
end

def document?
  true
end

# @return [String] the fixed node name used for documents
def name
  "#document"
end

# @return [Element] a native-backed Element for the arena's root id
def root
  Element.new(@native, @native.root_id, self)
end

def root_element
  root
end

# Document text, read from the Ruby Dom view (built on first use).
def text
  fallback_dom.text
end

def content
  text
end
|
|
999
|
+
|
|
1000
|
+
# ----- selector entry points -----
|
|
1001
|
+
|
|
1002
|
+
# `lazy_css` is the fast path that Document#css uses: it returns
|
|
1003
|
+
# raw ids when the native engine can handle the whole selector,
|
|
1004
|
+
# so the Element-wrap happens once-per-iteration instead of
|
|
1005
|
+
# once-per-result. Falls back to the eager `css` when native
|
|
1006
|
+
# can't handle the selector (kind, fallback dom, etc.).
|
|
1007
|
+
#
|
|
1008
|
+
# Returns a `LazyIds` struct OR an Array of strings (for
|
|
1009
|
+
# ::text/::attr) OR an Array of Element wrappers (when the
|
|
1010
|
+
# selector needs the Dom fallback).
|
|
1011
|
+
LazyIds = Struct.new(:wrapper, :native, :ids)
|
|
1012
|
+
|
|
1013
|
+
# @param selector [#to_s] CSS selector, optionally ending in a pseudo-element
# @return [LazyIds, Array] LazyIds for plain native matches; otherwise an
#   Array of text/attr values or Element wrappers
def lazy_css(selector)
  str = selector.to_s

  # Heterogeneous pseudo groups: resolve each comma group on its own and
  # concatenate, materialising LazyIds into Elements so the result is flat.
  if str.include?(",") && str.include?("::") &&
     Native.heterogeneous_pseudo_groups?(str)
    flattened = Native.split_selector_groups(str).flat_map do |group|
      result = lazy_css(group)
      next result.to_a unless result.is_a?(LazyIds)

      result.ids.map { |nid| Element.new(@native, nid, self) }
    end
    return flattened
  end

  stripped, kind, arg = Native.peel_pseudo_element(str)
  stripped = "*" if stripped.empty?

  # Native fast paths; skipped entirely once the wrapper is in dom-mode.
  unless @dom_mode
    case kind
    when nil
      ids = native_ids(stripped)
      return LazyIds.new(self, @native, ids) if ids
    when :text, :text_approx
      ids = native_ids(stripped)
      return wire_parent_nodes!(@native.bulk_text(ids), ids) if ids
    when :attr
      ids = native_ids(stripped)
      return wire_parent_nodes!(@native.bulk_attr(ids, arg), ids) if ids
    end
  end

  apply_transform(css_native_or_fallback(stripped), kind, arg)
end
|
|
1045
|
+
|
|
1046
|
+
# Set each TextNode's parent_node to the matching element it
|
|
1047
|
+
# came from. Production parser code (Google Light's organic
|
|
1048
|
+
# results, Yahoo's knowledge graph) chains `result.parent.css(...)`
|
|
1049
|
+
# to walk into siblings of a `::text` match — without a parent
|
|
1050
|
+
# ref the `.parent` returns nil and the next call crashes.
|
|
1051
|
+
# @param values [Array] results aligned index-for-index with `ids`
# @param ids [Array] native ids of the elements the values came from
# @return [Array] the same `values` object, with parent_node assigned on
#   every Scrapetor::TextNode entry
def wire_parent_nodes!(values, ids)
  values.each_with_index do |value, index|
    next unless value.is_a?(Scrapetor::TextNode)

    value.parent_node = Element.new(@native, ids[index], self)
  end
  values
end
|
|
1063
|
+
|
|
1064
|
+
# Eager selector entry point.
#
# The native bulk path is only taken for the pseudo-element kinds it can
# serve (text / text_approx / attr). Any other kind, e.g. :direct_text or
# :direct_attr, must fall through to apply_transform; returning the value of
# a `case` with no matching branch would hand the caller nil.
#
# @param selector [#to_s] CSS selector, optionally ending in a pseudo-element
# @return [Array] Element wrappers, or text/attr values for pseudo-elements
def css(selector)
  stripped, kind, arg = Native.peel_pseudo_element(selector.to_s)
  stripped = "*" if stripped.empty?

  if !@dom_mode && %i[text text_approx attr].include?(kind)
    ids = native_ids(stripped)
    if ids
      values = kind == :attr ? @native.bulk_attr(ids, arg) : @native.bulk_text(ids)
      return wire_parent_nodes!(values, ids)
    end
  end

  apply_transform(css_native_or_fallback(stripped), kind, arg)
end
|
|
1082
|
+
|
|
1083
|
+
# First match for `selector`, with any pseudo-element transform applied.
#
# @param selector [#to_s]
# @return [Object, nil] nil when nothing matches
def at_css(selector)
  stripped, kind, arg = Native.peel_pseudo_element(selector.to_s)
  stripped = "*" if stripped.empty?

  matches = css_native_or_fallback(stripped, limit_one: true)
  if matches.empty?
    nil
  elsif kind
    apply_transform(matches, kind, arg).first
  else
    matches.first
  end
end
|
|
1092
|
+
alias at at_css
|
|
1093
|
+
|
|
1094
|
+
# Run N selectors in ONE C call, returning an Array of results
|
|
1095
|
+
# parallel to `selectors`. Each result is either a `LazyIds`
|
|
1096
|
+
# (wrapped by Document#css as a lazy NodeSet) or an Array of
|
|
1097
|
+
# strings (for `::text` / `::attr` pseudo-elements). Selectors
|
|
1098
|
+
# the native engine can't compile fall through to the per-query
|
|
1099
|
+
# Ruby path; the rest amortise to one Ruby dispatch.
|
|
1100
|
+
# @param doc [Object] owner passed to Scrapetor::NodeSet.new for lazy results
# @param selectors [Array<#to_s>] selectors to evaluate
# @return [Array] one result per selector, in the same order
#
# Only plain selectors and the text / text_approx / attr pseudo-elements are
# batched natively. Every other pseudo kind (e.g. :direct_text, :direct_attr)
# is routed through lazy_css so its transform is applied; batching those
# would hand back the bare matching nodes.
def batch_css(doc, selectors)
  kinds = Array.new(selectors.size)
  args = Array.new(selectors.size)
  natives = []
  native_to_orig = []
  fallback_indices = []

  selectors.each_with_index do |sel, i|
    stripped, kind, arg = Native.peel_pseudo_element(sel.to_s)
    stripped = "*" if stripped.empty?
    kinds[i] = kind
    args[i] = arg

    batchable = !@dom_mode && !stripped.include?(",") &&
                (kind.nil? || %i[text text_approx attr].include?(kind))
    plan = batchable ? compiled_plan(stripped) : nil
    if plan
      natives << plan
      native_to_orig << i
    else
      fallback_indices << i
    end
  end

  out = Array.new(selectors.size)

  # One C call across all native plans.
  unless natives.empty?
    id_lists = @native.batch_chain(natives, nil)
    id_lists.each_with_index do |ids, j|
      orig = native_to_orig[j]
      out[orig] =
        case kinds[orig]
        when :text, :text_approx
          wire_parent_nodes!(@native.bulk_text(ids), ids)
        when :attr
          wire_parent_nodes!(@native.bulk_attr(ids, args[orig]), ids)
        else
          LazyIds.new(self, @native, ids)
        end
    end
  end

  # Per-selector Ruby path for the few that need it.
  fallback_indices.each { |i| out[i] = lazy_css(selectors[i]) }

  # Wrap lazy id lists the way Document#css would; pass everything else through.
  out.map! { |r| r.is_a?(LazyIds) ? Scrapetor::NodeSet.new(doc, r) : r }
  out
end
|
|
1162
|
+
|
|
1163
|
+
# XPath is not evaluated by this wrapper: queries always come back empty.
def xpath(_expr)
  []
end

def at_xpath(_expr)
  nil
end
|
|
1165
|
+
|
|
1166
|
+
# Delegates traversal to the root element.
#
# @yield [node] whatever root.traverse yields
# @return [self, Enumerator] self when a block is given, otherwise an
#   Enumerator over #traverse
def traverse(&block)
  if block
    root.traverse(&block)
    self
  else
    to_enum(:traverse)
  end
end
|
|
1171
|
+
|
|
1172
|
+
# Serialises the document from whichever backing is authoritative: the Ruby
# Dom once in dom-mode, the native arena otherwise.
#
# @return [String]
def to_html
  return @native.html unless @dom_mode

  @dom_doc.to_html
end
|
|
1175
|
+
alias to_s to_html
|
|
1176
|
+
|
|
1177
|
+
# @return [Element] the root element (same as #root)
def html
  root
end

# @return [Object, nil] first match for "body"
def body
  at_css("body")
end

# @return [Object, nil] first match for "head"
def head
  at_css("head")
end
|
|
1188
|
+
|
|
1189
|
+
# ----- internals for the mutation fallback -----
|
|
1190
|
+
|
|
1191
|
+
# @return [Boolean] the current @dom_mode flag
def dom_mode?
  @dom_mode
end
|
|
1192
|
+
|
|
1193
|
+
# Build (once) and return the Ruby DOM view of this document.
|
|
1194
|
+
# Used by Element#css fallback when the selector exceeds the
|
|
1195
|
+
# native engine's grammar, and by Element mutations.
|
|
1196
|
+
#
|
|
1197
|
+
# The previous implementation re-tokenised the entire HTML
|
|
1198
|
+
# through the Ruby SAX parser — for a 400 KB page that's
|
|
1199
|
+
# 50–100 ms on the first mutating call. The native arena is
|
|
1200
|
+
# already parsed; we can build the Dom tree by walking it
|
|
1201
|
+
# node-by-node in O(N) instead of O(bytes). That drops to
|
|
1202
|
+
# ~5–10 ms on the same page.
|
|
1203
|
+
# @return [Object] the memoised Ruby Dom document, built on first call
def fallback_dom
  return @dom_doc if @dom_doc

  @dom_doc = build_dom_from_native
end
|
|
1206
|
+
|
|
1207
|
+
# O(N nodes) tree-walk that materialises a Scrapetor::Dom
|
|
1208
|
+
# mirror of the native arena. Used for the mutation fallback
|
|
1209
|
+
# path so node mutations have a Ruby-side handle to operate
|
|
1210
|
+
# on, without re-tokenising the source HTML.
|
|
1211
|
+
# @return [Scrapetor::Dom::Document] a new document; empty when the arena
#   holds at most one node
def build_dom_from_native
  doc = Scrapetor::Dom::Document.new
  size = @native.size
  return doc if size <= 1

  # Index = native id, value = the Dom node built for it; id 0 is the document.
  id_to_dom = Array.new(size)
  id_to_dom[0] = doc

  (1...size).each do |nid|
    type = @native.node_type(nid)
    # Only element (1), text (3) and comment (8) nodes are mirrored; every
    # other type, including removed nodes, is skipped.
    next unless type == 1 || type == 3 || type == 8

    parent_id = @native.node_parent(nid) || 0
    # A parent that was not mirrored falls back to the document itself.
    parent_dom = id_to_dom[parent_id] || doc
    node =
      if type == 1
        Scrapetor::Dom::Element.new(@native.node_name(nid), @native.node_attributes(nid))
      elsif type == 3
        Scrapetor::Dom::Text.new(@native.node_text(nid))
      else
        Scrapetor::Dom::Comment.new(@native.node_text(nid))
      end
    parent_dom.add_child(node)
    id_to_dom[nid] = node
  end
  doc
end
|
|
1245
|
+
|
|
1246
|
+
# Promote the document to dom-mode. After this, css() runs only
|
|
1247
|
+
# against the Dom view (it is the source of truth for mutations
|
|
1248
|
+
# the user has already made).
|
|
1249
|
+
# @return [Hash] the freshly reset (empty) path cache
def switch_to_dom!
  # Make sure the Dom view exists before declaring it authoritative.
  fallback_dom
  @dom_mode = true
  # Cached paths may not survive a mutation series; let them rebuild lazily
  # after the switch.
  @path_cache = Hash.new
end
|
|
1256
|
+
|
|
1257
|
+
# Walk a `/tag[idx]/.../tag[@id='x']` path inside the lazy Dom
|
|
1258
|
+
# view. Used by Element#ensure_dom! to relocate itself after
|
|
1259
|
+
# promotion.
|
|
1260
|
+
# @param path_str [#to_s] slash-separated path; each step is `tag[n]`
#   (1-based position among same-named element children of the current node)
#   or `tag[@id='x']` (first such element anywhere in the document)
# @return [Object, nil] the node reached, the document itself for an empty
#   path, or nil when any step cannot be resolved or parsed
def locate_in_dom(path_str)
  doc = fallback_dom
  steps = path_str.to_s.split("/").reject(&:empty?)

  steps.reduce(doc) do |cur, step|
    if (m = step.match(/\A([\w-]+)\[@id='([^']+)'\]\z/))
      tag = m[1]
      wanted_id = m[2]
      hit = nil
      # Id steps are resolved document-wide, not relative to `cur`.
      walk_elements(doc) do |el|
        next unless el.name == tag && el["id"] == wanted_id

        hit = el
        break
      end
      return nil if hit.nil?

      hit
    elsif (m = step.match(/\A([\w-]+)\[(\d+)\]\z/))
      tag = m[1]
      idx = m[2].to_i
      children = cur.respond_to?(:children) ? cur.children : []
      same = children.select { |c| c.respond_to?(:element?) && c.element? && c.name == tag }
      return nil unless idx.between?(1, same.length)

      same[idx - 1]
    else
      return nil
    end
  end
end
|
|
1288
|
+
|
|
1289
|
+
# Robust cross-DOM lookup. Native ids enumerate every node in
|
|
1290
|
+
# the arena (text, comments, elements). Both parsers visit
|
|
1291
|
+
# ELEMENT nodes in document order, so the N-th element on the
|
|
1292
|
+
# native side is the N-th element on the Ruby side — even when
|
|
1293
|
+
# the two parsers disagree on whitespace text nodes or implicit
|
|
1294
|
+
# close-tag handling. Used as a fallback when the path-based
|
|
1295
|
+
# locator can't find a match.
|
|
1296
|
+
# @param native_id [Object] native arena id
# @return [Object, nil] the Dom element at the same element ordinal, or nil
#   when the id is not a native element
#
# NOTE(review): both lookup tables are memoised for the wrapper's lifetime;
# confirm they remain valid if the Dom tree is mutated before the first call.
def locate_dom_by_native_id(native_id)
  offsets = (@native_element_offset_map ||= build_native_element_offset_map)
  ordinal = offsets[native_id]
  return nil if ordinal.nil?

  (@dom_element_index ||= build_dom_element_index)[ordinal]
end
|
|
1303
|
+
|
|
1304
|
+
private
|
|
1305
|
+
|
|
1306
|
+
# Maps each native id that is an element to its zero-based ordinal among all
# native elements, scanning ids in ascending order.
#
# @return [Hash] native id => element ordinal
def build_native_element_offset_map
  ordinal = 0
  (0...@native.size).each_with_object({}) do |nid, offsets|
    next unless @native.node_is_element(nid)

    offsets[nid] = ordinal
    ordinal += 1
  end
end
|
|
1320
|
+
|
|
1321
|
+
# Collects every element that walk_elements visits under the Dom view, in
# visit order.
#
# @return [Array]
def build_dom_element_index
  [].tap do |elements|
    walk_elements(fallback_dom) { |el| elements.push(el) }
  end
end
|
|
1326
|
+
|
|
1327
|
+
public
|
|
1328
|
+
|
|
1329
|
+
# Run the cached plan(s) for a selector and return the raw id
|
|
1330
|
+
# Array, or nil if any group needs the Ruby fallback. Used by
|
|
1331
|
+
# css() to feed bulk_text / bulk_attr without intermediate
|
|
1332
|
+
# Element allocations.
|
|
1333
|
+
# @param selector_str [String] selector with any pseudo-element peeled off
# @return [Array, nil] matching native ids (document-wide), or nil when the
#   selector, or any group it expands to, has no native plan
def native_ids(selector_str)
  groups =
    if selector_str.include?(",")
      Native.split_selector_groups(selector_str)
            .flat_map { |g| Native.expand_is_groups(g) }
    else
      # Single group: a direct plan wins and its ids are returned as-is.
      plan = compiled_plan(selector_str)
      return @native.run_chain(plan, nil) if plan

      # Otherwise the selector only stays native if `:is(...)` expansion
      # produces more than one alternative.
      expanded = Native.expand_is_groups(selector_str)
      return nil if expanded.size <= 1

      expanded
    end

  # Insertion-ordered Hash doubles as the de-duplicated result list.
  seen = {}
  groups.each do |group|
    plan = compiled_plan(group)
    return nil unless plan

    @native.run_chain(plan, nil).each do |nid|
      seen[nid] = true unless seen[nid]
    end
  end
  seen.keys
end
|
|
1369
|
+
|
|
1370
|
+
# Applies a peeled pseudo-element to a list of matched nodes.
#
# @param nodes [Array] matched nodes
# @param kind [Symbol, nil] nil (no transform), :text, :text_approx, :attr,
#   :direct_text or :direct_attr
# @param arg [Object] attribute name for the attr kinds
# @return [Array, nil] `nodes` itself for a nil kind; otherwise TextNode
#   values (:attr keeps a nil placeholder for nodes lacking the attribute,
#   :direct_attr drops them); nil for any other kind
def apply_transform(nodes, kind, arg)
  # Wraps a value in a TextNode and links it back to its source element.
  text_node_for = lambda do |value, origin|
    Scrapetor::TextNode.new(value).tap do |text_node|
      text_node.parent_node = origin if origin.respond_to?(:element?) && origin.element?
    end
  end
  attr_of = ->(node) { node.respond_to?(:[]) ? node[arg] : nil }

  case kind
  when nil
    nodes
  when :text, :text_approx
    nodes.map { |n| text_node_for.call(n.respond_to?(:text) ? n.text.to_s : n.to_s, n) }
  when :attr
    nodes.map do |n|
      value = attr_of.call(n)
      value.nil? ? nil : text_node_for.call(value, n)
    end
  when :direct_text
    nodes.map { |n| text_node_for.call(direct_text_of_any(n), n) }
  when :direct_attr
    collected = []
    nodes.each do |n|
      value = attr_of.call(n)
      collected << text_node_for.call(value, n) unless value.nil?
    end
    collected
  end
end
|
|
1405
|
+
|
|
1406
|
+
# Direct text-node children of an element. Used at the
|
|
1407
|
+
# Document/wrapper level — accepts either a native Element or a
|
|
1408
|
+
# Dom-fallback node and pulls only the immediate text children.
|
|
1409
|
+
# @param n [Object] native-backed Element, or any object exposing #children
# @return [String] concatenated direct text; "" when `n` has neither form
#
# NOTE(review): in the Dom branch, children that are neither text nor element
# are appended via #to_s, whereas the native branch only reads type-3 (text)
# nodes. Confirm that is the intended treatment of comment children.
def direct_text_of_any(n)
  buf = +""
  if n.is_a?(Element) && !n.send(:dom_node?)
    # Native element: walk the sibling chain of its first child.
    child = @native.node_first_child(n.id)
    while child
      buf << @native.node_text(child).to_s if @native.node_type(child) == 3
      child = @native.node_next_sibling(child)
    end
  elsif n.respond_to?(:children)
    n.children.each do |kid|
      if kid.respond_to?(:text?) && kid.text?
        buf << (kid.respond_to?(:text) ? kid.text.to_s : kid.to_s)
      elsif !(kid.respond_to?(:element?) && kid.element?)
        buf << kid.to_s
      end
    end
  end
  buf
end
|
|
1430
|
+
|
|
1431
|
+
private
|
|
1432
|
+
|
|
1433
|
+
# Depth-first, pre-order walk over the element descendants of `scope`.
# Non-element children are skipped and not descended into.
#
# @param scope [Object] node exposing #children (anything else has none)
# @yield [element] each element descendant, parents before their children
def walk_elements(scope, &block)
  kids = scope.respond_to?(:children) ? scope.children : []
  kids.each do |kid|
    if kid.respond_to?(:element?) && kid.element?
      yield kid
      walk_elements(kid, &block)
    end
  end
end
|
|
1441
|
+
|
|
1442
|
+
# Document-wide selector resolution.
#
# Outside dom-mode the native engine is tried first: a directly compilable
# single group, then the groups obtained from comma splitting and/or
# `:is(...)` expansion, merged in first-seen order without duplicates.
# If any group has no native plan, or once the wrapper is in dom-mode (the
# native arena is stale wrt user mutations), the Ruby Dom view answers.
#
# @param selector_str [String] selector with any pseudo-element peeled off
# @param limit_one [Boolean] return at most one result
# @return [Array<Element>] native or dom-backed Element wrappers
def css_native_or_fallback(selector_str, limit_one: false)
  unless @dom_mode
    groups = nil
    if selector_str.include?(",")
      groups = Native.split_selector_groups(selector_str)
                     .flat_map { |g| Native.expand_is_groups(g) }
    else
      # Fast path: single-group selector with cached plan.
      plan = compiled_plan(selector_str)
      if plan
        ids = @native.run_chain(plan, nil)
        ids = ids.first(1) if limit_one
        return ids.map { |nid| Element.new(@native, nid, self) }
      end
      expanded = Native.expand_is_groups(selector_str)
      groups = expanded if expanded.size > 1
    end

    if groups
      seen = {}
      every_group_compiled = true
      groups.each do |group|
        group_plan = compiled_plan(group)
        if group_plan.nil?
          every_group_compiled = false
          break
        end
        @native.run_chain(group_plan, nil).each do |nid|
          next if seen[nid]

          seen[nid] = true
          break if limit_one
        end
        # With limit_one the first hit ends the search; later groups are
        # never compiled or run.
        break if limit_one && !seen.empty?
      end
      return seen.keys.map { |nid| Element.new(@native, nid, self) } if every_group_compiled
    end
  end

  # Not natively supported (or dom-mode): route to the Dom view.
  list = fallback_dom.css(selector_str).to_a
  list = list.first(1) if limit_one
  list.map { |n| wrap_dom_node(n) }
end
|
|
1514
|
+
|
|
1515
|
+
# Wraps a Dom node in an Element bound to this wrapper. The native id is a
# placeholder 0; @dom_node is pre-set to the given node.
#
# @param dom_el [Object] Dom node
# @return [Element]
def wrap_dom_node(dom_el)
  Element.new(@native, 0, self).tap do |wrapped|
    wrapped.instance_variable_set(:@dom_node, dom_el)
  end
end
|
|
1520
|
+
end
|
|
1521
|
+
end # if AVAILABLE_DOM
|
|
1522
|
+
|
|
1523
|
+
# ----- selector compilation: CSS string -> chain of native plans -----
|
|
1524
|
+
|
|
1525
|
+
# Each plan entry is `[selector_atom, combinator_or_nil]`.
|
|
1526
|
+
# selector_atom = [tag, classes, id, attrs]
|
|
1527
|
+
# selector_atom = [tag, classes, id, attrs, pseudo_data] # extended
|
|
1528
|
+
# pseudo_data = nil | [flags, nth_a, nth_b, nth_type_a, nth_type_b,
|
|
1529
|
+
# not_inner, is_inner, has_inner]
|
|
1530
|
+
# combinator = nil | "descendant" | "child" | "adjacent" | "sibling"
|
|
1531
|
+
#
|
|
1532
|
+
# Returns nil (never raises) when the selector contains a feature
|
|
1533
|
+
# the native engine doesn't accept (sibling combinator, comma at
|
|
1534
|
+
# top level, pseudo with a non-simple inner selector). Callers route
|
|
1535
|
+
# those to the Ruby DOM fallback in this same gem.
|
|
1536
|
+
|
|
1537
|
+
# Mirrors C_PS_* in ext/scrapetor/native/scrapetor_dom.c. Keep in sync.
|
|
1538
|
+
# Argument-less pseudo-classes, one bit each.
NATIVE_PSEUDO_FLAGS = {
  # structural
  "first-child" => 1 << 0,
  "last-child" => 1 << 1,
  "only-child" => 1 << 2,
  "first-of-type" => 1 << 3,
  "last-of-type" => 1 << 4,
  "only-of-type" => 1 << 5,
  "empty" => 1 << 6,
  "root" => 1 << 7,
  # form state
  "checked" => 1 << 8,
  "disabled" => 1 << 9,
  "enabled" => 1 << 10,
  "required" => 1 << 11,
  "optional" => 1 << 12,
  "read-only" => 1 << 13,
  "read-write" => 1 << 14,
  # `:any-link` and `:link` deliberately share one bit
  "any-link" => 1 << 15,
  "link" => 1 << 15,
  "scope" => 1 << 23
}.freeze

# `:nth-*` pseudo-classes; each also carries an (a, b) pair parsed from its
# argument.
NATIVE_NTH_BITS = {
  "nth-child" => 1 << 16,
  "nth-last-child" => 1 << 17,
  "nth-of-type" => 1 << 18,
  "nth-last-of-type" => 1 << 19
}.freeze

# Sentinel meaning "this pseudo list cannot be handled natively".
NATIVE_PSEUDO_FALLBACK = :__scrapetor_native_fallback__
|
|
1567
|
+
|
|
1568
|
+
# Translates a parsed selector into the Array form handed to the native
# engine: one `[[tag, classes, id, attrs, pseudo_data], combinator]` pair per
# atom.
#
# @param selector_str [String] a single selector group
# @return [Array, nil] the plan; nil when a pseudo falls outside the native
#   subset or Scrapetor::Selector.compile raises ArgumentError
def self.compile_selector_chain(selector_str)
  combinator_names = {
    descendant: "descendant",
    child: "child",
    adj: "adjacent",
    gen: "sibling"
  }

  chain = []
  Scrapetor::Selector.compile(selector_str).each do |atom|
    pseudos = atom.pseudos
    pseudo_data = nil
    if pseudos && !pseudos.empty?
      pseudo_data = native_pseudo_data(pseudos)
      # One unsupported pseudo disqualifies the whole chain.
      return nil if pseudo_data == NATIVE_PSEUDO_FALLBACK
    end

    tag = atom.tag ? atom.tag.to_s : nil
    chain << [
      [tag, atom.classes, atom.id, atom.attrs, pseudo_data],
      combinator_names[atom.combinator]
    ]
  end
  chain
rescue ArgumentError
  nil
end
|
|
1598
|
+
|
|
1599
|
+
# Compile the Atom#pseudos list into the 13-element Array the C side
# reads:
#
#   [flags, nth_a, nth_b, nth_type_a, nth_type_b,
#    not_inner, is_inner, has_inner, not_has_inner,
#    has_child_inner, not_has_child_inner,
#    has_chain_inner, not_has_chain_inner]
#
# +pseudos+ is a list of `[name, arg, double_colon]` triples.
#
# Returns NATIVE_PSEUDO_FALLBACK if any pseudo is outside the native
# subset (in which case the whole chain falls back to Ruby).
#
# The encoding has exactly one (a, b) pair per nth family and one flat
# list per pool, so a few combinations cannot be represented and also
# fall back rather than being silently mis-encoded:
#
# * two pseudos that need the same nth slot (`:nth-child` together with
#   `:nth-last-child`, or `:nth-of-type` with `:nth-last-of-type`) —
#   the second pair would overwrite the first;
# * a second `:is(...)`, `:has(...)`, `:has(> ...)` or sibling `:has`
#   writing to an already populated pool — separate pseudos are ANDed,
#   but one flat pool is indistinguishable from a single
#   comma-separated (any-of) argument;
# * a second chain-form `:has(...)` / `:not(:has(...))` — the chain
#   pools are assigned, not appended, so the earlier chains would be
#   dropped.
#
# Negated simple pools (`:not(...)`, `:not(:has(...))`,
# `:not(:has(> ...))`) are still concatenated: "none of A" AND "none of
# B" equals "none of A or B".
def self.native_pseudo_data(pseudos)
  flags = 0
  nth_a = nth_b = 0
  nth_type_a = nth_type_b = 0
  nth_taken = false
  nth_type_taken = false
  not_inner = []
  is_inner = []
  has_inner = []
  not_has_inner = []
  has_child_inner = []
  not_has_child_inner = []
  has_chain_inner = []
  not_has_chain_inner = []

  pseudos.each do |name, arg, double_colon|
    return NATIVE_PSEUDO_FALLBACK if double_colon

    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return NATIVE_PSEUDO_FALLBACK unless a
      if name == "nth-of-type" || name == "nth-last-of-type"
        # Only one (a, b) pair exists for the *-of-type family.
        return NATIVE_PSEUDO_FALLBACK if nth_type_taken
        nth_type_taken = true
        nth_type_a, nth_type_b = a, b
      else
        # Only one (a, b) pair exists for the *-child family.
        return NATIVE_PSEUDO_FALLBACK if nth_taken
        nth_taken = true
        nth_a, nth_b = a, b
      end
      flags |= bit
    elsif name == "not"
      # `:not(:has(X, Y))` — common scrape pattern. Rather than
      # forcing a Ruby Dom fallback, recognise the shape at compile
      # time and emit the not-has bit on the outer atom with the inner
      # simple atoms in not_has_inner.
      if (nh = parse_not_has_form(arg))
        not_has_inner.concat(nh)
        flags |= (1 << 24)
        next
      end
      # `:not(:has(> X))` direct-child variant.
      if (nhc = parse_not_has_child_form(arg))
        not_has_child_inner.concat(nhc)
        flags |= (1 << 26)
        next
      end
      # `:not(:has(X Y, A B, ...))` — chain inner with multiple
      # alternatives. Mirrors `:has(X Y, A B)` (1<<27) but negated.
      if (nchains = parse_not_has_chains_form(arg))
        # The pool is assigned, so a second chain-form :not would drop
        # the first one's chains.
        return NATIVE_PSEUDO_FALLBACK unless not_has_chain_inner.empty?
        not_has_chain_inner = nchains
        flags |= (1 << 29)
        next
      end
      inner = native_inner_simples(arg)
      return NATIVE_PSEUDO_FALLBACK if inner == NATIVE_PSEUDO_FALLBACK
      not_inner.concat(inner)
      flags |= (1 << 20)
    elsif name == "is" || name == "matches" || name == "where"
      # `:is(a):is(b)` means "a AND b"; one flat pool would read "a OR b".
      return NATIVE_PSEUDO_FALLBACK unless is_inner.empty?
      inner = native_inner_simples(arg)
      return NATIVE_PSEUDO_FALLBACK if inner == NATIVE_PSEUDO_FALLBACK
      is_inner.concat(inner)
      flags |= (1 << 21)
    elsif name == "has"
      # `:has(>::text)` / `:has(::text)` — "node has a direct
      # text-node child". Non-standard; maps to a single flag bit and
      # needs no pool.
      if has_text_child_form?(arg)
        flags |= (1 << 28)
        next
      end
      # `:has(> X, > Y)` — leading combinator inside :has.
      # native_inner_simples requires a single atom per group, so this
      # shape is detected explicitly and the *child* atoms are lifted
      # into has_child_inner.
      if (hc = parse_has_child_form(arg))
        return NATIVE_PSEUDO_FALLBACK unless has_child_inner.empty?
        has_child_inner.concat(hc)
        flags |= (1 << 25)
        next
      end
      # `:has(+ X, + Y)` / `:has(~ X, ~ Y)` — sibling-from-scope
      # variants. They share has_inner with the plain form below, so
      # only one of the three may populate it.
      if (hs = parse_has_sib_form(arg, "+"))
        return NATIVE_PSEUDO_FALLBACK unless has_inner.empty?
        has_inner.concat(hs)
        flags |= (1 << 30)
        next
      end
      if (hs = parse_has_sib_form(arg, "~"))
        return NATIVE_PSEUDO_FALLBACK unless has_inner.empty?
        has_inner.concat(hs)
        flags |= (1 << 31)
        next
      end
      # `:is(...)` inside :has: distribute alternatives so an inner
      # like `:is(h2, span).a-color-base` becomes
      # `h2.a-color-base, span.a-color-base` before it is handed to
      # native_inner_simples (which needs single-atom groups). Force
      # distribution even for single-atom alternatives.
      arg_expanded = Native.split_selector_groups(arg)
                           .flat_map { |g| Native.expand_is_groups(g, force: true) }
                           .join(", ")
      inner = native_inner_simples(arg_expanded)
      if inner != NATIVE_PSEUDO_FALLBACK
        return NATIVE_PSEUDO_FALLBACK unless has_inner.empty?
        has_inner.concat(inner)
        flags |= (1 << 22)
        next
      end
      # `:has(X Y, A B, ...)` — multi-chain. Each comma alternative is
      # its own chain of simple atoms joined by combinators.
      if (chains = parse_has_chains_form(arg))
        # Assigned, not appended: a second chain-form :has would drop
        # the first one's chains.
        return NATIVE_PSEUDO_FALLBACK unless has_chain_inner.empty?
        has_chain_inner = chains
        flags |= (1 << 27)
        next
      end
      return NATIVE_PSEUDO_FALLBACK
    else
      return NATIVE_PSEUDO_FALLBACK
    end
  end

  [flags, nth_a, nth_b, nth_type_a, nth_type_b, not_inner, is_inner, has_inner,
   not_has_inner, has_child_inner, not_has_child_inner, has_chain_inner,
   not_has_chain_inner]
end
|
|
1732
|
+
|
|
1733
|
+
# `:not(:has(X Y))` — single-chain convenience wrapper around
# parse_not_has_chains_form. Returns the one chain when that method
# yields exactly one, nil otherwise (including when it yields nil).
def self.parse_not_has_chain_form(arg)
  chains = parse_not_has_chains_form(arg)
  chains.first unless chains.nil? || chains.size != 1
end
|
|
1741
|
+
|
|
1742
|
+
# `:not(:has(...))` with a chain payload. +arg+ is the argument of the
# outer `:not`. It must be a single group that compiles to a single
# atom carrying nothing but one `:has(...)` pseudo-class (no tag,
# classes, id or attrs). The `:has` argument is then handed to
# parse_has_chains_form and its result returned.
#
# Returns nil for any other shape, or when an ArgumentError is raised
# along the way.
def self.parse_not_has_chains_form(arg)
  return nil if arg.nil? || arg.empty?

  alternatives = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless alternatives.size == 1

  compiled = Scrapetor::Selector.compile(alternatives.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil if pseudo_element || pseudo_name != "has"

  undecorated = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless undecorated

  parse_has_chains_form(has_arg)
rescue ArgumentError
  nil
end
|
|
1757
|
+
|
|
1758
|
+
# `:has(+ X, + Y)` / `:has(~ X, ~ Y)` — sibling forms of `:has`.
#
# Every comma group of +arg+ must begin with +combinator_char+ and the
# remainder must compile to exactly one atom. Returns one
# `[tag, classes, id, attrs]` entry per group, with the atom's leaf
# pseudo data appended as a fifth element when it has pseudos.
#
# Returns nil when +arg+ is nil/empty, a group lacks the combinator,
# the remainder is not a single atom, its pseudos are rejected by
# native_leaf_pseudo_data, or an ArgumentError is raised.
def self.parse_has_sib_form(arg, combinator_char)
  return nil if arg.nil? || arg.empty?

  Scrapetor::Dom::Selectors.selector_groups(arg).map do |group|
    trimmed = group.strip
    return nil unless trimmed.start_with?(combinator_char)

    compiled = Scrapetor::Selector.compile(trimmed[1..].lstrip)
    return nil unless compiled.size == 1

    leaf = compiled.first
    pseudo_data = nil
    if leaf.pseudos && !leaf.pseudos.empty?
      pseudo_data = native_leaf_pseudo_data(leaf.pseudos)
      return nil if pseudo_data.nil?
    end

    descriptor = [leaf.tag ? leaf.tag.to_s : nil, leaf.classes, leaf.id, leaf.attrs]
    descriptor << pseudo_data if pseudo_data
    descriptor
  end
rescue ArgumentError
  nil
end
|
|
1785
|
+
|
|
1786
|
+
# Shape detector for `:has(>::text)` / `:has(::text)`.
#
# True when +arg+, after trimming surrounding whitespace and at most
# one leading `>` (plus any whitespace following it), is exactly
# "::text". False for nil and for every other string.
def self.has_text_child_form?(arg)
  return false if arg.nil?

  arg.strip.delete_prefix(">").lstrip == "::text"
end
|
|
1797
|
+
|
|
1798
|
+
# `:has(X Y)` — single-chain convenience wrapper around
# parse_has_chains_form. Returns the one chain (an Array of
# `[simple_atom_entry, combo_str]` pairs) when that method yields
# exactly one chain, nil otherwise. Shape validation is entirely
# delegated to parse_has_chains_form.
def self.parse_has_chain_form(arg)
  chains = parse_has_chains_form(arg)
  chains.first unless chains.nil? || chains.size != 1
end
|
|
1809
|
+
|
|
1810
|
+
# `:has(X Y, A B, ...)` — multi-chain form.
#
# Returns an Array of chains, one per comma group of +arg+. Each chain
# is an Array of `[atom_entry, combinator_string]` pairs, one per atom
# of the group's compiled plan. +atom_entry+ is
# `[tag, classes, id, attrs]`, with pseudo data appended when the atom
# has pseudos (native_inner_simple_pseudo is tried first, then
# native_leaf_pseudo_data). The combinator string is "descendant" /
# "child" / "adjacent" / "sibling"; an atom with no combinator gets nil
# at index 0 and "descendant" elsewhere; any other combinator value
# gets nil.
#
# Returns nil when +arg+ is nil/empty, when there are no groups or more
# than eight, when a group compiles to an empty plan, when an atom's
# pseudos are rejected by both pseudo compilers, or when an
# ArgumentError is raised. Single-atom groups are accepted and come
# back as 1-long chains.
def self.parse_has_chains_form(arg)
  return nil if arg.nil? || arg.empty?

  alternatives = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil if alternatives.empty? || alternatives.size > 8

  alternatives.map do |alternative|
    atoms = Scrapetor::Selector.compile(alternative)
    return nil if atoms.empty?

    atoms.each_with_index.map do |atom, position|
      pseudo_data = nil
      if atom.pseudos && !atom.pseudos.empty?
        pseudo_data = native_inner_simple_pseudo(atom.pseudos) ||
                      native_leaf_pseudo_data(atom.pseudos)
        return nil if pseudo_data.nil?
      end

      descriptor = [atom.tag ? atom.tag.to_s : nil, atom.classes, atom.id, atom.attrs]
      descriptor << pseudo_data if pseudo_data

      link =
        case atom.combinator
        when :descendant then "descendant"
        when :child then "child"
        when :adj then "adjacent"
        when :gen then "sibling"
        when nil then (position.zero? ? nil : "descendant")
        end

      [descriptor, link]
    end
  end
rescue ArgumentError
  nil
end
|
|
1852
|
+
|
|
1853
|
+
# `:has(> X, > Y)` — direct-child form of `:has`.
#
# Every comma group of +arg+ must begin with `>` and the text after it
# must compile to exactly one atom. Returns one
# `[tag, classes, id, attrs]` entry per group (the right-hand side of
# the `>`), with leaf pseudo data appended as a fifth element when the
# atom has pseudos.
#
# Returns nil when +arg+ is nil/empty, a group does not start with `>`,
# the remainder is not a single atom, its pseudos are rejected by
# native_leaf_pseudo_data, or an ArgumentError is raised.
def self.parse_has_child_form(arg)
  return nil if arg.nil? || arg.empty?

  children = []
  Scrapetor::Dom::Selectors.selector_groups(arg).each do |group|
    trimmed = group.strip
    return nil unless trimmed.start_with?(">")

    compiled = Scrapetor::Selector.compile(trimmed[1..].lstrip)
    return nil unless compiled.size == 1

    child = compiled.first
    has_pseudos = child.pseudos && !child.pseudos.empty?
    pseudo_data = has_pseudos ? native_leaf_pseudo_data(child.pseudos) : nil
    return nil if has_pseudos && pseudo_data.nil?

    descriptor = [child.tag ? child.tag.to_s : nil, child.classes, child.id, child.attrs]
    descriptor << pseudo_data if pseudo_data
    children << descriptor
  end
  children
rescue ArgumentError
  nil
end
|
|
1880
|
+
|
|
1881
|
+
# `:not(:has(> X))` — direct-child negative form. +arg+ is the argument
# of the outer `:not`. It must be a single group compiling to a single
# atom that carries nothing but one `:has(...)` pseudo-class (no tag,
# classes, id or attrs). The `:has` argument is passed to
# parse_has_child_form and its result returned.
#
# Returns nil for any other shape, or when an ArgumentError is raised.
def self.parse_not_has_child_form(arg)
  return nil if arg.nil? || arg.empty?

  alternatives = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless alternatives.size == 1

  compiled = Scrapetor::Selector.compile(alternatives.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil if pseudo_element || pseudo_name != "has"

  undecorated = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless undecorated

  parse_has_child_form(has_arg)
rescue ArgumentError
  nil
end
|
|
1897
|
+
|
|
1898
|
+
# Inspect a `:not(...)` argument for the `:not(:has(simple, ...))`
# shape. +arg+ must be a single group compiling to a single atom that
# carries nothing but one `:has(...)` pseudo-class (no tag, classes, id
# or attrs). The `:has` argument is compiled with native_inner_simples
# and the resulting list of simple-atom entries is returned so the
# caller can lift it into its not-has pool.
#
# Returns nil for any other shape, when native_inner_simples reports
# NATIVE_PSEUDO_FALLBACK, or when an ArgumentError is raised.
def self.parse_not_has_form(arg)
  return nil if arg.nil? || arg.empty?

  alternatives = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless alternatives.size == 1

  compiled = Scrapetor::Selector.compile(alternatives.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil if pseudo_element || pseudo_name != "has"

  undecorated = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless undecorated

  simples = native_inner_simples(has_arg)
  simples == NATIVE_PSEUDO_FALLBACK ? nil : simples
rescue ArgumentError
  nil
end
|
|
1920
|
+
|
|
1921
|
+
# Compile an inner-selector argument (`:not(.x, :empty, .y[z])`) into
# an Array of simple-atom descriptors for the C engine.
#
# Each comma group of +arg+ must compile to exactly one atom (so no
# combinators). A group that is a pure `:is(...)` / `:matches(...)` /
# `:where(...)` atom is unwrapped recursively and its alternatives are
# spliced into the result. Any other atom becomes
# `[tag, classes, id, attrs]`, with pseudo data appended when it has
# pseudos — native_inner_simple_pseudo is tried first, then
# native_leaf_pseudo_data.
#
# +depth+ counts the `:is` unwrapping levels; callers normally omit it.
#
# Returns NATIVE_PSEUDO_FALLBACK when +arg+ is nil/empty, +depth+
# exceeds 4, a group is not a single atom, an atom's pseudos are
# rejected by both pseudo compilers, a nested unwrap falls back, or an
# ArgumentError is raised.
def self.native_inner_simples(arg, depth = 0)
  return NATIVE_PSEUDO_FALLBACK if arg.nil? || arg.empty?
  return NATIVE_PSEUDO_FALLBACK if depth > 4

  Scrapetor::Dom::Selectors.selector_groups(arg).each_with_object([]) do |group, pool|
    compiled = Scrapetor::Selector.compile(group)
    return NATIVE_PSEUDO_FALLBACK unless compiled.size == 1

    atom = compiled.first

    # `:has(:is(X, Y))` / `:not(:is(X, Y))` etc.: splice the
    # alternatives of a pure `:is(...)` atom straight into the pool.
    if pure_is_atom?(atom)
      unwrapped = native_inner_simples(atom.pseudos.first[1], depth + 1)
      return NATIVE_PSEUDO_FALLBACK if unwrapped == NATIVE_PSEUDO_FALLBACK
      pool.concat(unwrapped)
      next
    end

    pseudo_data = nil
    if atom.pseudos && !atom.pseudos.empty?
      # Nested shape first (`:not(simple)` / `:has(simple)` /
      # `:not(:has(simple))` on the inner atom), leaf-only second.
      pseudo_data = native_inner_simple_pseudo(atom.pseudos) ||
                    native_leaf_pseudo_data(atom.pseudos)
      return NATIVE_PSEUDO_FALLBACK if pseudo_data.nil?
    end

    descriptor = [atom.tag ? atom.tag.to_s : nil, atom.classes, atom.id, atom.attrs]
    descriptor << pseudo_data if pseudo_data
    pool << descriptor
  end
rescue ArgumentError
  NATIVE_PSEUDO_FALLBACK
end
|
|
1964
|
+
|
|
1965
|
+
# True when +atom+ consists of nothing but a single `:is(...)` /
# `:matches(...)` / `:where(...)` pseudo-class: no tag, classes, id or
# attrs, exactly one pseudo, and that pseudo is not a `::` pseudo-
# element. Such an atom merely wraps a list of alternatives, so callers
# can unwrap it into the surrounding pool. Anything else on the atom
# (e.g. `.x:is(...)`) would change the meaning and makes it ineligible.
def self.pure_is_atom?(atom)
  decorated = atom.tag || !atom.classes.empty? || atom.id || !atom.attrs.empty?
  return false if decorated

  pseudos = atom.pseudos
  return false unless pseudos && pseudos.size == 1

  pseudo_name, _arg, pseudo_element = pseudos.first
  !pseudo_element && %w[is matches where].include?(pseudo_name)
end
|
|
1976
|
+
|
|
1977
|
+
# Build the extended pseudo_data slot for an inner simple atom that
# itself carries `:not(simple)` / `:has(simple)` / `:not(:has(simple))`
# constraints.
#
# +pseudos+ is a list of `[name, arg, double_colon]` triples. The
# result always starts with
# `[flags, nth_a, nth_b, nth_type_a, nth_type_b]` and is extended, only
# as far as needed, with the optional pools at
#
#   index 5: inner_not      (from `:not(simple, ...)`)
#   index 6: inner_has      (from `:has(simple, ...)`)
#   index 7: inner_not_has  (from `:not(:has(simple, ...))`)
#   index 8: inner_has_chain (from `:has(X Y, ...)`)
#
# Earlier pools are padded with empty Arrays so the indices stay fixed.
#
# Returns nil when the shape isn't supported: a `::` pseudo-element, an
# unknown pseudo, an unparsable nth argument, or a `:not` / `:has`
# argument none of the helpers accepts. It also returns nil for
# combinations the layout cannot represent:
#
# * two pseudos that need the same nth slot (`:nth-child` with
#   `:nth-last-child`, or `:nth-of-type` with `:nth-last-of-type`) —
#   the second pair would overwrite the first;
# * a second `:has(...)` landing in an already populated inner_has or
#   inner_has_chain — separate `:has` pseudos are ANDed, but one pool
#   is indistinguishable from a single comma-separated (any-of)
#   argument, and the chain pool would simply be overwritten.
#
# The negated pools are concatenated: "none of A" AND "none of B"
# equals "none of A or B".
def self.native_inner_simple_pseudo(pseudos)
  flags = 0
  nth_a = nth_b = 0
  nth_type_a = nth_type_b = 0
  nth_taken = false
  nth_type_taken = false
  inner_not = []
  inner_has = []
  inner_not_has = []
  inner_has_chain = nil
  pseudos.each do |name, arg, double_colon|
    return nil if double_colon
    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return nil unless a
      if name == "nth-of-type" || name == "nth-last-of-type"
        # Only one (a, b) pair exists for the *-of-type family.
        return nil if nth_type_taken
        nth_type_taken = true
        nth_type_a, nth_type_b = a, b
      else
        # Only one (a, b) pair exists for the *-child family.
        return nil if nth_taken
        nth_taken = true
        nth_a, nth_b = a, b
      end
      flags |= bit
    elsif name == "not"
      # `:not(:has(simple))` → inner_not_has
      if (nh = parse_inner_not_has_form(arg))
        inner_not_has.concat(nh)
        next
      end
      sub = inner_pool_for(arg)
      return nil if sub.nil?
      inner_not.concat(sub)
    elsif name == "has"
      # Try simple-atom inner first.
      sub = inner_pool_for(arg)
      if sub
        # A second simple `:has` would turn "A AND B" into "A OR B".
        return nil unless inner_has.empty?
        inner_has.concat(sub)
      elsif (chain = parse_has_chains_form(arg))
        # Multi-atom chain alternatives go to inner_has_chain. The
        # pool is assigned, so a second chain-form `:has` would drop
        # the first one's chains.
        return nil if inner_has_chain
        inner_has_chain = chain
      else
        return nil
      end
    else
      return nil
    end
  end
  out = [flags, nth_a, nth_b, nth_type_a, nth_type_b]
  # Pad with empty arrays as needed so the optional indices line up.
  need_8 = inner_has_chain && !inner_has_chain.empty?
  need_7 = need_8 || !inner_not_has.empty?
  need_6 = need_7 || !inner_has.empty?
  need_5 = need_6 || !inner_not.empty?
  out << inner_not if need_5
  out << inner_has if need_6
  out << inner_not_has if need_7
  out << inner_has_chain if need_8
  out
end
|
|
2043
|
+
|
|
2044
|
+
# Compile a `:not(arg)` / `:has(arg)` payload into a list of leaf
# simple atoms, used to fill an inner pool on an inner simple atom.
#
# Each comma group of +arg+ must compile to exactly one atom. A pure
# `:is(...)` atom is unwrapped recursively and its alternatives are
# spliced in. Any other atom becomes `[tag, classes, id, attrs]`, with
# leaf pseudo data appended when it has pseudos; only pseudos accepted
# by native_leaf_pseudo_data are allowed, so there is no further
# `:not` / `:has` recursion.
#
# Returns nil when +arg+ is nil/empty, a group is not a single atom, a
# pseudo is rejected, a nested unwrap fails, or an ArgumentError is
# raised.
def self.inner_pool_for(arg)
  return nil if arg.nil? || arg.empty?

  Scrapetor::Dom::Selectors.selector_groups(arg).each_with_object([]) do |group, pool|
    compiled = Scrapetor::Selector.compile(group)
    return nil unless compiled.size == 1

    atom = compiled.first
    if pure_is_atom?(atom)
      unwrapped = inner_pool_for(atom.pseudos.first[1])
      return nil if unwrapped.nil?
      pool.concat(unwrapped)
      next
    end

    pseudo_data = nil
    if atom.pseudos && !atom.pseudos.empty?
      pseudo_data = native_leaf_pseudo_data(atom.pseudos)
      return nil if pseudo_data.nil?
    end

    descriptor = [atom.tag ? atom.tag.to_s : nil, atom.classes, atom.id, atom.attrs]
    descriptor << pseudo_data if pseudo_data
    pool << descriptor
  end
rescue ArgumentError
  nil
end
|
|
2074
|
+
|
|
2075
|
+
# `:not(:has(simple))` payload — used by native_inner_simple_pseudo to
# lift the nested negation into its inner_not_has pool. +arg+ is the
# argument of the `:not`. It must be a single group compiling to a
# single atom that carries nothing but one `:has(...)` pseudo-class (no
# tag, classes, id or attrs). The `:has` argument is compiled with
# inner_pool_for and that result is returned.
#
# Returns nil for any other shape, or when an ArgumentError is raised.
def self.parse_inner_not_has_form(arg)
  return nil if arg.nil? || arg.empty?

  alternatives = Scrapetor::Dom::Selectors.selector_groups(arg)
  return nil unless alternatives.size == 1

  compiled = Scrapetor::Selector.compile(alternatives.first)
  return nil unless compiled.size == 1

  wrapper = compiled.first
  return nil unless wrapper.pseudos && wrapper.pseudos.size == 1

  pseudo_name, has_arg, pseudo_element = wrapper.pseudos.first
  return nil if pseudo_element || pseudo_name != "has"

  undecorated = !wrapper.tag && wrapper.classes.empty? && !wrapper.id && wrapper.attrs.empty?
  return nil unless undecorated

  inner_pool_for(has_arg)
rescue ArgumentError
  nil
end
|
|
2092
|
+
|
|
2093
|
+
# Like native_pseudo_data, but restricted to leaf pseudos: flag
# pseudos from NATIVE_PSEUDO_FLAGS and the nth family from
# NATIVE_NTH_BITS. Any pseudo that needs a nested sub-selector
# (`:not` / `:is` / `:has`), any unknown pseudo and any `::`
# pseudo-element is rejected.
#
# +pseudos+ is a list of `[name, arg, double_colon]` triples.
#
# Returns `[flags, nth_a, nth_b, nth_type_a, nth_type_b]`, or nil when
# the list cannot be encoded. Besides the rejections above, nil is
# returned when two pseudos need the same nth slot (`:nth-child` with
# `:nth-last-child`, or `:nth-of-type` with `:nth-last-of-type`): there
# is a single (a, b) pair per family, so the second would overwrite the
# first while both flag bits stayed set.
def self.native_leaf_pseudo_data(pseudos)
  flags = 0
  nth_pair = nil
  nth_type_pair = nil
  pseudos.each do |name, arg, double_colon|
    return nil if double_colon
    if (bit = NATIVE_PSEUDO_FLAGS[name])
      flags |= bit
    elsif (bit = NATIVE_NTH_BITS[name])
      a, b = Scrapetor::Selector.parse_nth(arg)
      return nil unless a
      if name == "nth-of-type" || name == "nth-last-of-type"
        return nil if nth_type_pair
        nth_type_pair = [a, b]
      else
        return nil if nth_pair
        nth_pair = [a, b]
      end
      flags |= bit
    else
      return nil
    end
  end
  # Unused slots keep the historical default of (0, 0).
  [flags, *(nth_pair || [0, 0]), *(nth_type_pair || [0, 0])]
end
|
|
2120
|
+
|
|
2121
|
+
# Split a CSS selector on top-level commas, i.e. commas outside
# `[...]`, outside `(...)` and outside quoted strings.
#
# Characters inside a single- or double-quoted string, and any
# character preceded by a backslash, are copied through verbatim and
# never affect nesting. Without that, a selector such as
# `a[title="x[y"], b` leaves the bracket counter stuck above zero and
# the following comma is never treated as a separator.
#
# Unbalanced closing brackets/parens are ignored rather than driving
# the counters negative.
#
# Returns an Array of whitespace-stripped group strings; empty groups
# are dropped, so an empty input yields [].
def self.split_selector_groups(s)
  groups = []
  buf = +""
  bracket_depth = 0
  paren_depth = 0
  open_quote = nil
  escaped = false

  s.each_char do |ch|
    if escaped
      # Whatever follows a backslash is literal.
      escaped = false
    elsif ch == "\\"
      escaped = true
    elsif open_quote
      open_quote = nil if ch == open_quote
    else
      case ch
      when '"', "'" then open_quote = ch
      when "[" then bracket_depth += 1
      when "]" then bracket_depth -= 1 if bracket_depth.positive?
      when "(" then paren_depth += 1
      when ")" then paren_depth -= 1 if paren_depth.positive?
      when ","
        if bracket_depth.zero? && paren_depth.zero?
          groups << buf.strip
          buf = +""
          next
        end
      end
    end
    buf << ch
  end

  groups << buf.strip
  groups.reject(&:empty?)
end
|
|
2147
|
+
|
|
2148
|
+
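# NOTE: this paragraph documents `heterogeneous_pseudo_groups?`, which
# is defined after `compile_extract_fields`; it does not describe
# `compile_extract_fields` itself.
#
# Returns true if the comma-separated selector has groups with
# different pseudo-element shapes — e.g. `.a > ::text, .b` — so
# callers can split + peel per-group instead of one shared peel.
#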
|
|
2151
|
+
# Compile a {key => selector} fields map into four parallel Arrays
# `[keys, plans, kinds, args]` for the native extract entry points.
#
# Each selector (converted with #to_s unless already a String) is run
# through peel_pseudo_element, which yields the selector without its
# trailing pseudo-element, the pseudo-element kind and its argument.
#
# kinds:
#   0 = Element (no pseudo-element)
#   1 = ::text  (peeled kind :text or :text_approx)
#   2 = ::attr  (peeled kind :attr; the matching arg is the attribute
#               name)
#
# Special cases:
# * an empty selector with no pseudo-element is compiled as "*";
# * an empty selector with kind :attr or :direct_attr is a bare
#   `::attr(name)` on the scope element itself: plan is nil, kind is 2
#   and no plan is compiled.
#
# +wrapper+ must respond to `compiled_plan(selector)`.
#
# Returns nil — so the caller can take its slow path — when a peeled
# selector still contains a comma, when `compiled_plan` returns a falsy
# value, or when the pseudo-element kind is anything not listed above
# (e.g. :direct_text, or :direct_attr on a non-empty selector).
def self.compile_extract_fields(fields, wrapper)
  keys, plans, kinds, args = [], [], [], []

  fields.each_pair do |key, sel|
    keys << key
    source = sel.is_a?(String) ? sel : sel.to_s
    stripped, kind, arg = peel_pseudo_element(source)
    stripped = "*" if stripped.empty? && kind.nil?

    if stripped.empty? && (kind == :attr || kind == :direct_attr)
      plans << nil
      kinds << 2
      args << arg.to_s
      next
    end

    return nil if stripped.include?(",")

    plan = wrapper.compiled_plan(stripped)
    return nil unless plan

    kind_code, kind_arg =
      case kind
      when :text, :text_approx then [1, ""]
      when :attr then [2, arg.to_s]
      when nil then [0, ""]
      end
    return nil unless kind_code # :direct_text / :direct_attr / unsupported

    plans << plan
    kinds << kind_code
    args << kind_arg
  end

  [keys, plans, kinds, args]
end
|
|
2194
|
+
|
|
2195
|
+
# Memo for heterogeneous_pseudo_groups?, keyed by selector string. The
# Hash is mutated in place by that method.
HET_PSEUDO_CACHE = {}
# Entry count at which the oldest cached entry is evicted before a new
# one is stored.
HET_PSEUDO_CACHE_CAP = 1024

# Returns true if the comma-separated selector +s+ has groups whose
# peeled pseudo-element kinds differ — e.g. `.a > ::text, .b` — and
# false otherwise. Groups come from split_selector_groups and the kind
# is the second element of peel_pseudo_element's result.
#
# Results are memoised in HET_PSEUDO_CACHE; once the cache holds
# HET_PSEUDO_CACHE_CAP entries the first-inserted one is dropped before
# the new result is stored.
def self.heterogeneous_pseudo_groups?(s)
  return HET_PSEUDO_CACHE[s] if HET_PSEUDO_CACHE.key?(s)

  distinct_kinds = split_selector_groups(s).map { |group| peel_pseudo_element(group)[1] }.uniq
  verdict = distinct_kinds.size > 1

  HET_PSEUDO_CACHE.shift if HET_PSEUDO_CACHE.size >= HET_PSEUDO_CACHE_CAP
  HET_PSEUDO_CACHE[s] = verdict
  verdict
end
|
|
2207
|
+
|
|
2208
|
+
# Matches an `:is(` / `:matches(` / `:where(` token that starts an
# atom: at the very beginning of the string, or directly after
# whitespace, a combinator (`>`, `+`, `~`) or a comma.
IS_AT_BOUNDARY_RE = /
  (?:\A|(?<=[\s>+~,]))
  :(?:is|matches|where)\(
/x.freeze

# `:is(A, B C)`-distribution.
#
# Finds the first `:is(...)` / `:matches(...)` / `:where(...)` token
# matched by IS_AT_BOUNDARY_RE that is not nested inside an open
# `:not(`, splits its argument into alternatives and returns one group
# string per alternative with the alternative substituted for the whole
# `:is(...)` token. Each produced string is expanded again recursively,
# so several `:is(...)` tokens in one group multiply out.
#
# Without this rewrite a selector like `:is(aside, main .x) .y` cannot
# be represented natively, because the alternatives of `:is` must be
# single atoms there.
#
# By default the rewrite only happens when at least one alternative
# contains whitespace or a combinator (is multi-atom). With
# `force: true` single-atom alternatives are distributed too.
#
# A token inside `:not(...)` is deliberately skipped: the produced
# groups are alternatives (a union), and
# `:not(A, :is(B, C))` is not equivalent to `:not(A, B), :not(A, C)`.
#
# Returns `[group_str]` (single element) when no rewrite applies: no
# eligible token, unbalanced parentheses, fewer than two alternatives,
# or nothing to distribute without +force+. Callers treat that as a
# no-op.
def self.expand_is_groups(group_str, force: false)
  # True when +prefix+ ends inside a still-open `not(`.
  inside_not = lambda do |prefix|
    open_functions = []
    prefix.each_char.with_index do |ch, idx|
      if ch == "("
        open_functions << prefix[0...idx][/[\w-]+\z/]
      elsif ch == ")"
        open_functions.pop
      end
    end
    open_functions.include?("not")
  end

  m = nil
  search_from = 0
  while (candidate = IS_AT_BOUNDARY_RE.match(group_str, search_from))
    unless inside_not.call(group_str[0...candidate.begin(0)])
      m = candidate
      break
    end
    search_from = candidate.begin(0) + 1
  end
  return [group_str] unless m

  paren_start = m.end(0) - 1 # position of '('
  depth = 1
  i = paren_start + 1
  len = group_str.length
  while i < len && depth > 0
    ch = group_str[i]
    if ch == "("
      depth += 1
    elsif ch == ")"
      depth -= 1
    end
    i += 1
  end
  return [group_str] if depth != 0

  paren_end = i - 1 # position of matching ')'
  inner = group_str[(paren_start + 1)...paren_end]
  alts = split_selector_groups(inner)
  return [group_str] if alts.size < 2

  # Only distribute when an alternative is multi-atom, unless forced.
  multi = alts.any? { |alt| alt.match?(/[\s>+~]/) }
  return [group_str] unless multi || force

  prefix = group_str[0...m.begin(0)]
  suffix = group_str[(paren_end + 1)..]
  alts.flat_map do |alt|
    merged = "#{prefix}#{alt}#{suffix}".strip
    expand_is_groups(merged, force: force)
  end
end
|
|
2257
|
+
end
|
|
2258
|
+
end
|