scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# Selector compiler + executor.
|
|
5
|
+
#
|
|
6
|
+
# The plan is the architectural win that lets Phase 1 beat Nokogiri on
|
|
7
|
+
# repeated-extraction workloads without a native backend: every selector
|
|
8
|
+
# compiles into a list of "atoms" (each `tag.class#id[attr=value]` plus
|
|
9
|
+
# optional pseudo-classes) and a combinator linking it to the previous
|
|
10
|
+
# atom. Execution evaluates the rightmost atom first, sources candidates
|
|
11
|
+
# from structural indexes (O(1) per class/id), then walks ancestor
|
|
12
|
+
# chains backward to verify the rest.
|
|
13
|
+
module Selector
|
|
14
|
+
Atom = Struct.new(:tag, :classes, :id, :attrs, :combinator, :pseudos)
|
|
15
|
+
|
|
16
|
+
# Attribute selector: `[name]`, `[name=value]`, `[name*='v']`, etc.
# The value is captured in one of three slots depending on the quote
# style so an attribute value like `[class*="L'appareil"]` (single
# quote inside double-quoted) parses cleanly — a shared `[^"']*` value
# class excludes both quotes and breaks on every apostrophe inside a
# double-quoted value.
#
# Whitespace is tolerated after `[`, around the operator and before `]`,
# and the case flag may follow a quoted value with no separating space
# (`[type="submit"i]`). Each run of whitespace is consumed by exactly one
# place in the pattern so the regex cannot backtrack over it repeatedly.
#
#   m[1]: attribute name (Unicode-aware)
#   m[2]: operator (= / *= / ^= / $= / ~= / |=), nil for bare `[name]`
#   m[3]: value inside double quotes (allows ', escaped chars)
#   m[4]: value inside single quotes (allows ", escaped chars)
#   m[5]: bare unquoted value
#   m[6]: case flag letter (`i` or `s`, either letter case), nil if absent
ATTR_RE = /
  \A\[\s*
  ([\w:\-\u{0080}-\u{10FFFF}]+)\s*
  (?:
    ([*^$~|]?=)\s*
    (?:
      "((?:[^"\\]|\\.)*)"
      | '((?:[^'\\]|\\.)*)'
      | ([^\]\s]+)
    )\s*
  )?
  (?:([isIS])\s*)?
  \]
/x.freeze
|
|
41
|
+
PSEUDO_NAME_RE = /\A([a-zA-Z][\w-]*)/.freeze
|
|
42
|
+
|
|
43
|
+
# Pseudo-classes Scrapetor can evaluate on a node. Pseudo-elements
|
|
44
|
+
# (`::text`, `::attr(name)`) live on the atom too but are post-
|
|
45
|
+
# processed at the css() boundary, not used for matching.
|
|
46
|
+
KNOWN_PSEUDO_CLASSES = %w[
|
|
47
|
+
not has is matches where
|
|
48
|
+
first-child last-child only-child
|
|
49
|
+
first-of-type last-of-type only-of-type
|
|
50
|
+
nth-child nth-last-child nth-of-type nth-last-of-type
|
|
51
|
+
empty root scope
|
|
52
|
+
checked disabled enabled
|
|
53
|
+
any-link link visited target focus hover active
|
|
54
|
+
required optional read-only read-write placeholder-shown
|
|
55
|
+
].freeze
|
|
56
|
+
|
|
57
|
+
# Pseudo-elements we recognise (Scrapy/Parsel-style). Stored on the
|
|
58
|
+
# last atom; consumed by the public css() entry points.
|
|
59
|
+
KNOWN_PSEUDO_ELEMENTS = %w[text attr first-letter first-line before after].freeze
|
|
60
|
+
|
|
61
|
+
# Compile one selector (no top-level commas) into an execution plan.
#
# The plan is an Array of atoms in source order. Each atom is produced by
# take_atom and carries the combinator that links it to the atom on its
# left (nil for the first atom).
#
# selector - anything responding to #to_s; surrounding whitespace is ignored.
#
# Returns the Array of atoms.
# Raises ArgumentError when the selector is empty, when an atom or
# combinator cannot be parsed, or when the selector ends in a combinator
# with nothing after it (e.g. "div >").
def self.compile(selector)
  sel = selector.to_s.strip
  # CSS Selectors Level 4 scope-relative selector: a leading `>`/`+`/`~`
  # is shorthand for `:scope <combinator> rest`. Desugar it here so the
  # rest of the compiler only ever sees a selector that starts with an atom.
  sel = ":scope " + sel if sel.start_with?(">", "+", "~")

  atoms = []
  remainder = sel
  combinator = nil
  until remainder.empty?
    atom, remainder = take_atom(remainder, combinator)
    atoms << atom
    break if remainder.empty?

    combinator, remainder = take_combinator(remainder)
    # An explicit combinator must be followed by another atom; without this
    # check "div >" would quietly compile to the same plan as "div".
    if combinator && remainder.empty?
      raise ArgumentError, "Selector ends with a dangling combinator: #{sel}"
    end
  end
  raise ArgumentError, "Empty selector" if atoms.empty?

  atoms
end
|
|
84
|
+
|
|
85
|
+
# Identifier characters. CSS Selectors Level 3 §10.1 allows non-ASCII
|
|
86
|
+
# (>= U+00A0) in identifiers in addition to [a-zA-Z0-9_-]. Real-world
|
|
87
|
+
# class names like `caractéristiquesPrincipalesDuProduit` (eBay FR)
|
|
88
|
+
# or Cyrillic/CJK class names need the Unicode-aware character set
|
|
89
|
+
# — `\w` on its own matches ASCII only.
|
|
90
|
+
IDENT_TAG_RE = /\A([a-zA-Z][\w\-\u{0080}-\u{10FFFF}]*|\*)/.freeze
|
|
91
|
+
IDENT_CLASS_RE = /\A\.([\w\-\u{0080}-\u{10FFFF}]+)/.freeze
|
|
92
|
+
IDENT_ID_RE = /\A#([\w\-\u{0080}-\u{10FFFF}]+)/.freeze
|
|
93
|
+
|
|
94
|
+
# Parse one compound selector ("atom") off the front of +s+.
#
# An atom is an optional tag name or `*`, followed by any mix of `.class`,
# `#id`, `[attr...]` and `:pseudo` / `::pseudo` parts. Parsing stops at the
# first character that starts none of those.
#
# s          - the unparsed selector text.
# combinator - stored on the returned atom as the link to the previous atom.
#
# Returns [atom, rest] where rest is the text left after the atom.
# Raises ArgumentError when a part is malformed or nothing was consumed.
def self.take_atom(s, combinator)
  atom = Atom.new(nil, [], nil, [], combinator, nil)
  rest = s
  universal = false

  if (tag_match = rest.match(IDENT_TAG_RE))
    if tag_match[1] == "*"
      universal = true
    else
      atom.tag = tag_match[1].downcase.to_sym
    end
    rest = rest[tag_match[0].size..]
  end

  # rest[0] is nil once the text is exhausted, which ends the loop.
  while (lead = rest[0])
    if lead == "."
      found = rest.match(IDENT_CLASS_RE)
      raise ArgumentError, "Bad class selector: #{s}" unless found

      atom.classes << found[1]
      rest = rest[found[0].size..]
    elsif lead == "#"
      found = rest.match(IDENT_ID_RE)
      raise ArgumentError, "Bad id selector: #{s}" unless found

      atom.id = found[1]
      rest = rest[found[0].size..]
    elsif lead == "["
      found = rest.match(ATTR_RE)
      raise ArgumentError, "Bad attribute selector: #{s}" unless found

      # Captures 3/4/5 are the double-quoted, single-quoted and bare value;
      # at most one of them is set.
      value = found[3] || found[4] || found[5]
      flag = found[6]
      insensitive = flag && flag.downcase == "i"
      atom.attrs << [found[1], found[2], value, insensitive]
      rest = rest[found[0].size..]
    elsif lead == ":"
      pseudo_name, pseudo_arg, is_element, rest = take_pseudo(rest)
      (atom.pseudos ||= []) << [pseudo_name, pseudo_arg, is_element]
    else
      break
    end
  end

  has_content = universal || atom.tag || !atom.classes.empty? || atom.id ||
                !atom.attrs.empty? || !(atom.pseudos.nil? || atom.pseudos.empty?)
  raise ArgumentError, "Cannot parse selector atom near: #{s}" unless has_content

  [atom, rest]
end
|
|
147
|
+
|
|
148
|
+
# Consume a pseudo-class (`:name`) or pseudo-element (`::name`) from the
# front of +s+, together with its parenthesised argument if one follows.
#
# The argument scan balances parentheses, ignores parentheses that sit
# inside `[...]` or inside a quoted string, and honours backslash escapes
# inside quotes, so `:has(div:not(.x))` is taken as a single argument.
# When the closing parenthesis is missing, everything after the `(` becomes
# the argument.
#
# Returns [name, arg, double_colon, rest]: the lower-cased name, the raw
# argument text (nil without parentheses), whether `::` was used, and the
# unparsed remainder.
# Raises ArgumentError when no valid name follows the colon(s).
def self.take_pseudo(s)
  double_colon = s.start_with?("::")
  after_colons = s[(double_colon ? 2 : 1)..]
  name_match = after_colons.match(PSEUDO_NAME_RE)
  raise ArgumentError, "Bad pseudo: #{s}" unless name_match

  name = name_match[1].downcase
  remainder = after_colons[name_match[0].size..]
  return [name, nil, double_colon, remainder] unless remainder.start_with?("(")

  open_parens = 1
  open_brackets = 0
  in_quote = nil
  pos = 1
  limit = remainder.length
  while pos < limit && open_parens > 0
    ch = remainder[pos]
    step = 1
    if in_quote
      if ch == "\\" && pos + 1 < limit
        # Skip the escaped character as well.
        step = 2
      elsif ch == in_quote
        in_quote = nil
      end
    else
      case ch
      when "\"", "'" then in_quote = ch
      when "[" then open_brackets += 1
      when "]" then open_brackets -= 1 if open_brackets > 0
      when "(" then open_parens += 1 if open_brackets == 0
      when ")" then open_parens -= 1 if open_brackets == 0
      end
    end
    pos += step
  end

  if open_parens > 0
    # Unbalanced: treat the whole tail as the argument.
    [name, remainder[1..], double_colon, ""]
  else
    # pos sits one past the matching ")".
    [name, remainder[1...(pos - 1)], double_colon, remainder[pos..]]
  end
end
|
|
194
|
+
|
|
195
|
+
# Consume the combinator that separates two atoms.
#
# Whitespace here means the CSS set: space, tab, line feed, carriage
# return and form feed. Recognising CR and FF matters for selectors that
# arrive with Windows line endings (e.g. a multi-line selector read from a
# CRLF file), which would otherwise be rejected as unparseable.
#
# s - the selector text immediately after an atom.
#
# Returns [combinator, rest] where combinator is :child (`>`), :adj (`+`),
# :gen (`~`) or :descendant (whitespace only), and rest starts at the next
# atom. Returns [nil, ""] when only whitespace remains.
# Raises ArgumentError when the text starts with neither whitespace nor a
# combinator character.
def self.take_combinator(s)
  leading_ws = /\A[ \t\n\r\f]+/
  rest = s.sub(leading_ws, "")
  had_ws = rest.length < s.length
  return [nil, ""] if rest.empty?

  explicit =
    case rest[0]
    when ">" then :child
    when "+" then :adj
    when "~" then :gen
    end

  if explicit
    [explicit, rest[1..].sub(leading_ws, "")]
  elsif had_ws
    [:descendant, rest]
  else
    raise ArgumentError, "Cannot parse combinator near: #{rest}"
  end
end
|
|
229
|
+
|
|
230
|
+
# Parse an `an+b` formula used by :nth-child(...) and friends.
#
# Accepted forms (case-insensitive, surrounding whitespace ignored):
# `odd`, `even`, a signed integer (`3`, `-2`), and `an`, `an+b`, `an-b`
# where `a` may be omitted or be a bare sign (`n`, `-n+3`). Whitespace is
# allowed only around the sign that separates the `n` term from the
# offset (`2n + 1`); whitespace anywhere else makes the formula invalid,
# so `1 2` is rejected instead of being read as `12`.
#
# Returns [a, b] or nil if the argument can't be parsed.
def self.parse_nth(arg)
  return nil if arg.nil?

  s = arg.to_s.strip.downcase
  return nil if s.empty?
  return [2, 1] if s == "odd"
  return [2, 0] if s == "even"
  return [0, s.to_i] if s.match?(/\A[+-]?\d+\z/)

  m = s.match(/\A([+-]?\d*)n(?:\s*([+-])\s*(\d+))?\z/)
  return nil unless m

  a = case m[1]
      when "", "+" then 1
      when "-" then -1
      else m[1].to_i
      end
  b = m[2] ? "#{m[2]}#{m[3]}".to_i : 0
  [a, b]
end
|
|
253
|
+
|
|
254
|
+
# Given integer coefficients (a, b) and a 1-based position idx, true if
# there is a non-negative integer k with idx == a*k + b.
def self.nth_matches?(a, b, idx)
  offset = idx - b
  # With a == 0 the formula is the constant b.
  return offset.zero? if a.zero?
  # Otherwise k = offset / a must be a whole number ...
  return false unless (offset % a).zero?

  # ... and must not be negative.
  (offset / a) >= 0
end
|
|
263
|
+
|
|
264
|
+
# ----- Execution (used by the Dom::Document selector path). -----
|
|
265
|
+
|
|
266
|
+
# Run a compiled plan and return the matching nodes.
#
# Candidates are collected for the rightmost atom only; for plans with
# more than one atom each candidate is then checked by walking the
# remaining atoms right-to-left.
#
# doc   - passed through to candidates_for_atom.
# plan  - Array of atoms as returned by compile.
# scope - passed through to the candidate lookup and the backward walk.
#
# Returns an Array of nodes ([] for an empty plan).
def self.execute(doc, plan, scope)
  return [] if plan.empty?

  anchors = candidates_for_atom(doc, plan.last, scope)
  return anchors if plan.size == 1

  left_of_anchor = plan.size - 2
  anchors.select { |node| match_chain_backwards?(node, plan, left_of_anchor, scope) }
end
|
|
275
|
+
|
|
276
|
+
# Collect the nodes that satisfy a single atom, using the document's
# indexes to keep the list of nodes to test short.
#
# * An atom with an id is resolved through doc.id_index: the result is
#   that one node if it is in scope and matches the atom, otherwise [].
# * Otherwise the smallest of the class/tag index entries named by the
#   atom is used as the pool.
# * With no id, class or tag the pool is doc.all_elements when scope is a
#   Dom::Document, else scope.css("*").
#
# Every pool member is re-checked with atom_matches? and in_scope?.
#
# Returns an Array of nodes.
def self.candidates_for_atom(doc, atom, scope)
  if atom.id
    hit = doc.id_index[atom.id]
    return [] if hit.nil?
    return [] unless in_scope?(hit, scope)

    return atom_matches?(atom, hit) ? [hit] : []
  end

  pools = atom.classes.map { |klass| doc.class_index[klass] || [] }
  pools << (doc.tag_index[atom.tag] || []) if atom.tag

  pool =
    if !pools.empty?
      pools.min_by(&:size)
    elsif defined?(Dom::Document) && scope.is_a?(Dom::Document)
      doc.all_elements
    else
      scope.css("*").to_a
    end

  pool.select { |el| atom_matches?(atom, el) && in_scope?(el, scope) }
end
|
|
302
|
+
|
|
303
|
+
# Test whether +node+ satisfies every part of +atom+: tag, classes, id,
# attribute selectors and pseudo-classes.
#
# atom - an Atom; attrs entries are [name, operator, value, ci] and
#        pseudos entries are [name, arg, double_colon].
# node - candidate; anything that is not an element never matches.
#
# Attribute operators follow the Selectors rules, including the rule that
# `^=`, `$=`, `*=` and `~=` with an empty value match nothing. (Ruby's
# String#include? / #start_with? / #end_with? all return true for "", so
# the empty operand has to be rejected explicitly.)
#
# Returns true or false.
def self.atom_matches?(atom, node)
  return false unless node.respond_to?(:element?) && node.element?
  return false if atom.tag && node.name.to_sym != atom.tag

  unless atom.classes.empty?
    class_attr = node["class"]
    return false if class_attr.nil?

    tokens = class_attr.split(/\s+/)
    return false unless atom.classes.all? { |c| tokens.include?(c) }
  end
  return false if atom.id && node["id"] != atom.id

  atom.attrs.each do |name, op, val, ci|
    actual = node[name]
    expected = val
    # The `i` flag compares both sides lower-cased.
    if ci && actual && expected
      actual = actual.downcase
      expected = expected.downcase
    end

    case op
    when nil
      return false if actual.nil?
    when "="
      return false unless actual == expected
    when "*=", "^=", "$=", "~="
      return false if actual.nil? || expected.nil? || expected.empty?

      hit = case op
            when "*=" then actual.include?(expected)
            when "^=" then actual.start_with?(expected)
            when "$=" then actual.end_with?(expected)
            else actual.split(/\s+/).include?(expected)
            end
      return false unless hit
    when "|="
      return false if actual.nil? || (actual != expected && !actual.start_with?("#{expected}-"))
    end
  end

  atom.pseudos&.each do |name, arg, double_colon|
    next if double_colon # pseudo-elements aren't matchers

    return false unless pseudo_matches?(node, name, arg)
  end
  true
end
|
|
344
|
+
|
|
345
|
+
# Evaluate one pseudo-class against +node+.
#
# node - the candidate element.
# name - pseudo-class name without the colon (take_pseudo lower-cases it).
# arg  - raw text between the parentheses, or nil.
#
# Returns a truthy value when the pseudo-class holds for the node. Any
# name without a branch below does not match.
def self.pseudo_matches?(node, name, arg)
  case name
  when "not"
    # An empty :not() excludes nothing.
    return true if arg.nil? || arg.empty?

    # The node passes only if none of the listed selectors matches it.
    Scrapetor::Dom::Selectors.selector_groups(arg).none? do |group|
      matches_chain_at_node?(node, compile(group))
    end
  when "is", "matches", "where"
    return false if arg.nil? || arg.empty?

    Scrapetor::Dom::Selectors.selector_groups(arg).any? do |group|
      matches_chain_at_node?(node, compile(group))
    end
  when "has"
    return false if arg.nil? || arg.empty?

    has_descendant_matching?(node, arg)
  when "first-child"
    node.respond_to?(:previous_element_sibling) && node.previous_element_sibling.nil?
  when "last-child"
    node.respond_to?(:next_element_sibling) && node.next_element_sibling.nil?
  when "only-child"
    node.respond_to?(:next_element_sibling) &&
      node.previous_element_sibling.nil? && node.next_element_sibling.nil?
  when "first-of-type" then first_of_type?(node)
  when "last-of-type" then last_of_type?(node)
  when "only-of-type" then first_of_type?(node) && last_of_type?(node)
  when "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type"
    # The name itself says whether to count by type and/or from the end.
    nth_position_match?(node, arg,
                        by_type: name.end_with?("of-type"),
                        reverse: name.start_with?("nth-last"))
  when "empty"
    return true unless node.respond_to?(:children)

    # Empty means: no element children and no text child with content.
    node.children.none? do |child|
      child.element? || (child.respond_to?(:text?) && child.text? && !child.text.to_s.empty?)
    end
  when "root"
    up = node.respond_to?(:parent) ? node.parent : nil
    up.nil? || (defined?(Scrapetor::Dom::Document) && up.is_a?(Scrapetor::Dom::Document))
  when "scope" then true
  when "checked", "disabled", "required" then truthy_attr?(node, name)
  when "read-only" then truthy_attr?(node, "readonly")
  when "enabled" then !truthy_attr?(node, "disabled")
  when "optional" then !truthy_attr?(node, "required")
  when "read-write" then !truthy_attr?(node, "readonly")
  when "any-link", "link"
    link_tag = node.name == "a" || node.name == "area"
    link_tag && !node["href"].nil?
  else
    # Unknown pseudo-class: fail closed (don't match) so the user
    # sees a missing-result rather than a silent wrong-result.
    false
  end
end
|
|
418
|
+
|
|
419
|
+
# True when the attribute +name+ is present on +node+ and its value is not
# the literal string "false". A present attribute with any other value,
# including the empty string, counts as set.
def self.truthy_attr?(node, name)
  value = node[name]
  return false if value.nil?

  value != "false"
end
|
|
423
|
+
|
|
424
|
+
# True when no earlier sibling of +node+ is an element with the same name.
# Nodes that do not expose #previous_sibling are treated as first.
def self.first_of_type?(node)
  return true unless node.respond_to?(:previous_sibling)

  cursor = node
  while (cursor = cursor.previous_sibling)
    same_type = cursor.respond_to?(:element?) && cursor.element? && cursor.name == node.name
    return false if same_type
  end
  true
end
|
|
433
|
+
|
|
434
|
+
# True when no later sibling of +node+ is an element with the same name.
# Nodes that do not expose #next_sibling are treated as last.
def self.last_of_type?(node)
  return true unless node.respond_to?(:next_sibling)

  cursor = node
  while (cursor = cursor.next_sibling)
    same_type = cursor.respond_to?(:element?) && cursor.element? && cursor.name == node.name
    return false if same_type
  end
  true
end
|
|
443
|
+
|
|
444
|
+
# Shared implementation of :nth-child, :nth-last-child, :nth-of-type and
# :nth-last-of-type.
#
# node    - the element being tested.
# arg     - the raw `an+b` argument, handed to parse_nth.
# by_type - count only siblings whose name equals the node's name.
# reverse - count from the last sibling instead of the first.
#
# Returns false when the formula cannot be parsed, the node has no parent,
# or the node is not found among its parent's element children; otherwise
# the result of nth_matches? for the node's 1-based position.
def self.nth_position_match?(node, arg, by_type:, reverse:)
  coefficients = parse_nth(arg)
  return false unless coefficients

  parent = node.parent if node.respond_to?(:parent)
  return false if parent.nil?

  pool = parent.respond_to?(:children) ? parent.children.select { |c| c.respond_to?(:element?) && c.element? } : []
  pool = pool.select { |el| el.name == node.name } if by_type

  # Counting from the end is done arithmetically instead of reversing the
  # list: the element at forward index j is number (size - j) from the end.
  slot = reverse ? pool.rindex { |el| el.equal?(node) } : pool.index { |el| el.equal?(node) }
  return false unless slot

  position = reverse ? pool.size - slot : slot + 1
  nth_matches?(coefficients[0], coefficients[1], position)
end
|
|
461
|
+
|
|
462
|
+
# Does +node+ itself match the whole plan? The rightmost atom is tested
# against the node, then the remaining atoms are verified right-to-left
# with no scope restriction. Used by :not, :is and :has.
#
# Returns false for an empty plan.
def self.matches_chain_at_node?(node, plan)
  return false if plan.empty? || !atom_matches?(plan.last, node)

  plan.size == 1 || match_chain_backwards?(node, plan, plan.size - 2, nil)
end
|
|
471
|
+
|
|
472
|
+
# Implementation of `:has(...)`: true when at least one selector in the
# comma-separated list finds a match relative to +node+.
#
# Each group is handled according to its leading character:
#   `> sel` - tested against node's element children,
#   `+ sel` - tested against node's next element sibling,
#   `~ sel` - tested against every following element sibling,
#   other   - tested against every element descendant.
#
# Scope-relative groups are dispatched on the combinator directly rather
# than being compiled with a leading combinator.
#
# Returns true or false.
def self.has_descendant_matching?(node, selector_str)
  Scrapetor::Dom::Selectors.selector_groups(selector_str).any? do |raw_group|
    group = raw_group.strip
    case group[0]
    when ">"
      plan = compile(group[1..].lstrip)
      node.children.any? do |child|
        child.respond_to?(:element?) && child.element? && matches_chain_at_node?(child, plan)
      end
    when "+"
      plan = compile(group[1..].lstrip)
      neighbour = node.respond_to?(:next_element_sibling) ? node.next_element_sibling : nil
      neighbour ? matches_chain_at_node?(neighbour, plan) : false
    when "~"
      plan = compile(group[1..].lstrip)
      later = node.respond_to?(:next_element_sibling) ? node.next_element_sibling : nil
      # Advance until a sibling matches or the siblings run out.
      later = later.next_element_sibling while later && !matches_chain_at_node?(later, plan)
      later ? true : false
    else
      plan = compile(group)
      found = false
      walk_descendants(node) do |descendant|
        next unless matches_chain_at_node?(descendant, plan)

        found = true
        break
      end
      found
    end
  end
end
|
|
512
|
+
|
|
513
|
+
# Yield every element descendant of +node+ in document (pre-order) order.
# Children that are not elements are skipped, and so is everything below
# them. Does nothing when +node+ does not respond to #children.
def self.walk_descendants(node, &block)
  return unless node.respond_to?(:children)

  node.children.each do |child|
    if child.respond_to?(:element?) && child.element?
      yield child
      walk_descendants(child, &block)
    end
  end
end
|
|
521
|
+
|
|
522
|
+
# Verify the atoms to the left of an already-matched node.
#
# node  - the node that matched plan[idx + 1].
# plan  - Array of atoms.
# idx   - index of the atom to satisfy next; a negative idx means the
#         whole chain has been satisfied.
# scope - every node considered on the way must pass in_scope?.
#
# The combinator stored on plan[idx + 1] decides where to look for a node
# matching plan[idx]: the parent (:child), any element ancestor
# (:descendant or nil), the previous element sibling (:adj) or any earlier
# element sibling (:gen). For :descendant and :gen every candidate is
# tried, so a candidate that fails further left does not end the search.
#
# Returns true or false; an unrecognised combinator yields false.
def self.match_chain_backwards?(node, plan, idx, scope)
  return true if idx < 0

  left_atom = plan[idx]
  # A candidate is accepted when it is in scope, matches the atom, and
  # the rest of the chain can be satisfied starting from it.
  accepts = lambda do |candidate|
    in_scope?(candidate, scope) && atom_matches?(left_atom, candidate) &&
      match_chain_backwards?(candidate, plan, idx - 1, scope)
  end

  case plan[idx + 1].combinator
  when :child
    up = node.parent
    return false if up.nil?
    return false unless up.respond_to?(:element?) && up.element?

    accepts.call(up)
  when :descendant, nil
    up = node.parent
    while up && up.respond_to?(:element?) && up.element?
      return true if accepts.call(up)

      up = up.parent
    end
    false
  when :adj
    before = node.respond_to?(:previous_element_sibling) ? node.previous_element_sibling : nil
    before.nil? ? false : accepts.call(before)
  when :gen
    before = node.respond_to?(:previous_element_sibling) ? node.previous_element_sibling : nil
    while before
      return true if accepts.call(before)

      before = before.previous_element_sibling
    end
    false
  else
    false
  end
end
|
|
564
|
+
|
|
565
|
+
# True when +node+ lies inside +scope+.
#
# A nil scope or a Dom::Document scope accepts every node. Otherwise the
# node is in scope when it equals the scope or when one of its ancestors
# (reached through #parent) does.
def self.in_scope?(node, scope)
  return true if scope.nil?
  return true if defined?(Dom::Document) && scope.is_a?(Dom::Document)

  cursor = node
  # Climb until the scope is reached or the ancestor chain ends.
  cursor = cursor.parent while cursor && cursor != scope
  cursor ? true : false
end
|
|
575
|
+
end
|
|
576
|
+
end
|