scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,576 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Selector compiler + executor.
5
+ #
6
+ # The plan is the architectural win that lets Phase 1 beat Nokogiri on
7
+ # repeated-extraction workloads without a native backend: every selector
8
+ # compiles into a list of "atoms" (each `tag.class#id[attr=value]` plus
9
+ # optional pseudo-classes) and a combinator linking it to the previous
10
+ # atom. Execution evaluates the rightmost atom first, sources candidates
11
+ # from structural indexes (O(1) per class/id), then walks ancestor
12
+ # chains backward to verify the rest.
13
+ module Selector
14
# One compound selector of a compiled plan.
#
#   tag        - downcased element name as a Symbol, or nil when the atom
#                has no type selector (or used the universal `*`)
#   classes    - Array of class-name Strings, all of which must be present
#   id         - id String, or nil
#   attrs      - Array of [name, operator, value, case_insensitive] entries
#   combinator - relation to the atom on the left (:child, :adj, :gen,
#                :descendant), nil for the first atom of a plan
#   pseudos    - nil, or an Array of [name, argument, double_colon] entries
Atom = Struct.new(
  :tag,
  :classes,
  :id,
  :attrs,
  :combinator,
  :pseudos
)
15
+
16
# Attribute selector: `[name]`, `[name=value]`, `[name*='v']`, etc.
#
# The value is captured in one of three slots depending on the quote
# style, so a value such as `[class*="L'appareil"]` (single quote inside
# a double-quoted value) parses cleanly.
#
# Optional whitespace is accepted after `[`, around the operator and
# before `]` (`[ href = "x" ]`), as the CSS attribute-selector grammar
# permits. Input without such whitespace yields the same captures as
# before.
#
#   m[1]: attribute name (Unicode-aware)
#   m[2]: operator (= / *= / ^= / $= / ~= / |=), nil for `[name]`
#   m[3]: value inside double quotes (allows ', backslash escapes)
#   m[4]: value inside single quotes (allows ", backslash escapes)
#   m[5]: bare unquoted value (no whitespace, no `]`)
#   m[6]: optional trailing flag letter (i / s, either case)
ATTR_RE = /
  \A\[\s*
  ([\w:\-\u{0080}-\u{10FFFF}]+)
  \s*
  (?:
    ([*^$~|]?=)
    \s*
    (?:
      "((?:[^"\\]|\\.)*)"
    | '((?:[^'\\]|\\.)*)'
    | ([^\]\s]+)
    )
  )?
  (?:\s+([isIS]))?\s*
  \]
/x.freeze

# Name of a pseudo-class / pseudo-element: an ASCII letter followed by
# word characters or hyphens. Anchored at the start of the input.
PSEUDO_NAME_RE = /\A([a-zA-Z][\w-]*)/.freeze
42
+
43
# Names of the pseudo-classes this module recognises. Pseudo-elements
# (`::text`, `::attr(name)`) are also stored on the atom, but they are
# handled at the css() boundary and take no part in node matching.
KNOWN_PSEUDO_CLASSES = %w[
  not has is matches where
  first-child last-child only-child first-of-type last-of-type only-of-type
  nth-child nth-last-child nth-of-type nth-last-of-type
  empty root scope
  checked disabled enabled
  any-link link visited target focus hover active
  required optional read-only read-write placeholder-shown
].freeze
56
+
57
# Recognised pseudo-element names (Scrapy/Parsel-style `::text`,
# `::attr(name)`, plus the CSS ones). They are recorded on the last atom
# and consumed by the public css() entry points.
KNOWN_PSEUDO_ELEMENTS = %w[
  text
  attr
  first-letter
  first-line
  before
  after
].freeze
60
+
61
# Compile a single selector (no top-level commas) into a plan: an Array
# of atoms, left to right, each carrying the combinator that links it to
# the atom before it.
#
# A leading `>`, `+` or `~` is the scope-relative shorthand and is
# desugared to `:scope <combinator> rest` so the rest of the compiler
# only ever sees one shape.
#
# selector - anything responding to #to_s; surrounding whitespace is
#            ignored.
#
# Returns a non-empty Array of atoms.
# Raises ArgumentError when the selector is empty, when an atom or
# combinator cannot be parsed, or when the selector ends in a combinator
# with nothing after it (e.g. "ul >"). Previously the dangling combinator
# was dropped silently and the truncated selector was executed.
def self.compile(selector)
  sel = selector.to_s.strip
  sel = ":scope #{sel}" if sel.start_with?(">", "+", "~")

  atoms = []
  remainder = sel
  combinator = nil
  until remainder.empty?
    atom, remainder = take_atom(remainder, combinator)
    atoms << atom
    break if remainder.empty?

    combinator, remainder = take_combinator(remainder)
    # take_combinator returns a nil combinator for whitespace-only input;
    # a real combinator followed by nothing is a malformed selector.
    if combinator && remainder.empty?
      raise ArgumentError, "Dangling combinator at end of selector: #{sel}"
    end
  end
  raise ArgumentError, "Empty selector" if atoms.empty?
  atoms
end
84
+
85
# Identifier patterns for type, class and id selectors.
#
# Besides ASCII word characters and `-`, every code point from U+0080
# upwards is accepted, so class names such as
# `caractéristiquesPrincipalesDuProduit` (eBay FR) or Cyrillic/CJK names
# match. Ruby's `\w` covers ASCII only in these patterns, hence the
# explicit range.
#
#   IDENT_TAG_RE   - element name starting with an ASCII letter, or `*`
#   IDENT_CLASS_RE - `.` followed by the class name (captured without dot)
#   IDENT_ID_RE    - `#` followed by the id (captured without hash)
IDENT_TAG_RE   = /\A([a-zA-Z][\w\-\u{0080}-\u{10FFFF}]*|\*)/.freeze
IDENT_CLASS_RE = /\A\.([\w\-\u{0080}-\u{10FFFF}]+)/.freeze
IDENT_ID_RE    = /\A#([\w\-\u{0080}-\u{10FFFF}]+)/.freeze
93
+
94
# Parse one compound selector ("atom") from the front of +s+.
#
# An atom is an optional type selector (`div` or `*`) followed by any
# number of `.class`, `#id`, `[attr...]` and `:pseudo` / `::pseudo` parts.
#
# s          - the remaining selector text, starting at the atom.
# combinator - the combinator that links this atom to the previous one
#              (stored on the atom unchanged; nil for the first atom).
#
# Returns [atom, rest] where +rest+ is the unconsumed text.
# Raises ArgumentError when a class, id or attribute part is malformed,
# or when nothing at all could be parsed at this position.
#
# Quoted attribute values are unescaped: ATTR_RE lets `\"`, `\'` and
# other backslash escapes through, and without unescaping a selector
# such as `[title="say \"hi\""]` could never equal the real attribute
# text. `\` + 1-6 hex digits (plus one optional whitespace) becomes that
# code point; `\` + any other character becomes the character itself.
# Bare (unquoted) values are stored as written.
def self.take_atom(s, combinator)
  atom = Atom.new(nil, [], nil, [], combinator, nil)
  scanner = s
  saw_universal = false

  if (m = scanner.match(IDENT_TAG_RE))
    if m[1] == "*"
      saw_universal = true
    else
      atom.tag = m[1].downcase.to_sym
    end
    scanner = scanner[m[0].size..]
  end

  loop do
    case scanner[0]
    when "."
      m = scanner.match(IDENT_CLASS_RE) || raise(ArgumentError, "Bad class selector: #{s}")
      atom.classes << m[1]
      scanner = scanner[m[0].size..]
    when "#"
      m = scanner.match(IDENT_ID_RE) || raise(ArgumentError, "Bad id selector: #{s}")
      atom.id = m[1]
      scanner = scanner[m[0].size..]
    when "["
      m = scanner.match(ATTR_RE) || raise(ArgumentError, "Bad attribute selector: #{s}")
      # m[3] = double-quoted value, m[4] = single-quoted, m[5] = bare;
      # at most one of them is non-nil.
      quoted = m[3] || m[4]
      if quoted && quoted.include?("\\")
        quoted = quoted.gsub(/\\(?:(\h{1,6})\s?|(.))/) do
          hex = Regexp.last_match(1)
          next Regexp.last_match(2) unless hex

          code = hex.to_i(16)
          # NUL, surrogates and out-of-range values map to U+FFFD.
          usable = code.positive? && code <= 0x10FFFF && !(0xD800..0xDFFF).cover?(code)
          [usable ? code : 0xFFFD].pack("U")
        end
      end
      val = quoted || m[5]
      ci = m[6] && m[6].downcase == "i"
      atom.attrs << [m[1], m[2], val, ci]
      scanner = scanner[m[0].size..]
    when ":"
      name, arg, double_colon, rest = take_pseudo(scanner)
      atom.pseudos ||= []
      atom.pseudos << [name, arg, double_colon]
      scanner = rest
    else
      break
    end
  end

  nothing_parsed = !saw_universal &&
                   atom.tag.nil? &&
                   atom.classes.empty? &&
                   atom.id.nil? &&
                   atom.attrs.empty? &&
                   (atom.pseudos.nil? || atom.pseudos.empty?)
  raise ArgumentError, "Cannot parse selector atom near: #{s}" if nothing_parsed

  [atom, scanner]
end
147
+
148
# Read one pseudo-class (`:name`) or pseudo-element (`::name`) from the
# front of +s+, together with its parenthesised argument if it has one.
#
# The argument is found by balanced-paren scanning, so nested forms like
# `:has(div:not(.x))` work. Parentheses inside `[...]` or inside a quoted
# string are not counted, and a backslash inside a quoted string skips
# the character after it. If the closing paren is missing, everything
# after `(` becomes the argument and nothing is left over.
#
# Returns [name, arg, double_colon, rest]: the downcased name, the
# argument text (nil when there is no parenthesis), whether `::` was
# used, and the unconsumed remainder.
# Raises ArgumentError when no valid name follows the colon(s).
def self.take_pseudo(s)
  double_colon = s.start_with?("::")
  after_colons = s[(double_colon ? 2 : 1)..]
  name_match = after_colons.match(PSEUDO_NAME_RE)
  raise ArgumentError, "Bad pseudo: #{s}" unless name_match

  name = name_match[1].downcase
  tail = after_colons[name_match[0].size..]
  return [name, nil, double_colon, tail] unless tail.start_with?("(")

  open_parens = 1
  open_brackets = 0
  active_quote = nil
  pos = 1
  limit = tail.length
  while pos < limit && open_parens.positive?
    ch = tail[pos]
    if active_quote
      if ch == "\\" && pos + 1 < limit
        pos += 2
        next
      end
      active_quote = nil if ch == active_quote
    else
      case ch
      when "\"", "'" then active_quote = ch
      when "[" then open_brackets += 1
      when "]" then open_brackets -= 1 if open_brackets.positive?
      when "(" then open_parens += 1 if open_brackets.zero?
      when ")" then open_parens -= 1 if open_brackets.zero?
      end
    end
    pos += 1
  end

  if open_parens.positive?
    # Unterminated argument: take the rest of the input.
    [name, tail[1..], double_colon, ""]
  else
    # pos sits just past the matching ")".
    [name, tail[1...(pos - 1)], double_colon, tail[pos..]]
  end
end
194
+
195
# Read the combinator that separates two atoms from the front of +s+.
#
# s - the text that follows an atom.
#
# Returns [combinator, rest]:
#   [:child, rest]      for `>`
#   [:adj, rest]        for `+`
#   [:gen, rest]        for `~`
#   [:descendant, rest] for plain whitespace followed by more selector
#   [nil, ""]           when +s+ is empty or only whitespace
# Whitespace on both sides of an explicit combinator is dropped from rest.
#
# Raises ArgumentError when +s+ starts with neither whitespace nor a
# combinator character.
#
# Whitespace here is the full CSS set (space, tab, LF, CR, FF). The
# earlier version recognised only space, tab and LF, so a selector
# containing CRLF around a combinator failed to parse.
def self.take_combinator(s)
  rest = s.sub(/\A[ \t\n\r\f]+/, "")
  had_ws = rest.length < s.length
  return [nil, ""] if rest.empty?

  kind =
    case rest[0]
    when ">" then :child
    when "+" then :adj
    when "~" then :gen
    end

  if kind
    [kind, rest[1..].sub(/\A[ \t\n\r\f]+/, "")]
  elsif had_ws
    [:descendant, rest]
  else
    raise ArgumentError, "Cannot parse combinator near: #{rest}"
  end
end
229
+
230
# Turn the argument of :nth-child(...) and its relatives into the
# coefficient pair of the `an+b` formula.
#
# All whitespace is removed and the text is downcased before parsing.
# Accepted forms: `odd`, `even`, a signed integer, and `an`, `an+b`,
# `an-b` where `a` may be omitted or be a bare sign.
#
# arg - the raw argument (anything responding to #to_s), or nil.
#
# Returns [a, b] as Integers, or nil when the argument is nil, blank or
# not in one of the accepted forms.
def self.parse_nth(arg)
  return nil if arg.nil?

  formula = arg.to_s.strip.downcase.gsub(/\s+/, "")
  case formula
  when ""
    nil
  when "odd"
    [2, 1]
  when "even"
    [2, 0]
  when /\A[+-]?\d+\z/
    [0, formula.to_i]
  when /\A([+-]?\d*)n([+-]\d+)?\z/
    step_text = Regexp.last_match(1)
    offset_text = Regexp.last_match(2)
    step =
      if step_text.empty? || step_text == "+"
        1
      elsif step_text == "-"
        -1
      else
        step_text.to_i
      end
    [step, offset_text ? offset_text.to_i : 0]
  end
end
253
+
254
# Decide whether the 1-based position +idx+ is produced by the formula
# a*k + b for some integer k >= 0.
#
# a   - step coefficient.
# b   - offset coefficient.
# idx - 1-based position of the element among its siblings.
#
# Returns true or false.
def self.nth_matches?(a, b, idx)
  return idx == b if a.zero?

  distance = idx - b
  # k = distance / a must not be negative, so the two may not have
  # opposite signs.
  opposite_signs = (a.positive? && distance.negative?) ||
                   (a.negative? && distance.positive?)
  return false if opposite_signs

  distance.modulo(a).zero?
end
263
+
264
+ # ----- Execution (used by the Dom::Document selector path). -----
265
+
266
# Run a compiled plan and return the matching nodes.
#
# The rightmost atom selects the candidates; each candidate is then kept
# only if the atoms to its left can be satisfied by walking backwards
# through its ancestors / preceding siblings.
#
# doc   - document object handed to candidates_for_atom.
# plan  - Array of atoms as produced by compile.
# scope - node or document the search is restricted to.
#
# Returns an Array of nodes (empty for an empty plan).
def self.execute(doc, plan, scope)
  return [] if plan.empty?

  anchor = plan.size - 1
  matches = candidates_for_atom(doc, plan[anchor], scope)
  if plan.size > 1
    matches = matches.select do |candidate|
      match_chain_backwards?(candidate, plan, anchor - 1, scope)
    end
  end
  matches
end
275
+
276
# Collect the nodes that satisfy a single atom inside +scope+.
#
# Lookup strategy, in order:
#   * atom has an id    -> one lookup in doc.id_index; the single hit is
#                          returned only if it is in scope and matches
#                          the whole atom.
#   * classes and/or tag -> the smallest of the matching doc.class_index /
#                          doc.tag_index entries is used as the pool.
#   * nothing indexable  -> doc.all_elements when the scope is a
#                          Dom::Document, otherwise scope.css("*").
# The pool is then filtered with atom_matches? and in_scope?.
#
# Returns an Array of nodes.
def self.candidates_for_atom(doc, atom, scope)
  if atom.id
    hit = doc.id_index[atom.id]
    return [] if hit.nil?
    return [] unless in_scope?(hit, scope)

    return atom_matches?(atom, hit) ? [hit] : []
  end

  pools = atom.classes.map { |klass| doc.class_index[klass] || [] }
  pools << (doc.tag_index[atom.tag] || []) if atom.tag

  pool =
    if !pools.empty?
      pools.min_by(&:size)
    elsif defined?(Dom::Document) && scope.is_a?(Dom::Document)
      doc.all_elements
    else
      scope.css("*").to_a
    end

  pool.select { |node| atom_matches?(atom, node) && in_scope?(node, scope) }
end
302
+
303
# Test whether +node+ satisfies every condition of a single atom:
# type, classes, id, attribute selectors and pseudo-classes.
#
# atom - an Atom (tag, classes, id, attrs, pseudos are read).
# node - candidate node; anything that is not an element never matches.
#
# Attribute operators:
#   nil  attribute is present
#   =    exact value
#   *=   value contains val
#   ^=   value starts with val
#   $=   value ends with val
#   ~=   val is one of the whitespace-separated words of the value
#   |=   value is val or starts with "val-"
# When the case-insensitive flag is set, both sides are downcased first.
#
# For *=, ^=, $= and ~= an empty (or missing) val matches nothing, as
# Selectors Level 3 specifies. Previously `[href*=""]` matched every
# element carrying the attribute, because every string includes "".
#
# Pseudo-elements (`::name`) stored on the atom are ignored here; only
# pseudo-classes take part in matching.
#
# Returns true or false.
def self.atom_matches?(atom, node)
  return false unless node.respond_to?(:element?) && node.element?
  return false if atom.tag && node.name.to_sym != atom.tag

  unless atom.classes.empty?
    class_attr = node["class"]
    return false if class_attr.nil?

    tokens = class_attr.split(/\s+/)
    return false unless atom.classes.all? { |c| tokens.include?(c) }
  end

  return false if atom.id && node["id"] != atom.id

  atom.attrs.each do |name, op, val, ci|
    v = node[name]
    if ci && v && val
      v = v.downcase
      val = val.downcase
    end
    case op
    when nil
      return false if v.nil?
    when "="
      return false unless v == val
    when "*=", "^=", "$=", "~="
      return false if v.nil? || val.nil? || val.empty?

      hit =
        case op
        when "*=" then v.include?(val)
        when "^=" then v.start_with?(val)
        when "$=" then v.end_with?(val)
        else v.split(/\s+/).include?(val)
        end
      return false unless hit
    when "|="
      return false if v.nil? || (v != val && !v.start_with?("#{val}-"))
    end
  end

  (atom.pseudos || []).each do |name, arg, double_colon|
    next if double_colon # pseudo-elements aren't matchers

    return false unless pseudo_matches?(node, name, arg)
  end
  true
end
344
+
345
# Evaluate a single pseudo-class against +node+.
#
# node - the element under test.
# name - downcased pseudo-class name (without the colon).
# arg  - text between the parentheses, or nil.
#
# Summary of the rules implemented here:
#   :not(list)               true unless some selector in the list
#                            matches; an empty argument matches everything
#   :is / :matches / :where  true if some selector in the list matches
#   :has(list)               delegated to has_descendant_matching?
#   child / type position    decided from element siblings
#   :empty                   no element children and no non-empty text
#   :root                    no parent, or the parent is a Dom::Document
#   :scope                   always true
#   form-state pseudos       decided purely from the checked / disabled /
#                            required / readonly attributes
#   :link / :any-link        <a> or <area> carrying an href
# Any other name (including :hover, :focus, :visited, ...) returns false,
# so an unsupported pseudo-class yields no results rather than wrong ones.
#
# Returns a truthy or falsy value.
def self.pseudo_matches?(node, name, arg)
  case name
  when "not"
    return true if arg.nil? || arg.empty?

    # A single matching sub-selector disqualifies the node.
    Scrapetor::Dom::Selectors.selector_groups(arg).none? do |group|
      matches_chain_at_node?(node, compile(group))
    end
  when "is", "matches", "where"
    return false if arg.nil? || arg.empty?

    Scrapetor::Dom::Selectors.selector_groups(arg).each do |group|
      return true if matches_chain_at_node?(node, compile(group))
    end
    false
  when "has"
    return false if arg.nil? || arg.empty?

    has_descendant_matching?(node, arg)
  when "first-child"
    node.respond_to?(:previous_element_sibling) && node.previous_element_sibling.nil?
  when "last-child"
    node.respond_to?(:next_element_sibling) && node.next_element_sibling.nil?
  when "only-child"
    node.respond_to?(:next_element_sibling) &&
      node.previous_element_sibling.nil? && node.next_element_sibling.nil?
  when "first-of-type"
    first_of_type?(node)
  when "last-of-type"
    last_of_type?(node)
  when "only-of-type"
    first_of_type?(node) && last_of_type?(node)
  when "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type"
    nth_position_match?(
      node, arg,
      by_type: name.end_with?("-of-type"),
      reverse: name.start_with?("nth-last-")
    )
  when "empty"
    return true unless node.respond_to?(:children)

    node.children.all? do |child|
      !child.element? &&
        !(child.respond_to?(:text?) && child.text? && !child.text.to_s.empty?)
    end
  when "root"
    owner = node.respond_to?(:parent) ? node.parent : nil
    owner.nil? || (defined?(Scrapetor::Dom::Document) && owner.is_a?(Scrapetor::Dom::Document))
  when "scope"
    true
  when "checked", "disabled", "required"
    # The pseudo-class name equals the attribute name for these three.
    truthy_attr?(node, name)
  when "read-only"
    truthy_attr?(node, "readonly")
  when "enabled"
    !truthy_attr?(node, "disabled")
  when "optional"
    !truthy_attr?(node, "required")
  when "read-write"
    !truthy_attr?(node, "readonly")
  when "any-link", "link"
    (node.name == "a" || node.name == "area") && !node["href"].nil?
  else
    # Unknown pseudo-class: fail closed (don't match) so the user
    # sees a missing-result rather than a silent wrong-result.
    false
  end
end
418
+
419
# Treat an attribute as "on" when it is present and its value is not the
# literal string "false".
#
# node - object answering #[] with the attribute value or nil.
# name - attribute name.
#
# Returns true or false.
def self.truthy_attr?(node, name)
  value = node[name]
  return false if value.nil?

  value != "false"
end
423
+
424
# True when no earlier sibling of +node+ is an element with the same
# name. Nodes that do not expose #previous_sibling count as first.
def self.first_of_type?(node)
  return true unless node.respond_to?(:previous_sibling)

  cursor = node
  while (cursor = cursor.previous_sibling)
    same_type = cursor.respond_to?(:element?) && cursor.element? && cursor.name == node.name
    return false if same_type
  end
  true
end
433
+
434
# True when no later sibling of +node+ is an element with the same name.
# Nodes that do not expose #next_sibling count as last.
def self.last_of_type?(node)
  return true unless node.respond_to?(:next_sibling)

  cursor = node
  while (cursor = cursor.next_sibling)
    same_type = cursor.respond_to?(:element?) && cursor.element? && cursor.name == node.name
    return false if same_type
  end
  true
end
443
+
444
# Shared implementation of :nth-child, :nth-last-child, :nth-of-type and
# :nth-last-of-type.
#
# node    - element under test.
# arg     - raw `an+b` argument, parsed with parse_nth.
# by_type - count only siblings with the same name as +node+.
# reverse - count from the last sibling instead of the first.
#
# Returns false when the argument cannot be parsed, when the node has no
# parent, or when the node is not found among its parent's element
# children; otherwise the result of nth_matches? for its 1-based position.
def self.nth_position_match?(node, arg, by_type:, reverse:)
  coefficients = parse_nth(arg)
  return false unless coefficients

  parent = node.parent if node.respond_to?(:parent)
  return false if parent.nil?

  siblings = []
  if parent.respond_to?(:children)
    siblings = parent.children.select { |c| c.respond_to?(:element?) && c.element? }
  end
  siblings = siblings.select { |s| s.name == node.name } if by_type
  siblings = siblings.reverse if reverse

  # Identity comparison: the node itself, not merely an equal one.
  position = siblings.index { |s| s.equal?(node) }
  return false unless position

  step, offset = coefficients
  nth_matches?(step, offset, position + 1)
end
461
+
462
# Check whether +node+ itself is matched by the whole plan: the rightmost
# atom must match the node, and the remaining atoms are then verified by
# walking backwards from it with no scope restriction.
# Used by :not, :is and :has.
#
# Returns false for an empty plan.
def self.matches_chain_at_node?(node, plan)
  return false if plan.empty?

  anchor = plan.size - 1
  if !atom_matches?(plan[anchor], node)
    false
  elsif anchor.zero?
    true
  else
    match_chain_backwards?(node, plan, anchor - 1, nil)
  end
end
471
+
472
# Implementation of :has(...): true when at least one selector in the
# comma-separated list finds a match relative to +node+.
#
# Each group is handled according to its leading character:
#   `> sel`  - some element child of node matches sel
#   `+ sel`  - the next element sibling of node matches sel
#   `~ sel`  - some later element sibling of node matches sel
#   `sel`    - some element descendant of node matches sel
# The relative forms are evaluated directly rather than being compiled
# against a synthetic scope atom.
#
# node         - the element the :has() is attached to.
# selector_str - raw argument of :has().
#
# Returns true or false. Raises ArgumentError (from compile) when a
# group is empty or unparsable.
def self.has_descendant_matching?(node, selector_str)
  Scrapetor::Dom::Selectors.selector_groups(selector_str).each do |raw_group|
    group = raw_group.strip
    case group[0]
    when ">"
      plan = compile(group[1..].lstrip)
      node.children.each do |child|
        next unless child.respond_to?(:element?) && child.element?
        return true if matches_chain_at_node?(child, plan)
      end
    when "+"
      plan = compile(group[1..].lstrip)
      neighbour = node.respond_to?(:next_element_sibling) ? node.next_element_sibling : nil
      return true if neighbour && matches_chain_at_node?(neighbour, plan)
    when "~"
      plan = compile(group[1..].lstrip)
      later = node.respond_to?(:next_element_sibling) ? node.next_element_sibling : nil
      while later
        return true if matches_chain_at_node?(later, plan)

        later = later.next_element_sibling
      end
    else
      plan = compile(group)
      walk_descendants(node) do |descendant|
        return true if matches_chain_at_node?(descendant, plan)
      end
    end
  end
  false
end
512
+
513
# Yield every element descendant of +node+ in document (pre-)order:
# each element child first, then that child's own descendants.
# Non-element children are skipped together with anything below them.
# Does nothing when +node+ has no #children.
def self.walk_descendants(node, &block)
  return unless node.respond_to?(:children)

  node.children.each do |child|
    is_element = child.respond_to?(:element?) && child.element?
    next unless is_element

    yield child
    walk_descendants(child, &block)
  end
end
521
+
522
# Verify the atoms plan[0..idx] by walking away from +node+, which is
# already known to satisfy plan[idx + 1].
#
# The combinator stored on plan[idx + 1] decides where the node for
# plan[idx] is looked for:
#   :child            - the parent element
#   :descendant / nil - any ancestor element
#   :adj              - the previous element sibling
#   :gen              - any previous element sibling
# A candidate is accepted only if it is in scope, matches plan[idx], and
# the rest of the chain (idx - 1 and below) can be matched from it.
#
# Returns true once idx drops below zero (the whole chain is verified),
# false for an unknown combinator.
def self.match_chain_backwards?(node, plan, idx, scope)
  return true if idx < 0

  wanted = plan[idx]
  fits = lambda do |candidate|
    in_scope?(candidate, scope) &&
      atom_matches?(wanted, candidate) &&
      match_chain_backwards?(candidate, plan, idx - 1, scope)
  end

  case plan[idx + 1].combinator
  when :child
    up = node.parent
    return false unless up && up.respond_to?(:element?) && up.element?

    fits.call(up) ? true : false
  when :descendant, nil
    up = node.parent
    while up && up.respond_to?(:element?) && up.element?
      return true if fits.call(up)

      up = up.parent
    end
    false
  when :adj
    left = node.previous_element_sibling if node.respond_to?(:previous_element_sibling)
    return false if left.nil?

    fits.call(left) ? true : false
  when :gen
    left = node.previous_element_sibling if node.respond_to?(:previous_element_sibling)
    while left
      return true if fits.call(left)

      left = left.previous_element_sibling
    end
    false
  else
    false
  end
end
564
+
565
# Whether +node+ lies inside +scope+.
#
# A nil scope and a Dom::Document scope contain everything. Otherwise the
# node is in scope when the scope is the node itself or one of its
# ancestors (found by following #parent until it runs out).
#
# Returns true or false.
def self.in_scope?(node, scope)
  return true if scope.nil?
  return true if defined?(Dom::Document) && scope.is_a?(Dom::Document)

  ancestor = node
  loop do
    return false unless ancestor
    return true if ancestor == scope

    ancestor = ancestor.parent
  end
end
575
+ end
576
+ end