scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Pure-Ruby HTML construction DSL. No external dependency.
  #
  # Two usage patterns:
  #
  #   # 1. Block with explicit receiver:
  #   html = Scrapetor::Builder.build do |b|
  #     b.html do
  #       b.head { b.title "My Page" }
  #       b.body do
  #         b.h1 "Hello", class: "hdr"
  #         b.p "world", id: "lead"
  #         b.a("More", href: "/x")
  #       end
  #     end
  #   end
  #
  #   # 2. Direct instance:
  #   b = Scrapetor::Builder.new
  #   b.div(class: "card") { b.h1 "Title" }
  #   b.to_html
  #
  # A block that declares exactly one parameter receives the builder; any
  # other block is instance_eval'd against the builder, so tags can be
  # written without a receiver inside it.
  class Builder
    # Elements serialized without a closing tag when they have no children.
    VOID = %w[
      area base br col embed hr img input link meta source track wbr
    ].freeze
    private_constant :VOID

    # Method names eligible for tag dispatch. Setter/predicate/bang names
    # and operators never match, so they raise NoMethodError as usual.
    TAG_NAME = /\A[A-Za-z][\w:.-]*\z/.freeze
    private_constant :TAG_NAME

    Element = Struct.new(:name, :attrs, :children)
    RawHTML = Struct.new(:body)
    Comment = Struct.new(:body)
    Doctype = Struct.new(:body)
    private_constant :Element, :RawHTML, :Comment, :Doctype

    # Build a fragment with a fresh builder and return the serialized HTML.
    def self.build(&block)
      new(&block).to_html
    end

    def initialize(&block)
      @stack = []
      @root = []
      run_block(block) if block
    end

    # Inject a text node at the current position (escaped on output).
    def text(s)
      append(s.to_s)
      self
    end

    # Inject pre-escaped raw HTML (emitted verbatim).
    def raw(s)
      append(RawHTML.new(s.to_s))
      self
    end

    # Inject an HTML comment.
    def comment(s)
      append(Comment.new(s.to_s))
      self
    end

    # Inject a doctype.
    def doctype(name = "html")
      append(Doctype.new(name.to_s))
      self
    end

    # Emit an element with an explicit tag name. Use this for names that
    # collide with existing methods (`text`, `comment`, `raw`, ...) or that
    # are not valid Ruby method names.
    #
    # Hash arguments are merged into the attribute set; the first non-Hash
    # argument becomes the leading text child. Returns self.
    def tag(name, *args, &block)
      content = nil
      attrs = {}
      args.each do |arg|
        if arg.is_a?(Hash)
          attrs = attrs.merge(arg)
        else
          content ||= arg.to_s
        end
      end

      element = Element.new(name.to_s, attrs, [])
      append(element)
      @stack.push(element)
      begin
        element.children << content unless content.nil?
        run_block(block) if block
      ensure
        # Always unwind, so an exception inside the block does not leave
        # later siblings nested under this element.
        @stack.pop
      end
      self
    end

    # `p` and `select` are private Kernel methods, so a receiver-less call
    # inside an instance_eval'd block would never reach method_missing.
    # Defining them explicitly keeps <p> and <select> usable in both modes.
    def p(*args, &block)
      tag(:p, *args, &block)
    end

    def select(*args, &block)
      tag(:select, *args, &block)
    end

    # Method-missing dispatch: any unknown tag-like method becomes a tag.
    #
    #   b.div("hi", class: "card") { b.span "x" }
    #   -> <div class="card">hi<span>x</span></div>
    #
    # Conversion-protocol names (`to_ary`, `to_str`, ...) and names that are
    # not plausible tag names are passed to super, so Ruby's implicit
    # conversions (`puts builder`, `Array(builder)`, `[builder].flatten`)
    # neither raise TypeError nor inject junk elements.
    def method_missing(name, *args, &block)
      return super unless tag_method?(name)

      tag(name, *args, &block)
    end

    def respond_to_missing?(name, include_private = false)
      tag_method?(name) || super
    end

    # Serialize everything appended so far.
    def to_html
      @root.map { |node| serialize(node) }.join
    end
    alias to_s to_html

    private

    # True when `name` should be dispatched as a tag by method_missing.
    def tag_method?(name)
      str = name.to_s
      TAG_NAME.match?(str) && !str.start_with?("to_")
    end

    # Run a DSL block: one-parameter blocks get the builder as an argument,
    # everything else is evaluated with the builder as self.
    def run_block(block)
      if block.arity == 1
        block.call(self)
      else
        instance_eval(&block)
      end
    end

    # Attach a node to the innermost open element, or to the root list.
    def append(node)
      if @stack.empty?
        @root << node
      else
        @stack.last.children << node
      end
    end

    def serialize(node)
      case node
      when String
        escape_text(node)
      when RawHTML
        node.body
      when Comment
        # Neutralise comment terminators so the body cannot end the comment
        # early and leak markup into the page.
        "<!--#{node.body.gsub(/--(!?)>/) { "--#{Regexp.last_match(1)}&gt;" }}-->"
      when Doctype
        "<!DOCTYPE #{node.body}>"
      when Element
        attr_str = node.attrs.map { |k, v| %( #{k}="#{escape_attr(v)}") }.join
        if VOID.include?(node.name) && node.children.empty?
          "<#{node.name}#{attr_str}>"
        else
          inner = node.children.map { |c| serialize(c) }.join
          "<#{node.name}#{attr_str}>#{inner}</#{node.name}>"
        end
      end
    end

    def escape_text(s)
      s.to_s.gsub(/[&<>]/, "&" => "&amp;", "<" => "&lt;", ">" => "&gt;")
    end

    def escape_attr(s)
      s.to_s.gsub(/[&<>"']/,
                  "&" => "&amp;",
                  "<" => "&lt;",
                  ">" => "&gt;",
                  '"' => "&quot;",
                  "'" => "&#39;")
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Whitespace normalisation for scraped text.
  module Cleaner
    # Collapse every run of whitespace into a single space and trim the ends.
    #
    # Uses the POSIX `[[:space:]]` class rather than `\s`: `\s` only covers
    # ASCII whitespace, so non-breaking spaces (`&nbsp;`, U+00A0) and other
    # Unicode separators that are common in scraped HTML would survive.
    #
    # Returns nil for nil input; any other value is converted with #to_s.
    # Invalid byte sequences are scrubbed first so the regex pass cannot
    # raise ArgumentError on badly-encoded input.
    def self.clean(s)
      return nil if s.nil?

      str = s.to_s
      str = str.scrub unless str.valid_encoding?
      str.gsub(/[[:space:]]+/, " ").strip
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Result type for XPath `comment()` queries (`//comment()`,
  # `child::comment()`, etc.). Carries the comment's text payload and
  # implements the Node-shape predicate methods so duck-typing checks
  # (`n.comment?`, `n.element?`, `n.name == "#comment"`) behave like they
  # would on a regular node.
  #
  # The constructor accepts a String or any object responding to
  # `#content`; anything else is converted with `#to_s`. In every case
  # `#text` / `#content` returns the payload between `<!--` and `-->`.
  class CommentNode
    attr_reader :document, :text

    alias content text
    alias inner_text text
    alias to_s text

    def initialize(document, payload)
      @document = document
      @text =
        if payload.is_a?(String)
          payload
        elsif payload.respond_to?(:content)
          payload.content.to_s
        else
          payload.to_s
        end
    end

    def to_html
      "<!--#{@text}-->"
    end
    alias outer_html to_html
    alias inner_html to_html

    def name
      "#comment"
    end
    alias node_name name

    def comment?
      true
    end

    def element?
      false
    end

    def text?
      false
    end

    def document?
      false
    end

    def cdata?
      false
    end

    # DOM nodeType constant for comment nodes.
    def node_type
      8
    end

    # Node-shape probes that scraping code occasionally fires against
    # mixed result sets. Returning a benign default keeps a stray
    # `.css(...)` or `.attributes` from raising NoMethodError when a
    # caller iterates over an Array<Element + CommentNode>.
    def attributes
      {}
    end

    def attribute_nodes
      []
    end

    def attribute(_)
      nil
    end

    def keys
      []
    end

    def values
      []
    end

    def children
      []
    end

    def element_children
      []
    end

    def classes
      []
    end

    def has_class?(_)
      false
    end

    def [](*_args)
      nil
    end

    # The query stubs accept any number of arguments, mirroring
    # `Document#css(*selectors)` and `Document#at(selector, *_extra)`, so a
    # multi-selector or namespace-hash call is a no-op instead of an
    # ArgumentError.
    def css(*_)
      []
    end

    def at_css(*_)
      nil
    end

    def at(*_)
      nil
    end

    def search(*_)
      []
    end

    def xpath(*_)
      []
    end

    def at_xpath(*_)
      nil
    end

    def inspect
      "#<Scrapetor::CommentNode #{@text.inspect}>"
    end
  end
end
+ end
@@ -0,0 +1,457 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # A parsed HTML document. Parsing is lazy: the backing tree is created on
  # the first call to #backing, using the native DOM wrapper when it is
  # available and the pure-Ruby Dom::Parser otherwise.
  class Document
    attr_reader :base_url, :encoding, :html_str

    # @param html [#to_s] raw markup
    # @param base_url [String, nil] passed through to the native extractor
    # @param build_indexes [Boolean] build the class/id/tag indexes eagerly
    # @param encoding [:auto, #to_s] :auto detects the encoding; any other
    #   value is treated as the source encoding name
    # @param native [Object, nil] pre-parsed native handle (persistent-cache
    #   hit); wrapped directly instead of re-parsing
    def initialize(html, base_url: nil, build_indexes: false, encoding: :auto, native: nil)
      @base_url = base_url
      raw = html.to_s
      if encoding == :auto
        @encoding = Scrapetor::Encoding.detect(raw)
        @html_str = Scrapetor::Encoding.to_utf8(raw)
      else
        @encoding = encoding.to_s
        @html_str = raw.dup.force_encoding(@encoding).encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      end
      @backing = nil # parsed lazily; native extract bypasses this entirely
      @selector_cache = {}
      @indexes_built = false
      @class_index = nil
      @id_index = nil
      @tag_index = nil
      @all_elements = nil
      # Hot-path slots (populated by #backing): keeping these initialised
      # silences "instance variable not initialized" and makes the
      # fast-path test a simple nil check.
      @native_doc = nil
      @native_wrapper = nil
      @plan_cache = nil
      @lazy_ids = nil
      @prebuilt_native = native
      build_indexes! if build_indexes
    end

    # CSS query entry point.
    #
    # Nokogiri-compat: `doc.css(sel1, sel2, ...)` accepts multiple selectors
    # and returns the union of matches across all of them. Non-String
    # arguments (e.g. a namespaces hash) are dropped.
    #
    # @raise [ArgumentError] when no String selector is given
    def css(*selectors)
      selectors = selectors.grep(String)
      raise ArgumentError, "Document#css requires at least one selector" if selectors.empty?
      return css_single(selectors.first) if selectors.size == 1

      seen = {}
      union = []
      string_result = nil
      selectors.each do |sel|
        result = css_single(sel)
        if result.is_a?(Array)
          # Pseudo-element selectors yield plain Arrays of strings.
          string_result = true
          union.concat(result)
        else
          # NodeSet — pull backing items and dedupe by identity.
          string_result = false if string_result.nil?
          result.each do |node|
            bk = node.respond_to?(:backing_node) ? node.backing_node : node
            key = bk.object_id
            next if seen[key]

            seen[key] = true
            union << bk
          end
        end
      end
      string_result ? union : NodeSet.new(self, union)
    end

    # Run a single selector.
    #
    # Fast path: native backing already materialised, no mutations applied
    # (wrapper not in dom_mode), plain String selector without `::`. The
    # compiled plan is cached per selector; `false` marks a selector that
    # failed to compile so it is not recompiled on every call.
    def css_single(selector)
      if @native_doc && !@native_wrapper.dom_mode? && selector.is_a?(String) && !selector.include?("::")
        plan = @plan_cache[selector]
        unless plan || @plan_cache.key?(selector)
          plan = Scrapetor::Native.compile_selector_chain(selector)
          @plan_cache[selector] = plan || false
        end
        if plan
          return NodeSet.new(self, @lazy_ids.new(@native_wrapper, @native_doc, @native_doc.run_chain(plan, nil)))
        end
      end

      # Slow path: pseudo-element, uncompilable selector, post-mutation, or
      # non-native backing.
      bk = backing
      result = bk.respond_to?(:lazy_css) ? bk.lazy_css(selector) : bk.css(selector)
      if result.is_a?(Array) && (result.first.is_a?(String) || (result.empty? && pseudo_element?(selector)))
        return result
      end
      return NodeSet.new(self, result) if @lazy_ids && result.is_a?(@lazy_ids)

      NodeSet.new(self, result.to_a)
    end

    # Run an array of CSS selectors in one call to the backing engine when
    # it supports batching; otherwise loop over #css. Returns an Array of
    # results parallel to the input (NodeSets, or Arrays of strings for
    # `::text` / `::attr(name)` selectors).
    #
    #   title_ns, price_strs, hrefs = doc.batch_css(
    #     ["h1.title", ".price::text", "a::attr(href)"]
    #   )
    def batch_css(selectors)
      bk = backing
      return selectors.map { |s| css(s) } unless bk.respond_to?(:batch_css)

      bk.batch_css(self, selectors)
    end

    # Hash form of #batch_css: `{ name => selector, ... }` ->
    # `{ name => result, ... }`.
    def extract_css(map)
      results = batch_css(map.values)
      out = {}
      map.keys.each_with_index { |key, i| out[key] = results[i] }
      out
    end

    # Iterate matches of `outer_selector` across the whole document and
    # build a Hash per match using `fields` (a {key => selector} map).
    # Returns Array<Hash>; the inner selectors run scoped to each match.
    #
    #   doc.extract_each(".result", {
    #     title: ".title::text",
    #     price: ".price::text",
    #     href:  "a::attr(href)",
    #   })
    #
    # When the document is native-backed and unmutated the native engine is
    # tried first; a literal `true` from it is treated as "not handled" and
    # the per-row Ruby loop runs instead.
    def extract_each(outer_selector, fields)
      bk = backing
      if defined?(Scrapetor::Native::DocumentWrapper) &&
         bk.is_a?(Scrapetor::Native::DocumentWrapper) && !bk.dom_mode?
        outer_str = outer_selector.is_a?(String) ? outer_selector : outer_selector.to_s
        rows = bk.native.extract_each_h(outer_str, nil, fields, bk)
        return rows unless rows.equal?(true)
      end
      css(outer_selector).map { |node| node.extract(fields) }
    end

    # First match for a selector. Accepts the Nokogiri-compatible signature
    # `doc.at(sel, ns_or_handler)`; everything past the selector is ignored.
    # Returns a Node, a String (when the backing returns one), or nil.
    def at(selector, *_extra)
      result = backing.at_css(selector)
      return nil if result.nil?
      return result if result.is_a?(String)

      Node.new(self, result)
    end
    alias at_css at
    alias search css

    # Evaluate an XPath expression against this document via
    # Scrapetor::XPath.evaluate.
    def xpath(expr)
      Scrapetor::XPath.evaluate(self, expr)
    end

    # First result of #xpath (or the result itself when it is not an Array).
    def at_xpath(expr)
      result = xpath(expr)
      result.is_a?(Array) ? result.first : result
    end

    # Walk the backing tree, yielding Node wrappers for anything that
    # responds to `element?` and the raw object otherwise. Returns an
    # Enumerator without a block; a backing without #traverse is skipped.
    def traverse
      return enum_for(:traverse) unless block_given?

      bk = backing
      if bk.respond_to?(:traverse)
        bk.traverse { |n| yield(n.respond_to?(:element?) ? Node.new(self, n) : n) }
      end
      self
    end

    # The <html> element (or the backing root when there is none).
    def root
      Node.new(self, backing.at_css("html") || backing)
    end

    def text
      backing.text
    end
    alias content text
    alias inner_text text

    # Text of the first <title> element, or nil/false when absent.
    def title
      n = backing.at_css("title")
      n && n.text
    end

    def body
      n = backing.at_css("body")
      n && Node.new(self, n)
    end

    def head
      n = backing.at_css("head")
      n && Node.new(self, n)
    end

    def html
      Node.new(self, backing.at_css("html") || backing)
    end

    def to_html
      backing.to_html
    end
    alias to_s to_html

    # Nokogiri-compat predicates.
    def errors
      []
    end

    def html?
      true
    end

    def xml?
      false
    end

    # Structured-data extractors — for SEO/RAG/structured-content pipelines.

    def json_ld
      Scrapetor::StructuredData.json_ld(self)
    end

    def opengraph
      Scrapetor::StructuredData.opengraph(self)
    end

    def twitter_card
      Scrapetor::StructuredData.twitter_card(self)
    end

    def schema_org(type: nil)
      Scrapetor::StructuredData.schema_org(self, type: type)
    end

    def microdata
      Scrapetor::Microdata.extract(self)
    end

    # NOTE(review): the package manifest lists no rdfa.rb — confirm that
    # Scrapetor::RDFa is defined somewhere before relying on this.
    def rdfa
      Scrapetor::RDFa.extract(self)
    end

    def page_type
      Scrapetor::PageType.detect(self)
    end

    # Extract structured data from the document.
    #
    # Two call shapes share this entry point:
    #
    #   doc.extract(title: "h1::text", link: "a")   # {key => selector} map
    #   doc.extract(schema)  /  doc.extract { ... } # Schema (or Schema block)
    #
    # The class used to define `extract` twice, so the later schema-based
    # definition silently replaced the Hash form. Dispatching on Hash here
    # keeps both documented forms working.
    def extract(schema = nil, &block)
      return extract_fields(schema) if schema.is_a?(Hash)

      schema ||= Schema.build(&block)
      if Native.available?
        result = extract_via_native(schema)
        return result unless result.nil?
      end
      Extractor.run(self, backing, schema)
    end

    # Sizes of the structural indexes and the selector cache.
    def stats
      {
        classes: @class_index ? @class_index.size : 0,
        ids: @id_index ? @id_index.size : 0,
        tags: @tag_index ? @tag_index.size : 0,
        selector_cache_size: @selector_cache.size,
        indexes_built: @indexes_built
      }
    end

    # The parsed tree, created on first use. Also caches the hot-path slots
    # so #css_single can skip the indirection on later calls.
    def backing
      return @backing if @backing

      @backing =
        if defined?(Scrapetor::Native::DocumentWrapper) && Scrapetor::Native::AVAILABLE_DOM
          native = @prebuilt_native || Scrapetor::Native::Document.parse(@html_str)
          @prebuilt_native = nil
          Scrapetor::Native::DocumentWrapper.new(native)
        else
          Dom::Parser.parse(@html_str)
        end
      if defined?(Scrapetor::Native::DocumentWrapper) &&
         @backing.is_a?(Scrapetor::Native::DocumentWrapper)
        @native_doc = @backing.native
        @native_wrapper = @backing
        @plan_cache = @backing.instance_variable_get(:@compile_cache)
        @lazy_ids = Scrapetor::Native::DocumentWrapper::LazyIds
      end
      @backing
    end

    # Structural indexes, built on demand.
    def class_index
      build_indexes! unless @indexes_built
      @class_index
    end

    def id_index
      build_indexes! unless @indexes_built
      @id_index
    end

    def tag_index
      build_indexes! unless @indexes_built
      @tag_index
    end

    def all_elements
      build_indexes! unless @indexes_built
      @all_elements
    end

    # Compile (memoised per selector) and execute a selector within `scope`.
    def run_selector(selector, scope)
      plan = @selector_cache[selector] ||= Selector.compile(selector)
      Selector.execute(self, plan, scope)
    end

    def cache_selector(selector)
      @selector_cache[selector] ||= Selector.compile(selector)
    end

    def selector_cache_size
      @selector_cache.size
    end

    private

    # True when the selector ends in a pseudo-element.
    def pseudo_element?(selector)
      selector.to_s.match?(/::(text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i)
    end

    # {key => selector} extraction on the document scope. Tries the native
    # engine first; a literal `true` from it is treated as "not handled"
    # and each field is resolved with #at_css instead.
    def extract_fields(map)
      bk = backing
      if defined?(Scrapetor::Native::DocumentWrapper) &&
         bk.is_a?(Scrapetor::Native::DocumentWrapper) && !bk.dom_mode?
        found = bk.native.extract_one_h(nil, map, bk)
        return found unless found.equal?(true)
      end
      out = {}
      map.each_pair { |key, sel| out[key] = at_css(sel) }
      out
    end

    # Try the native path. Returns the result Hash on success, nil when the
    # schema can't be compiled or yields no root record (caller falls back
    # to Ruby).
    #
    # Schemas with both top-level fields AND repeated groups run two native
    # passes: per the original notes the engine supports one active record
    # at a time, so the synthetic root used for top-level fields can't
    # share a scan with a repeated group.
    def extract_via_native(schema)
      has_fields = schema.fields.any?
      has_groups = schema.groups.any?
      return nil unless has_fields || has_groups

      # Only repeated groups — one pass, no schema split.
      if has_groups && !has_fields
        desc = Native.compile_descriptor(schema)
        return nil unless desc

        return Native.extract(@html_str, desc, @base_url)
      end

      # Top-level fields only — one pass via synthetic root.
      if has_fields && !has_groups
        desc = Native.compile_descriptor(schema)
        return nil unless desc

        raw = Native.extract(@html_str, desc, @base_url)
        root_records = raw[Native::SYNTHETIC_ROOT]
        return nil if !root_records.is_a?(Array) || root_records.empty?

        return root_records[0]
      end

      # Mixed: split the schema into two sub-descriptors and extract twice.
      groups_desc = Native.split_descriptor(schema, :groups)
      return nil unless groups_desc

      result = Native.extract(@html_str, groups_desc, @base_url)

      fields_desc = Native.split_descriptor(schema, :fields)
      return nil unless fields_desc

      raw = Native.extract(@html_str, fields_desc, @base_url)
      root_records = raw[Native::SYNTHETIC_ROOT]
      return nil if !root_records.is_a?(Array) || root_records.empty?

      root_records[0].merge(result)
    end

    # Build the class / id / tag indexes from every element in the tree.
    # The first element wins for duplicate ids.
    def build_indexes!
      return if @indexes_built

      @class_index = Hash.new { |h, k| h[k] = [] }
      @id_index = {}
      @tag_index = Hash.new { |h, k| h[k] = [] }
      @all_elements = backing.css("*").to_a
      @all_elements.each do |el|
        @tag_index[el.name.to_sym] << el
        id = el["id"]
        @id_index[id] ||= el if id
        cls = el["class"]
        next unless cls

        cls.split(/\s+/).each { |c| @class_index[c] << el unless c.empty? }
      end
      @indexes_built = true
    end
  end
end