scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
  # Pure-Ruby HTML construction DSL. No external dependency.
  #
  # Two usage patterns:
  #
  #   # 1. Block with explicit receiver:
  #   html = Scrapetor::Builder.build do |b|
  #     b.html do
  #       b.head { b.title "My Page" }
  #       b.body do
  #         b.h1 "Hello", class: "hdr"
  #         b.p  "world", id: "lead"
  #         b.a("More", href: "/x")
  #       end
  #     end
  #   end
  #
  #   # 2. Direct instance:
  #   b = Scrapetor::Builder.new
  #   b.div(class: "card") { b.h1 "Title" }
  #   b.to_html
  #
  # Text content and attribute values are entity-escaped on output;
  # only #raw bypasses escaping.
  class Builder
    # Elements that are serialized without a closing tag when they have
    # no children.
    VOID = %w[
      area base br col embed hr img input link meta source track wbr
    ].freeze
    private_constant :VOID

    # Replacement tables for String#gsub. Text nodes escape the three
    # markup-significant characters; attribute values additionally
    # escape both quote characters.
    TEXT_ESCAPES = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;" }.freeze
    ATTR_ESCAPES = TEXT_ESCAPES.merge('"' => "&quot;", "'" => "&#39;").freeze
    private_constant :TEXT_ESCAPES, :ATTR_ESCAPES

    # Internal tree node types. Plain Strings in a children list are
    # text nodes.
    Element = Struct.new(:name, :attrs, :children)
    RawHTML = Struct.new(:body)
    Comment = Struct.new(:body)
    Doctype = Struct.new(:body)
    private_constant :Element, :RawHTML, :Comment, :Doctype

    # Build a document in one call and return the serialized HTML String.
    def self.build(&block)
      new(&block).to_html
    end

    # An optional block is run immediately: a one-parameter block
    # receives the builder, any other block is instance_eval'd against it.
    def initialize(&block)
      @stack = []
      @root = []
      return unless block

      if block.arity == 1
        block.call(self)
      else
        instance_eval(&block)
      end
    end

    # Append a text node at the current position (escaped on output).
    # Returns self.
    def text(s)
      append(s.to_s)
      self
    end

    # Append markup that is emitted verbatim, without escaping.
    # Returns self.
    def raw(s)
      append(RawHTML.new(s.to_s))
      self
    end

    # Append an HTML comment. The body is emitted verbatim between
    # `<!--` and `-->`. Returns self.
    def comment(s)
      append(Comment.new(s.to_s))
      self
    end

    # Append a doctype declaration. Returns self.
    def doctype(name = "html")
      append(Doctype.new(name.to_s))
      self
    end

    # Method-missing dispatch: any unknown method becomes a tag.
    #
    #   b.div("hi", class: "card") { b.span "x" }
    #     -> <div class="card">hi<span>x</span></div>
    #
    # Hash arguments are merged into the attribute set; the first
    # non-Hash argument (stringified) becomes the leading text child.
    # Returns self.
    def method_missing(name, *args, &block)
      content = nil
      attrs = {}
      args.each do |arg|
        if arg.is_a?(Hash)
          attrs = attrs.merge(arg)
        else
          content ||= arg.to_s
        end
      end

      element = Element.new(name.to_s, attrs, [])
      append(element)
      element.children << content unless content.nil?

      if block
        @stack.push(element)
        begin
          # Dispatch is inlined here (rather than shared with
          # #initialize through a helper) so no extra method name is
          # taken away from the tag namespace.
          if block.arity == 1
            block.call(self)
          else
            instance_eval(&block)
          end
        ensure
          # Always unwind, even when the block raises; otherwise every
          # later node would be appended inside this element.
          @stack.pop
        end
      end
      self
    end

    # Every method name is a valid tag, so the builder answers to anything.
    def respond_to_missing?(_name, _include_private = false)
      true
    end

    # Serialize everything built so far to an HTML String.
    def to_html
      @root.map { |node| serialize(node) }.join
    end
    alias to_s to_html

    private

    # Attach +node+ to the innermost open element, or to the document
    # root when no element is open.
    def append(node)
      target = @stack.empty? ? @root : @stack.last.children
      target << node
    end

    # Render one tree node (and its subtree) to a String.
    def serialize(node)
      case node
      when String  then escape_text(node)
      when RawHTML then node.body
      when Comment then "<!--#{node.body}-->"
      when Doctype then "<!DOCTYPE #{node.body}>"
      when Element then serialize_element(node)
      end
    end

    # Render an element: attributes, then either a bare void tag or an
    # open tag, the children, and a close tag.
    def serialize_element(node)
      attr_str = node.attrs.map { |key, value| %( #{key}="#{escape_attr(value)}") }.join
      return "<#{node.name}#{attr_str}>" if VOID.include?(node.name) && node.children.empty?

      inner = node.children.map { |child| serialize(child) }.join
      "<#{node.name}#{attr_str}>#{inner}</#{node.name}>"
    end

    # Escape `&`, `<` and `>` for use as element text.
    def escape_text(s)
      s.to_s.gsub(/[&<>]/, TEXT_ESCAPES)
    end

    # Escape `&`, `<`, `>`, `"` and `'` for use inside a quoted
    # attribute value.
    def escape_attr(s)
      s.to_s.gsub(/[&<>"']/, ATTR_ESCAPES)
    end
  end
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
  # Result type for XPath `comment()` queries (`//comment()`,
  # `child::comment()`, etc.). Carries the comment's text payload and
  # implements the Node-shape predicate methods so duck-typing checks
  # (`n.comment?`, `n.element?`, `n.name == "#comment"`) work on it.
  #
  # The constructor accepts a String or any object responding to
  # `#content` (anything else is stringified); in every case
  # `#text` / `#content` returns the payload between `<!--` and `-->`.
  class CommentNode
    attr_reader :document

    # document - the owning document (stored as-is, never inspected here).
    # payload  - String, an object responding to #content, or anything
    #            convertible with #to_s.
    def initialize(document, payload)
      @document = document
      @text =
        if payload.is_a?(String)
          payload
        elsif payload.respond_to?(:content)
          payload.content.to_s
        else
          payload.to_s
        end
    end

    # The comment payload, without the `<!--` / `-->` delimiters.
    def text
      @text
    end
    alias content text
    alias inner_text text

    # Stringifies to the bare payload (not the delimited markup).
    def to_s
      @text
    end

    # The comment re-wrapped in its delimiters.
    def to_html
      "<!--#{@text}-->"
    end
    alias outer_html to_html
    alias inner_html to_html

    def name
      "#comment"
    end
    alias node_name name

    # Node-kind predicates: this is a comment and nothing else.
    def comment?
      true
    end

    def element?
      false
    end

    def text?
      false
    end

    def document?
      false
    end

    def cdata?
      false
    end

    # DOM nodeType for comment nodes.
    def node_type
      8
    end

    # Node-shape probes that scraping code occasionally fires against
    # mixed result sets. Returning a benign default keeps a stray
    # `.css(...)` or `.attributes` from raising NoMethodError when a
    # caller iterates over an Array<Element + CommentNode>.
    def attributes
      {}
    end

    def attribute_nodes
      []
    end

    def attribute(_)
      nil
    end

    def keys
      []
    end

    def values
      []
    end

    def children
      []
    end

    def element_children
      []
    end

    def classes
      []
    end

    def has_class?(_)
      false
    end

    def [](*_args)
      nil
    end

    # The query stubs accept any number of arguments, like Document#css
    # and Document#at do, so that a Nokogiri-style call with extra
    # selectors or a namespaces hash gets the benign default instead of
    # an ArgumentError.
    def css(*_)
      []
    end

    def at_css(*_)
      nil
    end

    def at(*_)
      nil
    end

    def search(*_)
      []
    end

    def xpath(*_)
      []
    end

    def at_xpath(*_)
      nil
    end

    def inspect
      "#<Scrapetor::CommentNode #{@text.inspect}>"
    end
  end
end
|
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
  # A parsed HTML document: the entry point for CSS / XPath queries,
  # structured-data extraction and schema-driven extraction.
  #
  # The markup is converted to UTF-8 in the constructor, but parsing is
  # deferred until the first call that needs a tree (see #backing).
  # When the native DOM engine is available the document is backed by
  # Scrapetor::Native::DocumentWrapper; otherwise by the pure-Ruby
  # Dom::Parser.
  class Document
    # Matches a selector that ends in a recognised `::pseudo-element`.
    PSEUDO_ELEMENT_RE = /::(text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i.freeze
    private_constant :PSEUDO_ELEMENT_RE

    attr_reader :base_url, :encoding

    # The UTF-8 markup this document was built from.
    attr_reader :html_str

    # html          - markup; converted with #to_s.
    # base_url      - stored and passed through to native extraction.
    # build_indexes - build the class/id/tag indexes eagerly.
    # encoding      - :auto to detect, or an encoding name used to
    #                 reinterpret the bytes before transcoding to UTF-8
    #                 (invalid / undefined sequences are dropped).
    # native        - pre-parsed native handle (persistent-cache hit);
    #                 wrapped directly instead of re-parsing.
    def initialize(html, base_url: nil, build_indexes: false, encoding: :auto, native: nil)
      @base_url = base_url
      raw = html.to_s
      if encoding == :auto
        @encoding = Scrapetor::Encoding.detect(raw)
        @html_str = Scrapetor::Encoding.to_utf8(raw)
      else
        @encoding = encoding.to_s
        @html_str = raw.dup.force_encoding(@encoding)
                       .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      end
      @backing = nil # parsed lazily; native extract bypasses this entirely
      @selector_cache = {}
      @indexes_built = false
      @class_index = nil
      @id_index = nil
      @tag_index = nil
      @all_elements = nil
      # Hot-path slots (populated by #backing): keeping these
      # initialised silences "instance variable not initialized" and
      # makes the fast-path test a simple nil check.
      @native_doc = nil
      @native_wrapper = nil
      @plan_cache = nil
      @lazy_ids = nil
      @prebuilt_native = native
      build_indexes! if build_indexes
    end

    # CSS query entry point.
    #
    # Nokogiri-compat: `doc.css(sel1, sel2, ...)` accepts multiple
    # selectors and returns the union of matches across all of them.
    # Non-String arguments are dropped (Nokogiri also accepts an XPath
    # namespaces hash here — that's a no-op for CSS).
    #
    # Returns a NodeSet, or a plain Array when any selector produced an
    # Array result (the `::text` / `::attr(name)` forms).
    # Raises ArgumentError when no String selector is given.
    def css(*selectors)
      selectors = selectors.grep(String)
      raise ArgumentError, "Document#css requires at least one selector" if selectors.empty?
      return css_single(selectors.first) if selectors.size == 1

      seen = {}
      union = []
      any_array = false
      selectors.each do |sel|
        result = css_single(sel)
        if result.is_a?(Array)
          any_array = true
          union.concat(result)
        else
          # NodeSet — pull backing items and dedupe by identity.
          result.each do |node|
            bk = node.respond_to?(:backing_node) ? node.backing_node : node
            key = bk.object_id
            next if seen[key]

            seen[key] = true
            union << bk
          end
        end
      end
      any_array ? union : NodeSet.new(self, union)
    end
    alias search css

    # Run one CSS selector.
    #
    # Fast path: native backing already built, no mutations applied
    # yet, plain String selector, no pseudo-element. One Hash lookup +
    # one C call + two allocations. After any mutation the wrapper
    # flips into dom_mode and we route through the slow path so reads
    # see the user's edits. A selector that failed to compile is cached
    # as `false` so it is not recompiled on every call.
    def css_single(selector)
      if @native_doc && !@native_wrapper.dom_mode? && selector.is_a?(String) && !selector.include?("::")
        plan = @plan_cache[selector]
        if plan.nil? && !@plan_cache.key?(selector)
          plan = Scrapetor::Native.compile_selector_chain(selector)
          @plan_cache[selector] = plan || false
        end
        if plan
          ids = @lazy_ids.new(@native_wrapper, @native_doc, @native_doc.run_chain(plan, nil))
          return NodeSet.new(self, ids)
        end
      end

      # Slow path: pseudo-element, comma, fallback, post-mutation, or
      # non-native backing.
      bk = backing
      result = bk.respond_to?(:lazy_css) ? bk.lazy_css(selector) : bk.css(selector)
      if result.is_a?(Array) && (result.first.is_a?(String) || (result.empty? && pseudo_element?(selector)))
        return result
      end
      return NodeSet.new(self, result) if @lazy_ids && result.is_a?(@lazy_ids)

      NodeSet.new(self, result.to_a)
    end

    # Run an array of CSS selectors, delegating to the backing's own
    # batch_css when it has one so the whole batch is a single call.
    # Returns an Array of results parallel to the input.
    #
    #   title_ns, price_strs, hrefs = doc.batch_css(
    #     ["h1.title", ".price::text", "a::attr(href)"]
    #   )
    def batch_css(selectors)
      bk = backing
      # Backing without batch support — loop manually.
      return selectors.map { |s| css(s) } unless bk.respond_to?(:batch_css)

      bk.batch_css(self, selectors)
    end

    # Hash form of #batch_css:
    # `{ name => selector, ... }` -> `{ name => result, ... }`.
    def extract_css(map)
      results = batch_css(map.values)
      map.keys.each_with_index.each_with_object({}) do |(key, index), out|
        out[key] = results[index]
      end
    end

    # Extract structured data from the document. Three call forms:
    #
    #   doc.extract({ title: "h1::text" })   # Hash of key => selector
    #   doc.extract(schema)                  # pre-built schema
    #   doc.extract { ... }                  # block passed to Schema.build
    #
    # The Hash form yields one result per key (first match of each
    # selector). The schema forms try the native engine first and fall
    # back to Extractor.run when the schema cannot be compiled.
    def extract(schema = nil, &block)
      return extract_map(schema) if schema.is_a?(Hash)

      schema ||= Schema.build(&block)
      if Native.available?
        result = extract_via_native(schema)
        return result unless result.nil?
      end
      Extractor.run(self, backing, schema)
    end

    # Iterate matches of `outer_selector` across the whole document
    # and build a Hash per match using `fields` (a {key => selector}
    # map). Returns Array<Hash>.
    #
    #   doc.extract_each(".result", {
    #     title: ".title::text",
    #     price: ".price::text",
    #     href:  "a::attr(href)",
    #   })
    #
    # On an unmutated native-backed document the work is handed to the
    # engine's extract_each_h; a return value of `true` from that call
    # is treated as "not handled" and the per-row Ruby loop runs instead.
    def extract_each(outer_selector, fields)
      bk = backing
      if native_backed?(bk)
        r = bk.native.extract_each_h(outer_selector.to_s, nil, fields, bk)
        return r unless r.equal?(true)
      end
      css(outer_selector).map { |node| node.extract(fields) }
    end

    # First match for a selector. Accepts the Nokogiri-compatible
    # signature `doc.at(sel, ns_or_handler)`; everything past the
    # selector is discarded so such calls do not raise ArgumentError.
    # Returns a Node, a String (when the backing returns one), or nil.
    def at(selector, *_extra)
      result = backing.at_css(selector)
      return nil if result.nil?
      return result if result.is_a?(String)

      Node.new(self, result)
    end
    alias at_css at

    # Evaluate an XPath expression against this document via
    # Scrapetor::XPath. See lib/scrapetor/xpath.rb for the supported
    # grammar and result types.
    def xpath(expr)
      Scrapetor::XPath.evaluate(self, expr)
    end

    # First result of #xpath (or the result itself when it is not an Array).
    def at_xpath(expr)
      result = xpath(expr)
      result.is_a?(Array) ? result.first : result
    end

    # Walk the tree, yielding each node. Objects that respond to
    # `element?` are wrapped in Node; anything else is yielded as-is.
    # A backing without #traverse yields nothing. Returns self, or an
    # Enumerator when no block is given.
    def traverse(&block)
      return enum_for(:traverse) unless block_given?

      bk = backing
      if bk.respond_to?(:traverse)
        bk.traverse { |n| yield(n.respond_to?(:element?) ? Node.new(self, n) : n) }
      end
      self
    end

    # The <html> element as a Node, or the whole backing when there is none.
    def root
      Node.new(self, backing.at_css("html") || backing)
    end

    def text
      backing.text
    end
    alias content text
    alias inner_text text

    # Text of the first <title>, or nil.
    def title
      node = backing.at_css("title")
      node && node.text
    end

    # The <body> element as a Node, or nil.
    def body
      node = backing.at_css("body")
      node && Node.new(self, node)
    end

    # The <head> element as a Node, or nil.
    def head
      node = backing.at_css("head")
      node && Node.new(self, node)
    end

    # Same lookup as #root.
    def html
      Node.new(self, backing.at_css("html") || backing)
    end

    def to_html
      backing.to_html
    end
    alias to_s to_html

    # Nokogiri-compat predicates.
    def errors
      []
    end

    def html?
      true
    end

    def xml?
      false
    end

    # Structured-data extractors — for SEO/RAG/structured-content pipelines.

    def json_ld
      Scrapetor::StructuredData.json_ld(self)
    end

    def opengraph
      Scrapetor::StructuredData.opengraph(self)
    end

    def twitter_card
      Scrapetor::StructuredData.twitter_card(self)
    end

    def schema_org(type: nil)
      Scrapetor::StructuredData.schema_org(self, type: type)
    end

    def microdata
      Scrapetor::Microdata.extract(self)
    end

    def rdfa
      Scrapetor::RDFa.extract(self)
    end

    def page_type
      Scrapetor::PageType.detect(self)
    end

    # Index and cache sizes. Index counts are 0 until the indexes are built.
    def stats
      {
        classes: @class_index ? @class_index.size : 0,
        ids: @id_index ? @id_index.size : 0,
        tags: @tag_index ? @tag_index.size : 0,
        selector_cache_size: @selector_cache.size,
        indexes_built: @indexes_built
      }
    end

    # The parsed tree, built on first use. Prefers the native engine
    # (reusing a pre-parsed handle when one was supplied) and falls
    # back to the pure-Ruby parser.
    def backing
      return @backing if @backing

      @backing =
        if defined?(Scrapetor::Native::DocumentWrapper) && Scrapetor::Native::AVAILABLE_DOM
          native = @prebuilt_native || Scrapetor::Native::Document.parse(@html_str)
          @prebuilt_native = nil
          Scrapetor::Native::DocumentWrapper.new(native)
        else
          Dom::Parser.parse(@html_str)
        end
      # Cache the hot-path slots so #css_single can skip the indirection.
      if defined?(Scrapetor::Native::DocumentWrapper) &&
         @backing.is_a?(Scrapetor::Native::DocumentWrapper)
        @native_doc = @backing.native
        @native_wrapper = @backing
        # NOTE(review): reads the wrapper's private @compile_cache and
        # assumes it is a Hash; confirm against DocumentWrapper.
        @plan_cache = @backing.instance_variable_get(:@compile_cache)
        @lazy_ids = Scrapetor::Native::DocumentWrapper::LazyIds
      end
      @backing
    end

    # Structural indexes, built on demand.

    # class name => Array of elements carrying it.
    def class_index
      build_indexes! unless @indexes_built
      @class_index
    end

    # id => first element with that id.
    def id_index
      build_indexes! unless @indexes_built
      @id_index
    end

    # tag name (Symbol) => Array of elements.
    def tag_index
      build_indexes! unless @indexes_built
      @tag_index
    end

    # Every element matched by `*`, in the order the backing returns them.
    def all_elements
      build_indexes! unless @indexes_built
      @all_elements
    end

    # Compile (memoised per selector) and execute a selector within +scope+.
    def run_selector(selector, scope)
      Selector.execute(self, cache_selector(selector), scope)
    end

    # Compiled plan for +selector+, memoised in the selector cache.
    def cache_selector(selector)
      @selector_cache[selector] ||= Selector.compile(selector)
    end

    def selector_cache_size
      @selector_cache.size
    end

    private

    # True when +selector+ ends in a recognised `::pseudo-element`.
    def pseudo_element?(selector)
      PSEUDO_ELEMENT_RE.match?(selector.to_s)
    end

    # True when +bk+ is a native wrapper that has not switched to dom_mode.
    def native_backed?(bk)
      defined?(Scrapetor::Native::DocumentWrapper) &&
        bk.is_a?(Scrapetor::Native::DocumentWrapper) && !bk.dom_mode?
    end

    # Hash form of #extract: `{ key => selector }` -> `{ key => match }`.
    # Uses the engine's extract_one_h on an unmutated native document
    # (a `true` return means "not handled"), otherwise one #at_css per key.
    def extract_map(map)
      bk = backing
      if native_backed?(bk)
        r = bk.native.extract_one_h(nil, map, bk)
        return r unless r.equal?(true)
      end
      map.each_with_object({}) { |(key, sel), out| out[key] = at_css(sel) }
    end

    # Try the native path. Returns the result Hash on success, nil if the
    # schema can't compile (caller falls back to Ruby).
    #
    # Schemas with both top-level fields AND repeated groups run two
    # native passes — the engine supports one active record at a time, so
    # a synthetic root for top-level fields can't co-exist with a
    # repeated group inside the same scan.
    def extract_via_native(schema)
      has_fields = schema.fields.any?
      has_groups = schema.groups.any?
      return nil unless has_fields || has_groups

      # Only groups, or only fields: one pass, no schema split.
      unless has_fields && has_groups
        desc = Native.compile_descriptor(schema)
        return nil unless desc

        raw = Native.extract(@html_str, desc, @base_url)
        return has_groups ? raw : synthetic_root_record(raw)
      end

      # Mixed: split the schema into two sub-descriptors and run
      # extract twice, then merge group results over the top-level
      # record.
      groups_desc = Native.split_descriptor(schema, :groups)
      return nil unless groups_desc

      result = Native.extract(@html_str, groups_desc, @base_url)

      fields_desc = Native.split_descriptor(schema, :fields)
      return nil unless fields_desc

      record = synthetic_root_record(Native.extract(@html_str, fields_desc, @base_url))
      record && record.merge(result)
    end

    # First record stored under the synthetic root key of a native
    # extract result, or nil when there is none.
    def synthetic_root_record(raw)
      records = raw[Native::SYNTHETIC_ROOT]
      return nil if !records.is_a?(Array) || records.empty?

      records[0]
    end

    # Populate the class / id / tag indexes and the flat element list
    # from a single `*` query. Idempotent.
    def build_indexes!
      return if @indexes_built

      @class_index = Hash.new { |h, k| h[k] = [] }
      @id_index = {}
      @tag_index = Hash.new { |h, k| h[k] = [] }
      @all_elements = backing.css("*").to_a
      @all_elements.each do |el|
        @tag_index[el.name.to_sym] << el
        id = el["id"]
        # First element wins when an id is duplicated.
        @id_index[id] ||= el if id
        cls = el["class"]
        next unless cls

        cls.split(/\s+/).each { |c| @class_index[c] << el unless c.empty? }
      end
      @indexes_built = true
    end
  end
end
|