scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
module Dom
|
|
5
|
+
# Build a Dom::Document from raw HTML via the SAX tokenizer.
|
|
6
|
+
module Parser
|
|
7
|
+
# Frozen lookup hash (tag name => true) built from Scrapetor::Dom::VOID.
# Elements whose name is found here are attached to the tree but are not
# pushed onto the open-element stack by the parse loops.
VOID_TAGS = Scrapetor::Dom::VOID.to_h { |t| [t, true] }.freeze
|
|
8
|
+
|
|
9
|
+
# Build a Dom::Document from +html+.
#
# Tokenizer events are folded into a tree using a stack of currently open
# nodes whose bottom entry is the document itself:
#   :doctype  -> stored on the document
#   :start    -> new Element appended to the innermost open node and, unless
#                its name is in VOID_TAGS, opened (pushed)
#   :end      -> closes the nearest open Element with that name together
#                with everything opened after it; ignored when none matches
#   :text / :comment -> appended to the innermost open node
# Any other event (including :doc_start / :doc_end) is ignored.
#
# Returns the populated document.
def self.parse(html)
  doc = Dom::Document.new
  open_nodes = [doc]
  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, *payload = event
    case kind
    when :doctype
      doc.doctype = payload.first
    when :start
      tag, attributes = payload
      el = Element.new(tag, attributes || {})
      open_nodes.last.add_child(el)
      open_nodes << el unless VOID_TAGS[el.name]
    when :end
      closing = payload.first
      # Search from the innermost open node outwards.
      at = open_nodes.rindex { |n| n.is_a?(Element) && n.name == closing }
      open_nodes.slice!(at..) unless at.nil?
    when :text
      open_nodes.last.add_child(Text.new(payload.first))
    when :comment
      open_nodes.last.add_child(Comment.new(payload.first))
    end
  end
  doc
end
|
|
40
|
+
|
|
41
|
+
# Parse a fragment — return an Array of nodes (no Document wrapper).
|
|
42
|
+
# Parse +html+ as a fragment and return its top-level nodes.
#
# Nodes are collected under a throw-away "__fragment__" wrapper element
# using the same open-node stack technique as the document parser. The
# wrapper sits at the bottom of the stack and is never closed, even by an
# end tag carrying its name. Events other than :start, :end, :text and
# :comment are ignored.
#
# Returns the wrapper's children after clearing each one's parent link.
def self.fragment(html)
  wrapper = Element.new("__fragment__")
  open_nodes = [wrapper]
  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, *payload = event
    case kind
    when :start
      tag, attributes = payload
      el = Element.new(tag, attributes || {})
      open_nodes.last.add_child(el)
      open_nodes << el unless VOID_TAGS[el.name]
    when :end
      closing = payload.first
      at = open_nodes.rindex { |n| n.is_a?(Element) && n.name == closing }
      # Index 0 is the wrapper itself; it must stay open.
      open_nodes.slice!(at..) if at&.positive?
    when :text
      open_nodes.last.add_child(Text.new(payload.first))
    when :comment
      open_nodes.last.add_child(Comment.new(payload.first))
    end
  end
  wrapper.children.tap do |top_level|
    top_level.each { |n| n.parent = nil }
  end
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
module Dom
|
|
5
|
+
# CSS selector engine over the pure-Ruby DOM.
|
|
6
|
+
#
|
|
7
|
+
# Pipeline:
|
|
8
|
+
# 1. Compile the selector string into a list of "atoms" with
|
|
9
|
+
# combinators (reuses `Scrapetor::Selector.compile`).
|
|
10
|
+
# 2. Find candidates matching the rightmost atom. When the scope
# belongs to a Document and the atom names an id, class or tag,
# the candidates are seeded from the document's id / class / tag
# indexes; otherwise the scope's subtree is walked once.
|
|
14
|
+
# 3. For each candidate, walk ancestors right-to-left to verify
|
|
15
|
+
# the rest of the chain.
|
|
16
|
+
#
|
|
17
|
+
# Atom matching delegates to `Scrapetor::Selector.atom_matches?`, so
|
|
18
|
+
# pseudo-class support (`:has`, `:not`, `:is`, `:nth-child`, etc.)
|
|
19
|
+
# lives in one place.
|
|
20
|
+
module Selectors
|
|
21
|
+
# Evaluate +selector_str+ against +scope+.
#
# The selector is split into its comma-separated groups; each group is
# compiled and executed on its own. Matches are concatenated in group order
# and a node produced by more than one group is kept only the first time it
# is seen (identity is tracked through object_id). Groups whose compiled
# plan is empty are skipped.
#
# Returns an Array of matching nodes.
def self.css(scope, selector_str)
  visited = {}
  selector_groups(selector_str).each_with_object([]) do |group, matches|
    plan = compile(group)
    next if plan.empty?

    execute(scope, plan).each do |node|
      key = node.object_id
      unless visited[key]
        visited[key] = true
        matches << node
      end
    end
  end
end
|
|
36
|
+
|
|
37
|
+
# Cached comma-splitter. Frozen-literal selector strings hit
|
|
38
|
+
# the cache 100% of the time after first call, so a fallback
|
|
39
|
+
# loop that re-runs the same selector pays the per-char scan
|
|
40
|
+
# once across the whole iteration.
|
|
41
|
+
# Selector string => frozen Array of its comma-separated groups.
GROUPS_CACHE = {}
# Maximum number of cached entries; the oldest entries are evicted first.
GROUPS_CACHE_CAP = 1024

# Split a selector list on its top-level commas.
#
# A comma separates groups only when it sits outside every `[...]`,
# `(...)` and quoted string. Quoted strings are honoured so that bracket or
# paren characters inside an attribute value or function argument, e.g.
# `[title="x]y,z"]` or `[title="("], b`, cannot unbalance the depth
# counters and cause a wrong split.
#
# @param s [String] selector list, e.g. "a, b > c"
# @return [Array<String>] frozen array of frozen, stripped, non-empty
#   group strings; the same object is returned on later calls with an
#   equal string while it remains cached
def self.selector_groups(s)
  cached = GROUPS_CACHE[s]
  return cached if cached

  # A quote that is never closed cannot be a real string (think of an
  # apostrophe in an unquoted argument); rescan without quote handling so
  # such input splits exactly as it did before quotes were recognised.
  groups = split_selector_groups(s, quote_aware: true) ||
           split_selector_groups(s, quote_aware: false)
  out = groups.reject(&:empty?).each(&:freeze).freeze
  GROUPS_CACHE.shift while GROUPS_CACHE.size >= GROUPS_CACHE_CAP
  GROUPS_CACHE[s] = out
end

# Internal single-pass scanner behind selector_groups.
#
# With quote_aware set, a `"` or `'` opens a string only when it appears
# inside brackets or parens and directly follows `=`, `(` or `,` (ignoring
# whitespace), so an apostrophe in the middle of a word is left alone.
# Inside a string a backslash escapes the next character.
#
# @return [Array<String>, nil] stripped groups (possibly empty strings), or
#   nil when the input ends inside an open string
def self.split_selector_groups(s, quote_aware:)
  bracket_depth = 0
  paren_depth = 0
  quote = nil      # the opening quote character while inside a string
  escaped = false  # previous character was a backslash inside a string
  prev = nil       # last non-whitespace character appended to buf
  groups = []
  buf = +""
  s.each_char do |ch|
    if quote
      buf << ch
      if escaped
        escaped = false
      elsif ch == "\\"
        escaped = true
      elsif ch == quote
        quote = nil
      end
      prev = ch
      next
    end

    if ch == "," && bracket_depth.zero? && paren_depth.zero?
      groups << buf.strip
      buf = +""
      prev = nil
      next
    end

    case ch
    when "["
      bracket_depth += 1
    when "]"
      bracket_depth -= 1 if bracket_depth.positive?
    when "("
      paren_depth += 1
    when ")"
      paren_depth -= 1 if paren_depth.positive?
    when '"', "'"
      nested = bracket_depth.positive? || paren_depth.positive?
      quote = ch if quote_aware && nested && prev && "=(,".include?(prev)
    end
    buf << ch
    prev = ch unless ch.match?(/\s/)
  end
  return nil if quote

  groups << buf.strip
  groups
end
|
|
72
|
+
|
|
73
|
+
# Cache compiled plans by selector string so a dom-mode document
|
|
74
|
+
# that re-runs the same selector dozens of times in a fallback
|
|
75
|
+
# loop only pays the parse cost once. Selector strings tend to
|
|
76
|
+
# come from frozen literals in parser code, so the cache hit
|
|
77
|
+
# rate is effectively 100%.
|
|
78
|
+
# Selector string => compiled plan.
DOM_COMPILE_CACHE = {}
# Maximum number of cached plans; the oldest entries are evicted first.
DOM_COMPILE_CACHE_CAP = 1024

# Return the compiled plan for +selector+, compiling it through
# Scrapetor::Selector.compile only when no plan is cached for that string.
# Before a new plan is stored, the oldest entries are dropped until the
# cache is below its cap.
def self.compile(selector)
  plan = DOM_COMPILE_CACHE[selector]
  unless plan
    plan = Scrapetor::Selector.compile(selector)
    DOM_COMPILE_CACHE.shift until DOM_COMPILE_CACHE.size < DOM_COMPILE_CACHE_CAP
    DOM_COMPILE_CACHE[selector] = plan
  end
  plan
end
|
|
88
|
+
|
|
89
|
+
# Run a compiled +plan+ (a list of atoms) against +scope+.
#
# Candidates are gathered for the last atom of the plan. For a one-atom
# plan they are the result; otherwise each candidate is kept only if the
# remaining atoms can be matched walking backwards from it.
#
# Returns an Array of nodes; empty when the plan is empty.
def self.execute(scope, plan)
  return [] if plan.empty?

  tail = plan.size - 1
  matches = candidates_for_atom(scope, plan[tail])
  if plan.size > 1
    matches = matches.select do |node|
      match_chain_backwards?(node, plan, tail - 1, scope)
    end
  end
  matches
end
|
|
96
|
+
|
|
97
|
+
# Collect the nodes under +scope+ that match +atom+.
#
# When the scope's root (see atom_document) is a Document and the atom has
# a narrowing anchor, candidates are seeded from the document's indexes
# instead of walking the tree, in this order of preference:
#   1. id      -> the single node in id_index
#   2. classes -> the smallest class_index entry among the atom's classes
#   3. tag     -> the tag_index entry
# Seeds are then filtered to nodes inside the scope that match the atom.
# Without a Document root or an anchor, the scope's subtree is walked.
#
# The scope node itself is never a candidate: the subtree walk only visits
# descendants, so the index-seeded paths exclude it as well. This keeps the
# result from depending on which path the atom happened to take.
#
# @return [Array] matching nodes (a new array on every call)
def self.candidates_for_atom(scope, atom)
  doc = atom_document(scope)
  seed = nil
  if doc.is_a?(Document)
    if atom.id
      hit = doc.id_index[atom.id]
      seed = hit.nil? ? [] : [hit]
    elsif atom.classes && !atom.classes.empty?
      # Pick the narrowest class index entry as the candidate seed.
      sets = atom.classes.map { |c| doc.class_index[c] || [] }
      seed = sets.min_by(&:size) || []
    elsif atom.tag
      seed = doc.tag_index[atom.tag.to_s] || []
    end
  end

  if seed
    return seed.select do |node|
      !node.equal?(scope) && in_scope?(node, scope) &&
        Scrapetor::Selector.atom_matches?(atom, node)
    end
  end

  result = []
  walk_descendants(scope) do |node|
    result << node if Scrapetor::Selector.atom_matches?(atom, node)
  end
  result
end
|
|
128
|
+
|
|
129
|
+
# Find the root that +scope+ hangs off.
#
# A Document is returned as is. Otherwise parent links are followed for as
# long as the current object responds to #parent and has a parent, and the
# last object reached is returned. That object is not necessarily a
# Document, so callers check its type.
def self.atom_document(scope)
  return scope if scope.is_a?(Document)

  node = scope
  loop do
    break unless node && node.respond_to?(:parent)

    up = node.parent
    break unless up

    node = up
  end
  node
end
|
|
135
|
+
|
|
136
|
+
# Call +block+ with every element below +scope+ in document (pre-order)
# order. The scope itself is not yielded.
#
# Only Document and Element scopes have their children read; any other
# scope yields nothing. Children answering false to #element? are skipped
# together with their subtrees.
#
# The traversal uses an explicit stack rather than recursion, so the depth
# of the tree is not limited by the Ruby call stack and deeply nested
# markup cannot raise SystemStackError.
#
# @return [Array] the scope's direct children, or [] for other scopes
def self.walk_descendants(scope, &block)
  container = ->(n) { n.is_a?(Document) || n.is_a?(Element) }
  children = container.call(scope) ? scope.children : []

  # Pushing in reverse makes pop hand nodes back in their original order.
  pending = children.reverse
  until pending.empty?
    node = pending.pop
    next unless node.element?

    block.call(node)
    next unless container.call(node)

    node.children.reverse_each { |child| pending.push(child) }
  end
  children
end
|
|
150
|
+
|
|
151
|
+
# Local shorthand that forwards to Scrapetor::Selector.atom_matches? and
# returns whatever that call returns.
def self.atom_matches?(atom, node)
  matcher = Scrapetor::Selector
  matcher.atom_matches?(atom, node)
end
|
|
154
|
+
|
|
155
|
+
# Check that the atoms plan[0..idx] can be matched walking backwards from
# +node+, which has already matched plan[idx + 1].
#
# The combinator stored on plan[idx + 1] selects which relatives of +node+
# are tried against plan[idx]:
#   :child             -> the parent only
#   :descendant / nil  -> every Element ancestor, nearest first
#   :adj               -> the previous element sibling only
#   :gen               -> every previous element sibling, nearest first
# A relative is accepted when it is an Element, lies inside +scope+,
# matches the atom, and the rest of the chain matches from it. Any other
# combinator never matches.
#
# Returns true or false; true once idx has run below zero.
def self.match_chain_backwards?(node, plan, idx, scope)
  return true if idx.negative?

  atom = plan[idx]
  combinator = plan[idx + 1].combinator
  step =
    case combinator
    when :child, :descendant, nil then :parent
    when :adj, :gen then :previous_element_sibling
    end
  return false unless step

  # :child and :adj look at exactly one relative; the others keep walking.
  keep_walking = combinator != :child && combinator != :adj

  candidate = node.public_send(step)
  while candidate.is_a?(Element)
    accepted = in_scope?(candidate, scope) && atom_matches?(atom, candidate) &&
               match_chain_backwards?(candidate, plan, idx - 1, scope)
    return true if accepted
    return false unless keep_walking

    candidate = candidate.public_send(step)
  end
  false
end
|
|
196
|
+
|
|
197
|
+
# Tell whether +node+ lies inside +scope+.
#
# Every node is inside a Document scope. For any other scope the answer is
# true when +node+ is the scope itself or has it among its ancestors.
#
# The upward walk stops at the first object that does not respond to
# #parent, mirroring the guard atom_document uses, so reaching such a root
# ends the search with false instead of raising NoMethodError.
#
# @return [Boolean]
def self.in_scope?(node, scope)
  return true if scope.is_a?(Document)

  cur = node
  while cur
    return true if cur.equal?(scope)

    cur = cur.respond_to?(:parent) ? cur.parent : nil
  end
  false
end
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
end
|