scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# Bridge to the native streaming extraction engine.
|
|
5
|
+
#
|
|
6
|
+
# If the C extension is loadable, Native.available? is true and
|
|
7
|
+
# Native.compile_descriptor turns a Schema into the flat format the
|
|
8
|
+
# C side consumes. Schemas using features outside the native
|
|
9
|
+
# fast-path subset (combinators, pseudo-classes, nested repeated
|
|
10
|
+
# groups, top-level fields without a repeated context) compile to
|
|
11
|
+
# nil, and the Extractor falls back to the Ruby path.
|
|
12
|
+
module Native
|
|
13
|
+
# Attempt to load the compiled extension exactly once, while this module is
# being defined. A missing extension is not fatal: the LoadError is kept so
# callers can inspect why the native path is unavailable.
native_load_error =
  begin
    require "scrapetor/scrapetor_native"
    nil
  rescue LoadError => e
    e
  end

# True when the require above succeeded.
AVAILABLE = native_load_error.nil?
# The LoadError raised by the require, or nil when the extension loaded.
LOAD_ERROR = native_load_error

# @return [Boolean] whether the C extension was loaded successfully.
def self.available?
  AVAILABLE
end
|
|
25
|
+
|
|
26
|
+
# Compile a Schema into the descriptor format the C side consumes.
|
|
27
|
+
#
|
|
28
|
+
# desc = [group, group, ...]
|
|
29
|
+
# group = [name_sym, sel, fields_array]
|
|
30
|
+
# field = [name_sym, sel, attr_str_or_nil, type_sym, clean_bool,
|
|
31
|
+
# normalize_url_bool, multi_bool, context_sel_or_nil, combinator_str_or_nil]
|
|
32
|
+
# sel = [tag_or_nil, classes_array, id_or_nil, attrs_array]
|
|
33
|
+
# attrs_array = [[name_str, op_str_or_nil, val_str_or_nil], ...]
|
|
34
|
+
#
|
|
35
|
+
# Returns nil if the schema uses features the native path doesn't
|
|
36
|
+
# support yet.
|
|
37
|
+
# Group name given to the synthetic group that build_descriptor creates to
# carry a schema's top-level fields.
SYNTHETIC_ROOT = :__scrapetor_root__
|
|
38
|
+
# Selector tuple in [tag, classes, id, attrs] form that matches the <html>
# element; build_descriptor binds the synthetic root group to it.
HTML_ROOT_SEL = ["html", [], nil, []].freeze
|
|
39
|
+
|
|
40
|
+
# Compile +schema+ into the native descriptor, memoising the outcome on the
# schema instance so the descriptor Array tree is built at most once per
# schema instead of on every extract.
#
# Both outcomes are cached: a successful descriptor is stored as-is, while
# "cannot be compiled" is stored as +false+ so it can be told apart from
# "not computed yet" (+nil+).
#
# A frozen schema cannot carry the memo ivar, so it is recompiled on each
# call rather than raising FrozenError.
#
# @param schema the schema to compile (read via #fields / #groups by
#   build_descriptor)
# @return [Array, nil] the descriptor, or nil when the schema cannot be
#   compiled for the native engine
def self.compile_descriptor(schema)
  cached = schema.instance_variable_get(:@__scrapetor_native_desc)
  # A cached +false+ means "known to be uncompilable" and maps back to nil.
  return (cached || nil) unless cached.nil?

  desc = build_descriptor(schema)
  # Setting an ivar on a frozen object raises FrozenError; skip the memo.
  schema.instance_variable_set(:@__scrapetor_native_desc, desc || false) unless schema.frozen?
  desc
end
|
|
54
|
+
|
|
55
|
+
# Build the descriptor for one half of a "mixed" schema (top-level fields
# plus at least one repeated group). The C engine needs two passes for such
# schemas - one for the groups, one for the synthetic root holding the
# fields - so callers request each half separately and run the two
# extractions back-to-back.
#
# The result is memoised on the original schema under a per-kind ivar, with
# +false+ standing in for "cannot be compiled". A frozen schema cannot carry
# the memo, so it is recompiled on each call rather than raising FrozenError.
#
# @param schema the full schema; only #groups or #fields is copied
# @param kind [Symbol] :groups selects the groups half; any other value
#   selects the top-level fields half
# @return [Array, nil] descriptor for the requested half, or nil when it
#   cannot be compiled for the native engine
def self.split_descriptor(schema, kind)
  groups_half = (kind == :groups)
  ivar = groups_half ? :@__scrapetor_split_groups : :@__scrapetor_split_fields

  cached = schema.instance_variable_get(ivar)
  # A cached +false+ means "known to be uncompilable" and maps back to nil.
  return (cached || nil) unless cached.nil?

  # Copy just the requested half into a fresh schema and compile that.
  sub = Schema.new
  if groups_half
    schema.groups.each { |g| sub.groups << g }
  else
    schema.fields.each { |f| sub.fields << f }
  end

  desc = build_descriptor(sub)
  # Setting an ivar on a frozen object raises FrozenError; skip the memo.
  schema.instance_variable_set(ivar, desc || false) unless schema.frozen?
  desc
end
|
|
78
|
+
|
|
79
|
+
# Assemble the descriptor (an Array of compiled groups) for +schema+.
#
# Top-level fields are wrapped in one synthetic group bound to the <html>
# element; the Document layer unwraps that single result back into the top
# of the response hash. Fragments without <html> fall back to the Ruby path.
#
# @param schema object exposing #fields and #groups
# @return [Array, nil] compiled groups, or nil when any field or group cannot
#   be compiled or when there is nothing to compile
def self.build_descriptor(schema)
  descriptor = []

  if schema.fields.any?
    compiled_fields = schema.fields.map { |field| compile_field(field) }
    # A single uncompilable field sends the whole schema to the Ruby path.
    return nil if compiled_fields.include?(nil)

    descriptor.push([SYNTHETIC_ROOT, HTML_ROOT_SEL, compiled_fields])
  end

  schema.groups.each do |group|
    compiled_group = compile_group(group)
    return nil unless compiled_group

    descriptor.push(compiled_group)
  end

  descriptor.empty? ? nil : descriptor
end
|
|
101
|
+
|
|
102
|
+
# Compile one repeated group into its [name, sel, fields] descriptor entry.
#
# @param group object exposing #name, #selector, #fields and #groups
# @return [Array, nil] nil when the group selector is outside the simple
#   subset, when the group has nested groups (Ruby fallback), or when any of
#   its fields cannot be compiled
def self.compile_group(group)
  sel = parse_selector(group.selector)
  return nil unless sel && group.groups.empty?

  # Bail out of the whole method on the first field that fails to compile.
  compiled_fields = group.fields.map { |field| compile_field(field) || (return nil) }
  [group.name, sel, compiled_fields]
end
|
|
114
|
+
|
|
115
|
+
# Compile one field into the flat tuple handed to the native engine:
#
#   [name, sel, attr_str, type, clean, normalize_url, multi, context, combinator]
#
# +context+ and +combinator+ are nil for a simple selector; for a selector
# with a single combinator, +sel+ is the right-hand part, +context+ the
# left-hand part and +combinator+ the string produced by
# parse_selector_with_combinator.
#
# @param field object exposing the accessors read below
# @return [Array, nil] nil when the field uses a feature the native engine
#   does not yet support, or its selector cannot be parsed
def self.compile_field(field)
  # Unsupported features - fall back to Ruby.
  return nil if field.selector.is_a?(Array) || field.transform
  return nil if !field.default.nil? || field.required
  return nil if %i[html list json boolean].include?(field.type)

  # Simple selector first; only then try the single-combinator form.
  primary = parse_selector(field.selector)
  context = nil
  combinator = nil
  unless primary
    primary, combinator, context = parse_selector_with_combinator(field.selector)
    return nil unless primary
  end

  [
    field.name, primary, field.attr_str, field.type,
    !!field.clean, !!field.normalize_url, !!field.multi,
    context, combinator
  ]
end
|
|
142
|
+
|
|
143
|
+
# Parse a CSS selector containing exactly one combinator (`A B` or `A > B`).
#
# @param selector [#to_s] selector text; surrounding whitespace is ignored
# @return [Array, nil] [primary_sel, combinator_str, context_sel], where
#   primary_sel is the parsed right-hand side and context_sel the parsed
#   left-hand side; nil when there is no single top-level combinator or when
#   either side is not a supported simple selector
def self.parse_selector_with_combinator(selector)
  text = selector.to_s.strip
  return nil if text.empty?

  # Split on the first combinator at top level (outside [...] groups).
  parts = split_at_combinator(text)
  return nil unless parts

  context_src, combinator, primary_src = parts
  context = parse_selector(context_src)
  primary = parse_selector(primary_src)

  context && primary ? [primary, combinator, context] : nil
end
|
|
161
|
+
|
|
162
|
+
# Split +s+ at its first top-level combinator (outside [...] brackets).
#
# A `>` yields a "child" split; a run of whitespace that is not merely
# padding around `>`, `+`, `~`, `,` (or `<`) yields a "descendant" split.
#
# @param s [String] selector text
# @return [Array, nil] [left, "child" | "descendant", right], or nil when
#   there is no usable combinator, one side of `>` is empty, or either side
#   still contains a combinator
def self.split_at_combinator(s)
  bracket_depth = 0

  s.each_char.with_index do |char, pos|
    case char
    when "["
      bracket_depth += 1
    when "]"
      bracket_depth -= 1 if bracket_depth.positive?
    when ">"
      next unless bracket_depth.zero?

      head = s[0...pos].strip
      tail = s[(pos + 1)..].strip
      return nil if head.empty? || tail.empty?
      # Reject if there are further combinators in either half.
      return nil if has_combinator?(head) || has_combinator?(tail)

      return [head, "child", tail]
    when " ", "\t", "\n"
      next unless bracket_depth.zero?

      head = s[0...pos].strip
      tail = s[(pos + 1)..].lstrip
      next if tail.empty? || head.empty?
      # Whitespace that only pads an explicit combinator is not a descendant
      # combinator; that character is examined on its own iteration.
      next if "<>+~,".include?(tail[0])
      return nil if has_combinator?(head) || has_combinator?(tail)

      return [head, "descendant", tail]
    end
  end

  nil
end
|
|
196
|
+
|
|
197
|
+
# Report whether +s+ contains combinator syntax outside attribute brackets.
# Space, tab, newline, `>`, `+` and `~` count; anything between `[` and `]`
# is skipped. An unmatched `]` never drives the depth below zero.
#
# @param s [String] selector text
# @return [Boolean]
def self.has_combinator?(s)
  bracket_depth = 0

  s.each_char do |char|
    case char
    when "["
      bracket_depth += 1
    when "]"
      bracket_depth -= 1 if bracket_depth.positive?
    when " ", "\t", "\n", ">", "+", "~"
      return true if bracket_depth.zero?
    end
  end

  false
end
|
|
208
|
+
|
|
209
|
+
# Parse a simple (compound) CSS selector into the [tag, classes, id, attrs]
# form that the C engine accepts. Returns nil if the selector uses
# combinators, `*`, selector lists or pseudo-classes (those force the Ruby
# fallback), if it has more than one id, or more than 8 classes or 8
# attribute tests.
#
# Supported:
#   tag                 div
#   .class              .product-card
#   tag.class.other     span.price.big
#   #id                 #main
#   tag#id              article#main
#   [attr]              [data-sku]
#   [attr=val]          [data-sku="A1"]
#   [attr*=val]         [class*=card]
#   [attr^=val]         [href^=https]
#   [attr$=val]         [href$=.pdf]
#   [attr~=val]         [class~=primary]
#   [attr|=val]         [lang|=en]
#   ... and combinations
#
# @param selector [#to_s, nil] selector text
# @return [Array, nil] [tag_or_nil, classes, id_or_nil, attrs], where attrs
#   is an Array of [name, operator_or_nil, value_or_nil]; the tag is
#   lower-cased, everything else is kept verbatim
def self.parse_selector(selector)
  return nil unless selector

  text = selector.to_s.strip
  return nil if text.empty?

  # Look for combinators / unsupported syntax outside [...] brackets only,
  # since `*` and `~` are valid inside attribute operators.
  return nil if text.gsub(/\[[^\]]*\]/, "").match?(/[\s>+~,*]/)

  # Mirrors Scrapetor::Selector::ATTR_RE: quote-style-aware value capture, so
  # a value such as "L'appareil" survives its embedded apostrophe.
  attr_pattern = /
    \A\[
    ([\w:\-\u{0080}-\u{10FFFF}]+)
    (?:
      ([*^$~|]?=)
      (?:
        "((?:[^"\\]|\\.)*)"
        | '((?:[^'\\]|\\.)*)'
        | ([^\]\s]+)
      )
    )?
    \]
  /x

  tag = nil
  id = nil
  classes = []
  attrs = []
  pos = 0

  if (leading = text.match(/\A([a-zA-Z][\w-]*)/))
    tag = leading[1].downcase
    pos = leading[0].length
  end

  # Consume one .class / #id / [attr] component per iteration; each pattern
  # can only match a component that starts with its own sigil.
  while pos < text.length
    rest = text[pos..]
    if (hit = rest.match(/\A\.([\w-]+)/))
      classes << hit[1]
    elsif (hit = rest.match(/\A#([\w-]+)/))
      return nil if id # only one id allowed

      id = hit[1]
    elsif (hit = rest.match(attr_pattern))
      attrs << [hit[1], hit[2], (hit[3] || hit[4] || hit[5])]
    else
      return nil
    end
    pos += hit[0].length
  end

  return nil if tag.nil? && id.nil? && classes.empty? && attrs.empty?
  return nil if classes.size > 8 || attrs.size > 8

  [tag, classes, id, attrs]
end
|
|
290
|
+
end
|
|
291
|
+
end
|