scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,291 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Bridge to the native streaming extraction engine.
  #
  # If the C extension is loadable, Native.available? is true and
  # Native.compile_descriptor turns a Schema into the flat Array format
  # the C side consumes. Schemas using features outside the native
  # fast-path subset compile to nil, and the Extractor falls back to the
  # Ruby path. Rejected here:
  #
  #   * pseudo-classes, selector lists (`,`), the universal selector and
  #     sibling combinators (`+`, `~`)
  #   * any combinator in a group selector, or more than one combinator
  #     (`A B` / `A > B`) in a field selector
  #   * nested repeated groups
  #   * fields with an Array selector, a transform, a non-nil default,
  #     the required flag, or a type of html / list / json / boolean
  #
  # Top-level fields ARE supported: they are wrapped in a synthetic group
  # bound to the <html> element (see build_descriptor).
  module Native
    begin
      require "scrapetor/scrapetor_native"
      AVAILABLE = true
      LOAD_ERROR = nil
    rescue LoadError => e
      # Keep the failure around so callers can report why the fast path
      # is disabled instead of failing at require time.
      AVAILABLE = false
      LOAD_ERROR = e
    end

    # Name of the synthetic group that carries top-level fields.
    SYNTHETIC_ROOT = :__scrapetor_root__
    # Selector (in descriptor form) the synthetic group is bound to.
    HTML_ROOT_SEL = ["html", [], nil, []].freeze

    # Field types that always take the Ruby path.
    UNSUPPORTED_FIELD_TYPES = %i[html list json boolean].freeze

    # Characters that count as a combinator when seen outside [...].
    COMBINATOR_CHARS = [" ", "\t", "\n", ">", "+", "~"].freeze

    # One attribute selector, anchored at the start of the input.
    # Mirrors Scrapetor::Selector::ATTR_RE — same quote-style-aware value
    # extraction so an attribute like `[class*="L'appareil"]` parses
    # without choking on the embedded apostrophe.
    # Captures: 1 = name, 2 = operator, 3 / 4 / 5 = double-quoted /
    # single-quoted / bare value.
    ATTR_SELECTOR_RE = /
      \A\[
      ([\w:\-\u{0080}-\u{10FFFF}]+)
      (?:
        ([*^$~|]?=)
        (?:
          "((?:[^"\\]|\\.)*)"
          | '((?:[^'\\]|\\.)*)'
          | ([^\]\s]+)
        )
      )?
      \]
    /x

    # @return [Boolean] true when the C extension was loaded.
    def self.available?
      AVAILABLE
    end

    # Compile a Schema into the descriptor format the C side consumes.
    #
    #   desc  = [group, group, ...]
    #   group = [name_sym, sel, fields_array]
    #   field = [name_sym, sel, attr_str_or_nil, type_sym, clean_bool,
    #            normalize_url_bool, multi_bool,
    #            context_sel_or_nil, combinator_str_or_nil]
    #   sel   = [tag_or_nil, classes_array, id_or_nil, attrs_array]
    #   attrs_array    = [[name_str, op_str_or_nil, val_str_or_nil], ...]
    #   combinator_str = "child" | "descendant"
    #
    # Memoised on the Schema instance so the Array tree is not rebuilt on
    # every extract. Both successful descriptors and the "can't compile"
    # outcome are cached. A frozen schema is compiled on every call
    # rather than raising FrozenError.
    #
    # @param schema [Schema] object responding to #fields and #groups
    # @return [Array, nil] the descriptor, or nil if the schema uses
    #   features the native path doesn't support yet
    def self.compile_descriptor(schema)
      memoized_descriptor(schema, :@__scrapetor_native_desc) { build_descriptor(schema) }
    end

    # For "mixed" schemas (top-level fields + at least one repeated
    # group) the C engine needs two passes — one for the groups, one for
    # the synthetic root holding the fields. This compiles one half of
    # the schema and memoises it on the original Schema instance, so
    # callers can run the two extractions back-to-back.
    #
    # @param schema [Schema]
    # @param kind [Symbol] :groups for the repeated groups; any other
    #   value selects the top-level fields
    # @return [Array, nil] descriptor for the requested half, or nil
    def self.split_descriptor(schema, kind)
      groups_only = (kind == :groups)
      ivar = groups_only ? :@__scrapetor_split_groups : :@__scrapetor_split_fields

      memoized_descriptor(schema, ivar) do
        sub = Schema.new
        if groups_only
          schema.groups.each { |g| sub.groups << g }
        else
          schema.fields.each { |f| sub.fields << f }
        end
        build_descriptor(sub)
      end
    end

    # Uncached compilation of a whole schema.
    #
    # Top-level fields become a synthetic group bound to the <html>
    # element. The Document layer unwraps the single result back into
    # the top of the response hash. Fragments without <html> fall back
    # to the Ruby path.
    #
    # @return [Array, nil] nil when any field/group is unsupported or the
    #   schema is empty
    def self.build_descriptor(schema)
      groups = []

      if schema.fields.any?
        field_descs = schema.fields.map { |f| compile_field(f) }
        return nil if field_descs.any?(&:nil?)

        groups << [SYNTHETIC_ROOT, HTML_ROOT_SEL, field_descs]
      end

      schema.groups.each do |g|
        compiled = compile_group(g)
        return nil unless compiled

        groups << compiled
      end

      groups.empty? ? nil : groups
    end

    # Compile one repeated group into [name, sel, fields].
    #
    # @return [Array, nil] nil when the group selector is not a simple
    #   selector, the group has nested groups, or any field is unsupported
    def self.compile_group(group)
      sel = parse_selector(group.selector)
      return nil unless sel
      return nil unless group.groups.empty? # nested groups: Ruby fallback

      fields = []
      group.fields.each do |f|
        compiled = compile_field(f)
        return nil unless compiled

        fields << compiled
      end

      [group.name, sel, fields]
    end

    # Compile one field into the 9-slot field descriptor (see
    # compile_descriptor for the layout).
    #
    # @return [Array, nil] nil when the field uses a feature the native
    #   engine doesn't yet support
    def self.compile_field(field)
      return nil if field.selector.is_a?(Array)
      return nil if field.transform
      return nil unless field.default.nil?
      return nil if field.required
      return nil if UNSUPPORTED_FIELD_TYPES.include?(field.type)

      # A simple selector has no context; otherwise allow exactly one
      # combinator, whose left-hand side becomes the context selector.
      primary = parse_selector(field.selector)
      context = nil
      combinator = nil
      unless primary
        combo = parse_selector_with_combinator(field.selector)
        return nil unless combo

        primary, combinator, context = combo
      end

      [field.name, primary, field.attr_str, field.type,
       !!field.clean, !!field.normalize_url, !!field.multi,
       context, combinator]
    end

    # Parse a CSS selector with exactly one combinator (`A B` or `A > B`).
    #
    # @return [Array, nil] [primary_sel, combinator_str, context_sel]
    #   where primary is the right-hand side (the element matched) and
    #   context the left-hand side; nil if the input has no combinator,
    #   multiple combinators or other unsupported syntax
    def self.parse_selector_with_combinator(selector)
      s = selector.to_s.strip
      return nil if s.empty?

      left_str, combinator, right_str = split_at_combinator(s)
      return nil unless combinator

      context = parse_selector(left_str)
      primary = parse_selector(right_str)
      return nil unless context && primary

      [primary, combinator, context]
    end

    # Split on the first combinator at top level (outside [...] groups).
    #
    # @param s [String] selector text; callers pass it already stripped
    # @return [Array, nil] [left_str, "child" | "descendant", right_str],
    #   or nil when there is no usable split point or either half holds
    #   a further combinator
    def self.split_at_combinator(s)
      depth = 0

      s.each_char.with_index do |ch, i|
        case ch
        when "["
          depth += 1
        when "]"
          depth -= 1 if depth.positive?
        when ">"
          next unless depth.zero?

          left = s[0...i].strip
          right = s[(i + 1)..].strip
          return nil if left.empty? || right.empty?
          # Reject if there are further combinators in either half.
          return nil if has_combinator?(left) || has_combinator?(right)

          return [left, "child", right]
        when " ", "\t", "\n"
          next unless depth.zero?

          left = s[0...i].strip
          right = s[(i + 1)..].lstrip
          next if right.empty? || left.empty?
          # Whitespace that merely pads an explicit combinator is not a
          # descendant combinator; `>` is picked up on its own iteration.
          next if "<>+~,".include?(right[0])
          return nil if has_combinator?(left) || has_combinator?(right)

          return [left, "descendant", right]
        end
      end

      nil
    end

    # @return [Boolean] true if +s+ contains whitespace, `>`, `+` or `~`
    #   outside [...] brackets
    def self.has_combinator?(s)
      depth = 0
      s.each_char do |ch|
        case ch
        when "[" then depth += 1
        when "]" then depth -= 1 if depth.positive?
        else
          return true if depth.zero? && COMBINATOR_CHARS.include?(ch)
        end
      end
      false
    end

    # Parse a simple CSS selector into the [tag, classes, id, attrs] form
    # that the C engine accepts. Returns nil if the selector uses
    # combinators or pseudo-classes (those force the Ruby fallback).
    #
    # Supported:
    #   tag                div
    #   .class             .product-card
    #   tag.class.other    span.price.big
    #   #id                #main
    #   tag#id             article#main
    #   [attr]             [data-sku]
    #   [attr=val]         [data-sku="A1"]
    #   [attr*=val]        [class*=card]
    #   [attr^=val]        [href^=https]
    #   [attr$=val]        [href$=.pdf]
    #   [attr~=val]        [class~=primary]
    #   [attr|=val]        [lang|=en]
    #   ... and combinations
    #
    # The tag name is lower-cased; class, id and attribute text is passed
    # through as written (quotes removed, escapes left untouched).
    #
    # @param selector [#to_s, nil]
    # @return [Array, nil]
    def self.parse_selector(selector)
      return nil unless selector

      s = selector.to_s.strip
      return nil if s.empty?

      # Check for combinators / unsupported syntax outside [...] brackets,
      # since `*` and `~` are valid inside attribute operators.
      outside = s.gsub(/\[[^\]]*\]/, "")
      return nil if outside.match?(/[\s>+~,*]/)

      tag = nil
      classes = []
      id = nil
      attrs = []
      i = 0

      if (m = s.match(/\A([a-zA-Z][\w-]*)/))
        tag = m[1].downcase
        i = m[0].length
      end

      while i < s.length
        rest = s[i..]
        case s[i]
        when "."
          m = rest.match(/\A\.([\w-]+)/)
          return nil unless m

          classes << m[1]
        when "#"
          m = rest.match(/\A#([\w-]+)/)
          return nil unless m
          return nil if id # only one id allowed

          id = m[1]
        when "["
          m = rest.match(ATTR_SELECTOR_RE)
          return nil unless m

          attrs << [m[1], m[2], m[3] || m[4] || m[5]]
        else
          return nil
        end
        i += m[0].length
      end

      return nil if tag.nil? && classes.empty? && id.nil? && attrs.empty?
      # NOTE(review): the cap of 8 presumably matches a fixed-size limit
      # in the C engine — confirm against the extension source.
      return nil if classes.size > 8 || attrs.size > 8

      [tag, classes, id, attrs]
    end

    # Shared memoisation for compile_descriptor / split_descriptor.
    # The cache slot is an instance variable on the schema: nil means
    # "not computed yet", false means "computed, not compilable".
    # A frozen schema cannot take the cache write, so it is skipped.
    def self.memoized_descriptor(schema, ivar)
      cached = schema.instance_variable_get(ivar)
      return (cached || nil) unless cached.nil? # false → nil

      desc = yield
      schema.instance_variable_set(ivar, desc.nil? ? false : desc) unless schema.frozen?
      desc
    end
    private_class_method :memoized_descriptor
  end
end