scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,539 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "date"
4
+
5
+ module Scrapetor
6
# Featherweight node wrapper. Holds a document reference and a backing
# Nokolexbor element. Selector ops delegate to the backing engine in
# Phase 1; the native extension (Phase 2) replaces this with arena-DOM
# + bytecode VM.
#
# Every public method either delegates to the backing node (`@nlx`) or
# wraps what the backing node returns in Node / NodeSet so that callers
# never handle raw engine objects unless they ask for #backing_node.
class Node
  # @param doc [Object] owning document wrapper; this class calls
  #   `base_url`, `css` and `root` on it
  # @param backing [Object] the engine node being wrapped
  def initialize(doc, backing)
    @doc = doc
    @nlx = backing
  end

  # ----- Content -----

  # @return [String] text content as reported by the backing node
  def text
    @nlx.text
  end

  # Nokogiri-compat: `content` and `inner_text` are aliases for `text`.
  alias content text
  alias inner_text text

  # @return [Object] #text passed through Cleaner.clean
  def clean_text
    Cleaner.clean(text)
  end

  # Cleaned text with <script>, <style> and <noscript> subtrees removed.
  # Works on a `dup` of the backing node so the live tree is not touched.
  def visible_text
    copy = @nlx.dup
    copy.css("script, style, noscript").each(&:remove) if copy.respond_to?(:css)
    Cleaner.clean(copy.text)
  end

  # @return [String] serialized children of this node
  def inner_html
    @nlx.inner_html
  end

  # Serialized HTML of this node.
  #
  # Accepts and forwards optional serializer arguments. #write_to and
  # #serialize pass their extra arguments here, so a zero-arity
  # definition made them raise ArgumentError as soon as an option was
  # supplied.
  #
  # @param args [Array] passed straight to the backing node's `to_html`
  # @return [String]
  def outer_html(*args)
    @nlx.to_html(*args)
  end
  alias to_html outer_html

  # @return [String] node name as reported by the backing node
  def name
    @nlx.name
  end
  alias node_name name
  alias tag_name name

  # ----- Attributes (read) -----

  # Nokogiri-compat: return all element attributes as a Hash
  # (attribute name => attribute value).
  def attributes
    @nlx.attribute_nodes.each_with_object({}) { |a, h| h[a.name] = a.value }
  end

  # @return [Array] attribute names, in backing-node order
  def keys
    @nlx.attribute_nodes.map(&:name)
  end

  # @return [Array] attribute values, in backing-node order
  def values
    @nlx.attribute_nodes.map(&:value)
  end

  # @return [Boolean] true when the backing node returns a non-nil value
  #   for the attribute
  def has_attribute?(name)
    !@nlx[name.to_s].nil?
  end
  alias key? has_attribute?
  alias attribute? has_attribute?

  # Backing nodes that do not implement `element?` are treated as elements.
  def element?
    @nlx.respond_to?(:element?) ? @nlx.element? : true
  end

  def document?
    false
  end

  # Iterate over attributes as Nokogiri does: yields `[name, value]`.
  # Returns an Enumerator when called without a block.
  def each_attribute
    return enum_for(:each_attribute) unless block_given?

    @nlx.attribute_nodes.each { |a| yield [a.name, a.value] }
  end

  # @return [String, nil] attribute value for `key` (stringified)
  def attr(key)
    @nlx[key.to_s]
  end

  # @return [String, nil] attribute value for `key` (stringified)
  def [](key)
    @nlx[key.to_s]
  end

  # ----- Typed value helpers -----

  # Resolves this node's `href` (or, failing that, `src`) against `base`,
  # defaulting to the document's base URL. Resolution is done by
  # URL.absolute.
  def absolute_url(base = nil)
    href = @nlx["href"] || @nlx["src"]
    URL.absolute(href, base || @doc.base_url)
  end

  # @return [Object] result of Money.parse on #text
  def money
    Money.parse(text)
  end

  # Numeric value of the node text. Everything except digits, "." and
  # "-" is stripped first; the remainder is read as a Float when it
  # contains a "." and as an Integer otherwise.
  #
  # @return [Integer, Float, nil] nil when the text contains no digit at
  #   all (previously strings such as "..." or "--" yielded 0 / 0.0)
  def number
    digits = text.to_s.gsub(/[^\d.\-]/, "")
    return nil unless digits.match?(/\d/)

    digits.include?(".") ? digits.to_f : digits.to_i
  end

  # @return [Date, nil] Date.parse of the text, or nil when it cannot be
  #   parsed
  def date
    Date.parse(text.to_s)
  rescue ArgumentError, TypeError
    nil
  end

  # ----- Selection -----

  # Nokogiri-compat: `node.css(selector, ns_or_handler)`. Extra args
  # are XPath-only and harmless to ignore for CSS.
  #
  # `::text` / `::attr(name)` queries hand back a flat Array, which is
  # passed through as-is. Everything else, including a zero-match
  # result, is wrapped in a NodeSet so the caller can keep chaining.
  # The pseudo-element case is detected from the selector string
  # because the result shape cannot distinguish zero-match queries.
  def css(selector, *_extra)
    result = @nlx.css(selector)
    return result if result.is_a?(Array) && selector_pseudo_element?(selector)

    NodeSet.new(@doc, result.to_a)
  end

  # First match for `selector`. String results (pseudo-element queries)
  # are returned unwrapped; nil means no match.
  def at(selector, *_extra)
    hit = @nlx.at_css(selector)
    return hit if hit.is_a?(String)

    hit && Node.new(@doc, hit)
  end

  alias at_css at
  alias search css

  # Native C versions of Node#at and Node#css are installed by
  # native_dom.rb after the Native extension module is loaded —
  # they aren't available at this point in the require chain.

  # Batch API: array of selector strings → array of results, in the
  # same order. Uses the backing node's `batch_css` when available and
  # falls back to one #css call per selector otherwise.
  #
  # Results are classified the same way #css does it — by selector, not
  # by result shape — so a zero-match element selector yields an empty
  # NodeSet on both paths instead of a bare `[]` on the native one.
  # Arrays whose first entry is a String are still passed through.
  def batch_css(selectors)
    return selectors.map { |s| css(s) } unless @nlx.respond_to?(:batch_css)

    names = selectors.to_a
    @nlx.batch_css(selectors).each_with_index.map do |r, i|
      next r unless r.is_a?(Array) # NodeSet or other

      if r.first.is_a?(String) || selector_pseudo_element?(names[i])
        r
      else
        NodeSet.new(@doc, r)
      end
    end
  end

  # Hash-form batch: {key => selector} → {key => result}.
  def extract_css(map)
    map.keys.zip(batch_css(map.values)).to_h
  end

  # Per-result extract: routes to the backing node's `extract` when it
  # has one; otherwise builds {key => at_css(selector)} in Ruby.
  def extract(map)
    return @nlx.extract(map) if @nlx.respond_to?(:extract)

    map.each_pair.each_with_object({}) { |(key, sel), out| out[key] = at_css(sel) }
  end

  # extract_each: run `fields` against every match of `outer_selector`
  # under this node. Delegates to the backing node when it implements
  # `extract_each`; otherwise maps #extract over #css results.
  def extract_each(outer_selector, fields)
    if @nlx.respond_to?(:extract_each)
      @nlx.extract_each(outer_selector, fields)
    else
      css(outer_selector).map { |n| n.extract(fields) }
    end
  end

  # ----- Tree navigation -----

  # Element children only (text / comment children are filtered out).
  def children
    NodeSet.new(@doc, element_backings)
  end

  # Same contents as #children.
  def element_children
    NodeSet.new(@doc, element_backings)
  end
  alias elements element_children

  # Parent node, or nil at the top of the tree (no parent, or the
  # parent is a Dom::Document when that class is loaded).
  def parent
    up = @nlx.parent
    return nil if up.nil? || (defined?(Dom::Document) && up.is_a?(Dom::Document))

    Node.new(@doc, up)
  end

  # Nokogiri-compatible: returns the literal next node (may be a text /
  # comment node). Use `next_element_sibling` (or `next_element`) to skip
  # non-element siblings.
  def next_sibling
    sib = @nlx.next_sibling
    sib && Node.new(@doc, sib)
  end

  # Literal previous node; see #next_sibling.
  def previous_sibling
    sib = @nlx.previous_sibling
    sib && Node.new(@doc, sib)
  end

  # ----- Extra Nokogiri-compat aliases -----
  alias prev previous_sibling
  alias previous previous_sibling
  alias next next_sibling

  # Nearest following sibling that is an element, or nil.
  def next_element_sibling
    sib = @nlx.next_sibling
    sib = sib.next_sibling until sib.nil? || element_node?(sib)
    sib && Node.new(@doc, sib)
  end
  alias next_element next_element_sibling

  # Nearest preceding sibling that is an element, or nil.
  def previous_element_sibling
    sib = @nlx.previous_sibling
    sib = sib.previous_sibling until sib.nil? || element_node?(sib)
    sib && Node.new(@doc, sib)
  end
  alias previous_element previous_element_sibling

  # First child that is an element, or nil.
  def first_element_child
    hit = @nlx.children.to_a.find { |c| element_node?(c) }
    hit && Node.new(@doc, hit)
  end

  # Nokogiri-compat: `Node#child` returns the first child regardless
  # of node type (text / element / comment). Used by parsers that
  # poke at the immediate inner content (e.g. heading nodes whose
  # text lives in a text-node child).
  def child
    first = @nlx.children.to_a.first
    first && Node.new(@doc, first)
  end

  # Last child that is an element, or nil.
  def last_element_child
    hit = @nlx.children.to_a.reverse_each.find { |c| element_node?(c) }
    hit && Node.new(@doc, hit)
  end

  # Yields this node, then recurses into element children (depth-first,
  # parent before children). Returns an Enumerator without a block.
  def traverse(&block)
    return enum_for(:traverse) unless block_given?

    yield self
    element_children.each { |c| c.traverse(&block) }
  end

  # Ancestors from the parent outward. With a selector, only ancestors
  # for which #matches? is true are kept.
  def ancestors(selector = nil)
    chain = []
    cur = parent
    while cur
      chain << cur
      cur = cur.parent
    end
    result = NodeSet.new(@doc, chain.map(&:backing_node))
    selector.nil? ? result : result.select { |n| n.matches?(selector) }
  end

  # True when this node is among the document-wide matches of `selector`.
  # Entries that are not node wrappers (a pseudo-element selector may
  # produce plain Strings) are skipped rather than raising NoMethodError.
  def matches?(selector)
    @doc.css(selector).to_a.any? do |n|
      n.respond_to?(:backing_node) && n.backing_node == @nlx
    end
  end

  # Evaluate an XPath expression against this node (relative
  # expressions are scoped to it). See Scrapetor::Document#xpath
  # for the supported subset.
  def xpath(expr)
    Scrapetor::XPath.evaluate(self, expr)
  end

  # First entry of an Array XPath result; any other result is returned
  # unchanged.
  def at_xpath(expr)
    result = xpath(expr)
    result.is_a?(Array) ? result.first : result
  end

  # ----- Identity / introspection -----

  # @return [Object] result of Fingerprint.structural for this node
  def fingerprint
    Fingerprint.structural(self)
  end

  # @return [Object] the wrapped engine node
  def backing_node
    @nlx
  end

  # @return [Object] the owning document wrapper
  def document
    @doc
  end

  # @return [Object] the document's root, as reported by the document
  def root
    @doc.root
  end

  # Node-type code from the backing node; 1 (element) when the backing
  # node does not expose `node_type`.
  def node_type
    @nlx.respond_to?(:node_type) ? @nlx.node_type : 1
  end
  alias type node_type

  def comment?
    node_type == 8
  end

  def text?
    node_type == 3
  end
  alias text_node? text?

  def cdata?
    node_type == 4
  end

  def processing_instruction?
    node_type == 7
  end

  def fragment?
    false
  end

  # @return [Boolean] true when the text is empty or whitespace only
  def blank?
    text.to_s.strip.empty?
  end

  # Backing node's `path`, or nil when it does not provide one.
  def path
    @nlx.path if @nlx.respond_to?(:path)
  end

  # XPath path to this node.
  def xpath_path
    path
  end

  # Build a minimal CSS path back to this node (id-based when
  # available, falling back to tag + :nth-of-type indexing). The walk
  # stops at the first ancestor carrying a non-empty id.
  def css_path
    parts = []
    cur = @nlx
    while cur && cur.respond_to?(:name) && cur.element?
      id = cur["id"]
      if id && !id.empty?
        parts.unshift("##{id}")
        break
      end

      # 1-based position among preceding siblings with the same tag name.
      index = 1
      sib = cur.previous_sibling
      while sib
        index += 1 if element_node?(sib) && sib.name == cur.name
        sib = sib.previous_sibling
      end
      parts.unshift("#{cur.name}:nth-of-type(#{index})")
      cur = cur.parent
    end
    parts.join(" > ")
  end

  # @return [Object] raw attribute nodes from the backing node
  def attribute_nodes
    @nlx.attribute_nodes
  end

  # @return [Object, nil] the raw attribute node named `name`
  def attribute(name)
    @nlx.attribute_nodes.find { |a| a.name == name.to_s }
  end

  # ----- Serialization -----

  # Serializes through the backing node's `to_html` (there is no
  # separate XML serializer here).
  def to_xml(*args)
    @nlx.to_html(*args)
  end
  # NOTE(review): Nokogiri aliases `to_str` to `content` (text), whereas
  # this returns HTML — confirm this is intended before relying on it.
  alias to_str to_html

  # Writes the serialized node to `io`; extra args go to #to_html.
  def write_to(io, *args)
    io.write(to_html(*args))
  end

  # @return [String] #to_html with the given arguments
  def serialize(*args)
    to_html(*args)
  end

  # ----- Mutation API (delegated to Nokolexbor) -----

  # Sets an attribute. Non-nil values are stringified; nil is handed to
  # the backing node unchanged. Returns the value that was passed in.
  def []=(key, value)
    @nlx[key.to_s] = value.nil? ? nil : value.to_s
    value
  end
  alias set_attribute []=

  def get_attribute(key)
    @nlx[key.to_s]
  end

  # @return [self]
  def remove_attribute(key)
    @nlx.remove_attribute(key.to_s)
    self
  end
  alias delete_attribute remove_attribute

  def content=(text)
    @nlx.content = text.to_s
    text
  end

  def inner_html=(html)
    @nlx.inner_html = html.to_s
    html
  end

  # The insertion methods below accept either a Node (its backing node
  # is used) or anything the backing engine accepts directly, and wrap
  # the engine's return value via #wrap_result.

  def add_child(node_or_html)
    wrap_result(@nlx.add_child(unwrap_mut(node_or_html)))
  end
  alias << add_child
  alias add_child! add_child

  def add_previous_sibling(node_or_html)
    wrap_result(@nlx.add_previous_sibling(unwrap_mut(node_or_html)))
  end
  alias before add_previous_sibling

  def add_next_sibling(node_or_html)
    wrap_result(@nlx.add_next_sibling(unwrap_mut(node_or_html)))
  end
  alias after add_next_sibling

  def replace(node_or_html)
    wrap_result(@nlx.replace(unwrap_mut(node_or_html)))
  end
  alias replace_with replace
  alias swap replace

  # Detaches the node from its tree.
  # @return [self]
  def remove
    @nlx.remove
    self
  end
  alias unlink remove
  alias delete remove

  # Wraps this node when the backing node supports `wrap`; a no-op
  # otherwise. A Node argument is unwrapped first, consistent with the
  # other mutation methods.
  # @return [self]
  def wrap(html_or_node)
    @nlx.wrap(unwrap_mut(html_or_node)) if @nlx.respond_to?(:wrap)
    self
  end

  # ----- Class manipulation -----

  # @return [self]
  def add_class(klass)
    @nlx.add_class(klass.to_s)
    self
  end
  alias append_class add_class

  # Removes one class, or the whole `class` attribute when called
  # without an argument.
  # @return [self]
  def remove_class(klass = nil)
    if klass.nil?
      @nlx.remove_attribute("class")
    else
      @nlx.remove_class(klass.to_s)
    end
    self
  end

  # @return [Array<String>] whitespace-separated class tokens
  def classes
    (@nlx["class"] || "").split(/\s+/).reject(&:empty?)
  end

  def has_class?(klass)
    classes.include?(klass.to_s)
  end

  # ----- Equality -----

  # Two Nodes are equal when their backing nodes are equal.
  def ==(other)
    other.is_a?(Node) && @nlx == other.backing_node
  end
  alias eql? ==

  # Hash of the backing node, so equal Nodes collide as Hash keys.
  def hash
    @nlx.hash
  end

  private

  # True when the selector ends in a pseudo-element (`::text`,
  # `::attr(...)`, `::first-letter`, `::first-line`, `::before`,
  # `::after`), i.e. the query yields a flat Array instead of elements.
  def selector_pseudo_element?(sel)
    s = sel.to_s
    s.include?("::") && s.match?(/::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i)
  end

  # True for backing nodes that report themselves as elements.
  def element_node?(raw)
    raw.respond_to?(:element?) && raw.element?
  end

  # Backing-node children restricted to elements.
  def element_backings
    @nlx.children.to_a.select { |c| element_node?(c) }
  end

  # Normalizes a mutation return value: Arrays become a NodeSet, raw
  # element nodes become a Node, everything else passes through. Any
  # error while classifying falls back to the raw result (best effort).
  def wrap_result(result)
    case result
    when nil then nil
    when Node then result
    when Array then NodeSet.new(@doc, result)
    else
      element_node?(result) ? Node.new(@doc, result) : result
    end
  rescue StandardError
    result
  end

  # Backing node for a Node argument; anything else is returned as-is.
  def unwrap_mut(node_or_html)
    node_or_html.is_a?(Node) ? node_or_html.backing_node : node_or_html
  end
end
539
+ end