scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ module Dom
5
+ # Build a Dom::Document from raw HTML via the SAX tokenizer.
6
+ module Parser
7
# Lookup table of void tag names (built from Scrapetor::Dom::VOID);
# elements whose name is in this table are never pushed on the
# open-element stack.
VOID_TAGS = Scrapetor::Dom::VOID.each_with_object({}) { |tag, lookup| lookup[tag] = true }.freeze

# Builds a Dom::Document by replaying the tokenizer's events.
#
# html - the raw markup handed to Scrapetor::SAX::Tokenizer.
#
# Returns the populated Dom::Document.
def self.parse(html)
  doc = Dom::Document.new
  # Innermost open node is last; the document stays at the bottom.
  open_nodes = [doc]

  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, payload, attrs = event

    case kind
    when :doctype
      doc.doctype = payload
    when :start
      element = Element.new(payload, attrs || {})
      open_nodes.last.add_child(element)
      open_nodes.push(element) unless VOID_TAGS[element.name]
    when :end
      # Close everything down to (and including) the nearest open element
      # with this name; an end tag with no open match is ignored.
      depth = open_nodes.rindex { |n| n.is_a?(Element) && n.name == payload }
      open_nodes.pop(open_nodes.size - depth) if depth
    when :text
      open_nodes.last.add_child(Text.new(payload))
    when :comment
      open_nodes.last.add_child(Comment.new(payload))
    end
    # :doc_start / :doc_end (and any other event type) need no action.
  end

  doc
end
40
+
41
# Parses a markup fragment without producing a Document.
#
# Nodes are collected under a throwaway "__fragment__" element, then
# detached from it (their parent is reset to nil) before being returned.
#
# html - the raw markup handed to Scrapetor::SAX::Tokenizer.
#
# Returns the wrapper's list of top-level nodes.
def self.fragment(html)
  wrapper = Element.new("__fragment__")
  open_nodes = [wrapper]

  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, payload, attrs = event

    case kind
    when :start
      element = Element.new(payload, attrs || {})
      open_nodes.last.add_child(element)
      open_nodes.push(element) unless VOID_TAGS[element.name]
    when :end
      depth = open_nodes.rindex { |n| n.is_a?(Element) && n.name == payload }
      # Index 0 is the synthetic wrapper; it must never be popped.
      open_nodes.pop(open_nodes.size - depth) if depth&.positive?
    when :text
      open_nodes.last.add_child(Text.new(payload))
    when :comment
      open_nodes.last.add_child(Comment.new(payload))
    end
  end

  wrapper.children.tap do |nodes|
    nodes.each { |node| node.parent = nil }
  end
end
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,208 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ module Dom
5
+ # CSS selector engine over the pure-Ruby DOM.
6
+ #
7
+ # Pipeline:
8
+ # 1. Compile the selector string into a list of "atoms" with
9
+ # combinators (reuses `Scrapetor::Selector.compile`).
10
+ # 2. Find candidates matching the rightmost atom: use the document's
11
+ # lazy id / class / tag indexes when the atom has such an anchor,
12
+ # otherwise walk the scope's subtree once (see
13
+ # `candidates_for_atom`).
14
+ # 3. For each candidate, walk ancestors right-to-left to verify
15
+ # the rest of the chain.
16
+ #
17
+ # Atom matching delegates to `Scrapetor::Selector.atom_matches?`, so
18
+ # pseudo-class support (`:has`, `:not`, `:is`, `:nth-child`, etc.)
19
+ # lives in one place.
20
+ module Selectors
21
# Runs a (possibly comma-separated) CSS selector against `scope`.
#
# Each comma group is compiled and executed on its own; the per-group
# results are concatenated in group order, and a node that is produced
# more than once is kept only at its first position.
#
# scope        - the node the query is run against.
# selector_str - the selector text.
#
# Returns a new Array of matching nodes.
def self.css(scope, selector_str)
  emitted = {}
  selector_groups(selector_str).each_with_object([]) do |group, matched|
    plan = compile(group)
    next if plan.empty?

    execute(scope, plan).each do |node|
      key = node.object_id
      next if emitted[key]

      emitted[key] = true
      matched << node
    end
  end
end
36
+
37
# Cached comma-splitter. Results are memoized by selector string, so a
# fallback loop that re-runs the same selector pays the per-character
# scan only once. The cache is bounded: once it holds GROUPS_CACHE_CAP
# entries the oldest-inserted entry is evicted before a new one is added.
GROUPS_CACHE = {}
GROUPS_CACHE_CAP = 1024

# Splits a selector list on its top-level commas.
#
# A comma only separates groups when it is outside `[...]`, outside
# `(...)`, outside a quoted string and not backslash-escaped. Brackets and
# parentheses that appear inside a quoted string (or after a backslash) do
# not affect nesting depth, so `a[title="["], p` splits into two groups.
# An unterminated quote swallows the rest of the input into one group.
#
# s - the selector text (String).
#
# Returns a frozen Array of frozen, whitespace-stripped, non-empty group
# strings. The same Array object is returned for repeated calls with an
# equal selector string while it remains cached.
def self.selector_groups(s)
  cached = GROUPS_CACHE[s]
  return cached if cached

  bracket_depth = 0
  paren_depth = 0
  quote = nil      # the quote character currently open, or nil
  escaped = false  # true when the previous character was a backslash
  groups = []
  buf = +""

  s.each_char do |ch|
    if escaped
      # Whatever follows a backslash is literal.
      escaped = false
      buf << ch
    elsif ch == "\\"
      escaped = true
      buf << ch
    elsif quote
      quote = nil if ch == quote
      buf << ch
    elsif ch == '"' || ch == "'"
      quote = ch
      buf << ch
    elsif ch == "," && bracket_depth.zero? && paren_depth.zero?
      groups << buf.strip
      buf = +""
    else
      case ch
      when "[" then bracket_depth += 1
      when "]" then bracket_depth -= 1 if bracket_depth.positive?
      when "(" then paren_depth += 1
      when ")" then paren_depth -= 1 if paren_depth.positive?
      end
      buf << ch
    end
  end
  groups << buf.strip

  out = groups.reject(&:empty?).each(&:freeze).freeze
  GROUPS_CACHE.shift while GROUPS_CACHE.size >= GROUPS_CACHE_CAP
  GROUPS_CACHE[s] = out
end
72
+
73
# Memo of compiled plans keyed by selector string, so a dom-mode document
# that re-runs the same selector many times in a fallback loop pays the
# parse cost once. Bounded to DOM_COMPILE_CACHE_CAP entries; the
# oldest-inserted entries are dropped to make room.
DOM_COMPILE_CACHE = {}
DOM_COMPILE_CACHE_CAP = 1024

# Returns the compiled plan for `selector`, delegating to
# Scrapetor::Selector.compile on a cache miss and remembering the result.
def self.compile(selector)
  DOM_COMPILE_CACHE[selector] || begin
    plan = Scrapetor::Selector.compile(selector)
    DOM_COMPILE_CACHE.shift until DOM_COMPILE_CACHE.size < DOM_COMPILE_CACHE_CAP
    DOM_COMPILE_CACHE[selector] = plan
  end
end
88
+
89
# Evaluates a compiled plan against `scope`.
#
# Candidates are gathered for the rightmost atom; when the plan has more
# than one atom, each candidate is kept only if the remaining atoms match
# when checked backwards from it.
#
# Returns the matching nodes ([] for an empty plan).
def self.execute(scope, plan)
  return [] if plan.empty?

  tail = plan.size - 1
  matches = candidates_for_atom(scope, plan[tail])
  return matches if tail.zero?

  matches.select do |candidate|
    match_chain_backwards?(candidate, plan, tail - 1, scope)
  end
end
96
+
97
# Collects the nodes under `scope` that match a single atom.
#
# When the tree is rooted in a Document and the atom has a narrowing
# anchor (id, then class, then tag — first one present wins), the
# document's structural index supplies the seed list instead of a full
# subtree walk. Otherwise every descendant element of `scope` is tested.
#
# The scope node itself is never returned: `in_scope?` treats the scope as
# being inside itself, so the indexed paths exclude it explicitly to agree
# with the subtree walk, which only ever visits descendants.
#
# scope - the node the query is run against.
# atom  - a compiled atom; this method reads `id`, `classes` and `tag`.
#
# Returns an Array of matching nodes.
def self.candidates_for_atom(scope, atom)
  doc = atom_document(scope)

  if doc.is_a?(Document)
    eligible = lambda do |node|
      !node.equal?(scope) && in_scope?(node, scope) &&
        Scrapetor::Selector.atom_matches?(atom, node)
    end

    if atom.id
      # NOTE(review): id_index yields a single node per id here; if the
      # markup repeats an id, only the indexed node is considered — confirm
      # that is acceptable.
      node = doc.id_index[atom.id]
      return (node && eligible.call(node)) ? [node] : []
    end

    if atom.classes && !atom.classes.empty?
      # Seed from the smallest class bucket; `eligible` re-checks the full
      # atom, so the other classes are still enforced.
      seed = atom.classes.map { |c| doc.class_index[c] || [] }.min_by(&:size)
      return seed.select(&eligible)
    end

    if atom.tag
      # NOTE(review): assumes a universal selector compiles to a nil tag
      # rather than "*"; otherwise this lookup would find nothing — confirm.
      return (doc.tag_index[atom.tag.to_s] || []).select(&eligible)
    end
  end

  result = []
  walk_descendants(scope) do |node|
    result << node if Scrapetor::Selector.atom_matches?(atom, node)
  end
  result
end
128
+
129
# Finds the top of the tree that contains `scope`.
#
# A Document is returned as-is; otherwise `parent` links are followed
# until a node has no parent (or does not respond to `parent`). The result
# is only a Document when the tree is attached to one — callers check.
def self.atom_document(scope)
  return scope if scope.is_a?(Document)

  root = scope
  while root && root.respond_to?(:parent)
    above = root.parent
    break unless above

    root = above
  end
  root
end
135
+
136
# Calls the block once for every element below `scope`, in document
# (pre-order) sequence. `scope` itself is not yielded, and non-element
# children (those whose `element?` is false) are skipped together with
# anything beneath them.
#
# The traversal keeps its own stack instead of recursing, so its depth is
# not limited by the Ruby call stack: very deeply nested markup cannot
# raise SystemStackError here. A node's children are read right after the
# block has been called for that node.
#
# scope - a Document or Element; any other object has nothing to walk.
#
# Returns the children of `scope` (an empty Array when `scope` is neither
# a Document nor an Element).
def self.walk_descendants(scope, &block)
  container = ->(node) { node.is_a?(Document) || node.is_a?(Element) }
  top_level = container.call(scope) ? scope.children : []

  # Children are pushed in reverse so the first child is popped first.
  pending = []
  top_level.reverse_each { |child| pending.push(child) }

  until pending.empty?
    node = pending.pop
    next unless node.element?

    block.call(node)
    next unless container.call(node)

    node.children.reverse_each { |child| pending.push(child) }
  end

  top_level
end
150
+
151
# Thin local alias: whether `node` satisfies `atom` is decided entirely by
# Scrapetor::Selector.atom_matches?.
def self.atom_matches?(atom, node)
  matcher = Scrapetor::Selector
  matcher.atom_matches?(atom, node)
end
154
+
155
# Checks the part of the plan to the left of an already-matched node.
#
# node  - the node that matched plan[idx + 1].
# plan  - the compiled atom list.
# idx   - index of the atom to verify next; a negative value means the
#         whole chain has been satisfied.
# scope - the query scope; only nodes for which `in_scope?` holds may
#         satisfy an atom.
#
# The combinator stored on plan[idx + 1] decides where plan[idx] is
# looked for: the parent (:child), any ancestor (:descendant or nil), the
# previous element sibling (:adj), or any earlier element sibling (:gen).
# An unrecognised combinator never matches.
#
# Returns true or false.
def self.match_chain_backwards?(node, plan, idx, scope)
  return true if idx.negative?

  atom = plan[idx]
  combinator = plan[idx + 1].combinator
  remaining = idx - 1

  # True when `el` can stand for `atom` and the rest of the chain also
  # matches starting from `el`.
  satisfies = lambda do |el|
    next false unless in_scope?(el, scope)
    next false unless atom_matches?(atom, el)

    match_chain_backwards?(el, plan, remaining, scope)
  end

  case combinator
  when :child
    parent = node.parent
    parent.is_a?(Element) ? satisfies.call(parent) : false
  when :descendant, nil
    ancestor = node.parent
    while ancestor.is_a?(Element)
      return true if satisfies.call(ancestor)

      ancestor = ancestor.parent
    end
    false
  when :adj
    sibling = node.previous_element_sibling
    sibling.is_a?(Element) ? satisfies.call(sibling) : false
  when :gen
    sibling = node.previous_element_sibling
    while sibling.is_a?(Element)
      return true if satisfies.call(sibling)

      sibling = sibling.previous_element_sibling
    end
    false
  else
    false
  end
end
196
+
197
# Tells whether `node` lies inside `scope`.
#
# Every node counts as inside a Document scope. For any other scope the
# answer is true when `scope` is `node` itself or is reached by following
# `parent` links upward from `node` (compared by identity).
def self.in_scope?(node, scope)
  return true if scope.is_a?(Document)

  ancestor = node
  ancestor = ancestor.parent until !ancestor || ancestor.equal?(scope)
  ancestor ? true : false
end
206
+ end
207
+ end
208
+ end