scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,301 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Ordered, Enumerable collection of query results.
  #
  # A NodeSet stores *backing* nodes (the raw element objects handed to the
  # constructor) and wraps each one in a Scrapetor::Node when it is yielded
  # or returned. When the native extension supplies a
  # Scrapetor::Native::DocumentWrapper::LazyIds, only the ids are kept and
  # Native::Element objects are allocated on demand; operations that need
  # the real backing array call #materialize first.
  class NodeSet
    include Enumerable

    # Selector suffixes for which the backing nodes' #css results are
    # returned as a plain Array instead of being wrapped in a NodeSet.
    STRING_PSEUDO_ELEMENT =
      /::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i

    # @param doc the owning document, passed through to every Node wrapper
    # @param backing_nodes a LazyIds handle, or a collection of backing
    #   nodes; nil is treated as an empty collection
    def initialize(doc, backing_nodes)
      @doc = doc
      @lazy_ids = nil
      # `defined?` guard so this works when the native extension isn't
      # loaded (e.g. install-time build failure, or the gem is required
      # before its C extension is in place). Without the guard a plain
      # NodeSet construction raises NameError on the missing constant.
      if defined?(Scrapetor::Native::DocumentWrapper::LazyIds) &&
         backing_nodes.is_a?(Scrapetor::Native::DocumentWrapper::LazyIds)
        @lazy_ids = backing_nodes
        @nodes = nil
      else
        @nodes = backing_nodes.nil? ? [] : backing_nodes
      end
    end

    # Yields a Node wrapper per entry; returns an Enumerator without a block.
    def each
      return enum_for(:each) unless block_given?

      if @lazy_ids
        # Hoisted out of the loop: this is the hot path for large sets.
        wrap = @lazy_ids.wrapper
        native = @lazy_ids.native
        @lazy_ids.ids.each do |id|
          yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrap))
        end
      else
        @nodes.each { |n| yield Node.new(@doc, n) }
      end
    end

    # With no argument, the first Node or nil. With +n+, an Array of the
    # first +n+ Nodes (same contract as Enumerable#first / Array#first).
    def first(n = nil)
      return wrap_raw(raw_items.first) if n.nil?

      raw_items.first(n).map { |raw| wrap_raw(raw) }
    end

    # With no argument, the last Node or nil. With +n+, an Array of the
    # last +n+ Nodes.
    def last(n = nil)
      return wrap_raw(raw_items.last) if n.nil?

      raw_items.last(n).map { |raw| wrap_raw(raw) }
    end

    # Array-style indexing. `set[i]` returns a Node or nil; `set[i, len]`
    # and `set[range]` return a new NodeSet (empty when out of bounds).
    def [](index, length = nil)
      return self.class.new(@doc, backing_nodes[index, length] || []) if length
      return self.class.new(@doc, backing_nodes[index] || []) if index.is_a?(Range)

      wrap_raw(raw_items[index])
    end
    alias slice []

    # Number of entries; does not allocate wrappers.
    def size
      @lazy_ids ? @lazy_ids.ids.size : @nodes.size
    end
    alias length size

    # With no argument and no block this is #size. Otherwise it defers to
    # Enumerable#count, so `count { |n| ... }` and `count(node)` behave as
    # they do on any other Enumerable instead of ignoring the block or
    # raising ArgumentError.
    def count(*args, &block)
      return size if args.empty? && block.nil?

      super
    end

    def empty?
      @lazy_ids ? @lazy_ids.ids.empty? : @nodes.empty?
    end

    # Like Enumerable#map but skips the intermediate #each dispatch.
    def map
      return enum_for(:map) unless block_given?

      if @lazy_ids
        wrap = @lazy_ids.wrapper
        native = @lazy_ids.native
        @lazy_ids.ids.map { |id| yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrap)) }
      else
        @nodes.map { |n| yield Node.new(@doc, n) }
      end
    end

    # Concatenated #text of every backing node.
    def text
      backing_nodes.map(&:text).join
    end
    alias inner_text text
    alias content text

    # First match of +selector+ under the first node of the set, or nil.
    def at(selector)
      first&.at(selector)
    end
    alias at_css at

    # Runs +selector+ against every backing node and flattens the hits.
    # Returns a NodeSet, except for selectors ending in a `::text` /
    # `::attr(...)`-style pseudo-element, which return a plain Array.
    def css(selector)
      # Decide the result shape from the selector, not from the hits:
      # inferring from the result misclassifies zero-match queries and
      # breaks `.at_css` chained off an empty NodeSet.
      pe = selector.to_s
      string_result = pe.include?("::") && pe.match?(STRING_PSEUDO_ELEMENT)

      collected = []
      backing_nodes.each do |n|
        next unless n.respond_to?(:css)

        result = n.css(selector)
        result = result.to_a if result.respond_to?(:to_a)
        result.each { |hit| collected << hit }
      end
      return collected if string_result

      NodeSet.new(@doc, collected)
    end
    alias search css

    # Aggregate of children across all nodes in the set, flattened into a
    # single NodeSet. Children are read straight from the backing element
    # rather than through Node#children.
    def children
      collected = []
      backing_nodes.each do |bk|
        next unless bk.respond_to?(:children)

        kids = bk.children
        kids = kids.to_a if kids.respond_to?(:to_a)
        kids.each { |c| collected << c }
      end
      NodeSet.new(@doc, collected)
    end

    # Concatenated markup of every backing node (#to_s when a backing node
    # has no #to_html).
    def to_html
      backing_nodes.map { |n| n.respond_to?(:to_html) ? n.to_html : n.to_s }.join
    end
    alias inner_html to_html
    alias to_s to_html

    # Attribute +name+ of the first node, or nil for an empty set.
    def attr(name)
      first&.attr(name)
    end
    alias attribute attr

    def reverse
      self.class.new(@doc, backing_nodes.reverse)
    end

    # Concatenation. +other+ may be a NodeSet, a Node, or an Array of
    # Nodes / backing nodes.
    def +(other)
      self.class.new(@doc, backing_nodes + operand_backing(other))
    end

    # Array of Node wrappers.
    def to_a
      map { |n| n }
    end
    # Implicit conversion target — without this, `Array#+` /
    # `Array#concat` / splat (`*nodeset`) all raise
    # `TypeError: no implicit conversion of Scrapetor::NodeSet into Array`
    # because Ruby's coercion path looks for to_ary, not to_a.
    alias to_ary to_a

    # The raw (unwrapped) backing nodes. Materializes a lazy set.
    def backing_nodes
      return materialize if @lazy_ids

      @nodes
    end

    # Force the lazy-ids path to allocate its Element wrappers. Used by
    # operations that need the original backing nodes (set algebra,
    # +/-/&, removal). After this call the set is no longer lazy.
    def materialize
      return @nodes unless @lazy_ids

      @nodes = @lazy_ids.ids.map { |id| lazy_element(id) }
      @lazy_ids = nil
      @nodes
    end

    # ----- Bulk mutation passthroughs -----
    #
    # Bulk operations that map onto iterating the underlying nodes, so
    # callers can do `doc.css('br').remove` etc. without crashing.

    # Removes every node of the set from its tree. Returns self.
    def remove
      # Two-phase. First promote every backing node to its Dom
      # equivalent (so path-based lookup happens against the still-
      # intact tree); then remove. A naive "iterate + remove"
      # invalidates the position-index paths the Native::Element
      # fallback relies on after the first deletion.
      resolved = backing_nodes.map do |n|
        n.respond_to?(:promote_to_dom!) ? n.promote_to_dom! : n
      end
      resolved.each do |target|
        if target.respond_to?(:remove)
          target.remove
        else
          Node.new(@doc, target).remove
        end
      end
      self
    end
    alias unlink remove

    def each_with_index
      return enum_for(:each_with_index) unless block_given?

      backing_nodes.each_with_index { |n, i| yield Node.new(@doc, n), i }
    end

    # Like Enumerable#select, but returns a NodeSet rather than an Array.
    def select
      return enum_for(:select) unless block_given?

      kept = backing_nodes.select { |n| yield Node.new(@doc, n) }
      self.class.new(@doc, kept)
    end
    alias filter select

    # Like Enumerable#reject, but returns a NodeSet rather than an Array.
    def reject
      return enum_for(:reject) unless block_given?

      kept = backing_nodes.reject { |n| yield Node.new(@doc, n) }
      self.class.new(@doc, kept)
    end

    def find_all
      return enum_for(:find_all) unless block_given?

      select { |n| yield(n) }
    end

    # Appends a Node (or raw backing node) in place. Returns self.
    def push(node)
      materialize
      @nodes << unwrap(node)
      self
    end
    alias << push

    # Removes and returns the last Node, or nil when empty.
    def pop
      materialize
      n = @nodes.pop
      n && Node.new(@doc, n)
    end

    # Removes and returns the first Node, or nil when empty.
    def shift
      materialize
      n = @nodes.shift
      n && Node.new(@doc, n)
    end

    # Position of +node+ (Node or backing node) in the set, or nil.
    def index(node)
      backing_nodes.index(unwrap(node))
    end

    def include?(node)
      backing_nodes.include?(unwrap(node))
    end

    # Set difference. Accepts the same operand shapes as #+.
    def -(other)
      self.class.new(@doc, backing_nodes - operand_backing(other))
    end

    # Set intersection. Accepts the same operand shapes as #+.
    def &(other)
      self.class.new(@doc, backing_nodes & operand_backing(other))
    end

    # Map every node through the `extract(fields)` extraction. Lets
    # the standard SERP-result pattern collapse to:
    #
    #   doc.css(".result").extract(title: ".t", price: ".p")
    #   # => [{title: ..., price: ...}, ...]
    def extract(fields)
      map { |n| n.extract(fields) }
    end

    private

    # Underlying storage without materializing: ids when lazy, backing
    # nodes otherwise.
    def raw_items
      @lazy_ids ? @lazy_ids.ids : @nodes
    end

    # Builds the Native::Element for one lazy id.
    def lazy_element(id)
      Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper)
    end

    # Wraps one entry of #raw_items in a Node; nil stays nil.
    def wrap_raw(raw)
      return nil if raw.nil?

      Node.new(@doc, @lazy_ids ? lazy_element(raw) : raw)
    end

    # Node wrapper -> its backing node; anything else is returned as-is.
    def unwrap(node)
      node.is_a?(Node) ? node.backing_node : node
    end

    # Normalizes the right-hand operand of +, - and & to an Array of
    # backing nodes, so Node wrappers (e.g. from #to_a) compare against
    # the stored backing nodes instead of being mixed in as wrappers.
    def operand_backing(other)
      return other.backing_nodes if other.respond_to?(:backing_nodes)
      return [other.backing_node] if other.is_a?(Node)

      Array(other).map { |item| unwrap(item) }
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Heuristic page-type detection.
  #
  # Returns one of:
  #   :product_page, :product_listing, :article, :search_results,
  #   :forum_thread, :profile, :documentation, :unknown
  #
  # The heuristic prefers strong signals (JSON-LD @type, OpenGraph
  # og:type) and falls back to structural heuristics (repeated card
  # patterns, byline or long body, search-result items).
  module PageType
    PRODUCT_OG_TYPES = %w[product product.item og:product].freeze
    ARTICLE_OG_TYPES = %w[article news.article].freeze
    PROFILE_OG_TYPES = %w[profile person og:profile].freeze

    # Classifies +doc+. The document must respond to #json_ld, #opengraph
    # and #css. The first detector that returns a non-nil type wins.
    def self.detect(doc)
      from_structured_data(doc) ||
        from_opengraph(doc) ||
        from_structure(doc) ||
        :unknown
    end

    # ----- strong signals: JSON-LD -----

    # Classifies from the top-level JSON-LD `@type` values, or returns nil
    # when there are none or none is recognised.
    def self.from_structured_data(doc)
      types = (doc.json_ld || []).flat_map { |item| item.is_a?(Hash) ? Array(item["@type"]) : [] }
                                 .compact
                                 # `@type` may be a full or compact IRI
                                 # ("https://schema.org/Product",
                                 # "schema:Product"); keep the local name.
                                 .map { |type| type.to_s.sub(%r{\A.*[/#:]}, "") }
      return nil if types.empty?
      return :product_listing if types.include?("ItemList") &&
                                 (types.include?("Product") || types.include?("Offer"))
      return :product_page if types.include?("Product")
      return :article if (types & %w[NewsArticle Article BlogPosting]).any?
      return :search_results if types.include?("SearchResultsPage")
      return :profile if (types & %w[Person ProfilePage]).any?
      return :forum_thread if types.include?("DiscussionForumPosting")
      return :documentation if types.include?("TechArticle")

      nil
    end

    # ----- OpenGraph signals -----

    # Classifies from the OpenGraph "type" entry by case-insensitive
    # substring match against the *_OG_TYPES lists, or returns nil.
    def self.from_opengraph(doc)
      og = doc.opengraph || {}
      t = og["type"].to_s.downcase
      return nil if t.empty?
      return :product_page if PRODUCT_OG_TYPES.any? { |x| t.include?(x) }
      return :article if ARTICLE_OG_TYPES.any? { |x| t.include?(x) }
      return :profile if PROFILE_OG_TYPES.any? { |x| t.include?(x) }

      nil
    end

    # ----- structural fallback -----

    # Classifies from markup patterns, checked in a fixed priority order.
    # Returns nil when nothing matches.
    def self.from_structure(doc)
      # Search results: require actual result items. A search box on its
      # own is not treated as evidence, because it would otherwise win
      # before any of the checks below get a chance to run.
      return :search_results if doc.css('[class*="search-result"]').any?

      # Repeated cards = listing
      grid_candidates = %w[
        .product-card .product-tile .product-item .listing-item
        [class*="product-grid"] [class*="card"] [class*="tile"]
      ].flat_map { |sel| doc.css(sel).to_a }.uniq
      return :product_listing if grid_candidates.size >= 6

      # Article: an <article> whose first instance has a long body, or any
      # byline/author marker elsewhere in the document.
      articles = doc.css("article")
      if articles.any?
        text = articles.first.text.to_s
        word_count = text.scan(/\S+/).size
        has_byline = doc.css(".byline, .author, [rel='author'], [itemprop='author']").any?
        return :article if word_count >= 200 || has_byline
      end

      # Profile: avatar / profile header + bio / about section
      if doc.css('[class*="avatar"], [class*="profile-header"]').any? &&
         doc.css('[class*="bio"], [class*="about"]').any?
        return :profile
      end

      # Forum thread: at least two thread/topic/post containers
      return :forum_thread if doc.css('.thread, .topic, [class*="post-message"]').size >= 2

      # Documentation: code blocks + heading hierarchy
      return :documentation if doc.css("pre code").size >= 3 && doc.css("h1, h2, h3").size >= 3

      nil
    end
  end
end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Scrapetor
  # Pagination helper. Walks a page sequence by detecting the "next
  # page" URL from the document — in priority order:
  #
  #   1. <link rel="next" href="...">
  #   2. a[rel~="next"]
  #   3. The configured CSS selector via :next_link
  #
  # Stops when no next link is found, when max_pages is reached, or
  # when the next URL has already been visited (defensive against
  # malformed next links pointing at self or forming a cycle).
  #
  #   Scrapetor::Pagination.each_page("https://example.com/listings") do |doc, url|
  #     doc.css(".product").each { |p| ... }
  #   end
  #
  # Yields (doc, url) for each page in order. When :http responds to
  # #fetch it is used for fetches; otherwise Scrapetor::Fetcher is used
  # when it is defined and reports itself available, with
  # Scrapetor.fetch as the last resort.
  module Pagination
    DEFAULT_MAX_PAGES = 50
    DEFAULT_DELAY = 0.0

    # Fetches +start_url+ and follows next-page links.
    #
    # @param start_url [#to_s] first page to fetch
    # @param max_pages [Integer] upper bound on pages fetched
    # @param delay [Numeric, nil] seconds to sleep between pages
    # @param http [#fetch, nil] optional fetcher; called as http.fetch(url)
    # @param next_link [String, nil] CSS selector for a custom next link
    # @yield [doc, url] each fetched document with the URL it came from
    # @return [nil, Enumerator] an Enumerator when no block is given
    def self.each_page(start_url, max_pages: DEFAULT_MAX_PAGES,
                       delay: DEFAULT_DELAY, http: nil,
                       next_link: nil)
      unless block_given?
        return enum_for(:each_page, start_url,
                        max_pages: max_pages, delay: delay,
                        http: http, next_link: next_link)
      end

      url = start_url.to_s
      visited = {}
      page_no = 0
      while url && page_no < max_pages
        break if visited[url]

        visited[url] = true
        page_no += 1

        doc = fetch_page(url, http)
        yield doc, url

        nxt = next_page_url(doc, url, next_link)
        sleep delay if nxt && delay && delay > 0
        url = nxt
      end
      nil
    end

    # Inspect a document and return the next page URL, or nil if
    # this is the last page. Honours <link rel=next> > a[rel=next] >
    # a custom selector via :next_link. A candidate that is missing,
    # unparseable, or resolves back to +current_url+ (ignoring the
    # fragment) is skipped and the next source is tried.
    def self.next_page_url(doc, current_url, custom_selector = nil)
      here = current_url.to_s

      # 1. <link rel="next">
      found = resolve_next(doc.at_css('link[rel~="next"]'), here)
      return found if found

      # 2. a[rel~="next"]
      doc.css('a[rel~="next"]').each do |anchor|
        found = resolve_next(anchor, here)
        return found if found
      end

      # 3. Custom selector — the match itself when it is an <a>,
      #    otherwise the first <a> inside it.
      if custom_selector && (node = doc.at_css(custom_selector))
        is_anchor = node.respond_to?(:name) && node.name.to_s.casecmp?("a")
        found = resolve_next(is_anchor ? node : node.at_css("a"), here)
        return found if found
      end
      nil
    end

    # Fetches one page with the first available strategy (see module docs).
    def self.fetch_page(url, http)
      if http && http.respond_to?(:fetch)
        http.fetch(url)
      elsif defined?(Scrapetor::Fetcher) && Scrapetor::Fetcher.available?
        Scrapetor::Fetcher.fetch(url)
      else
        Scrapetor.fetch(url)
      end
    end

    # Resolves +href+ against +base+ and returns the absolute URL as a
    # String with any fragment removed, or nil when +href+ is blank or
    # the pair cannot be joined.
    def self.absolutize(href, base)
      # Attribute values may carry surrounding whitespace.
      cleaned = href.to_s.strip
      return nil if cleaned.empty?

      joined = URI.join(base.to_s, cleaned)
      # The fragment is never sent to the server; keeping it would make
      # "#top"-style links look like a new page.
      joined.fragment = nil
      joined.to_s
    rescue URI::Error
      # Covers InvalidURIError (unparseable) and BadURIError (relative base).
      nil
    end

    # Next-page URL carried by +node+'s href, or nil when +node+ is nil,
    # has no usable href, or points back at +current_url+.
    def self.resolve_next(node, current_url)
      return nil unless node

      href = node["href"] || node[:href]
      target = absolutize(href, current_url)
      return nil if target.nil? || target == current_url.sub(/#.*\z/m, "")

      target
    end
    private_class_method :resolve_next
  end

  # Top-level shorthand.
  def self.each_page(start_url, **opts, &block)
    Pagination.each_page(start_url, **opts, &block)
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "fileutils"
5
+
6
module Scrapetor
  # Disk-backed parse cache. Persists a parsed document to disk so later
  # process invocations can restore it instead of re-parsing. The binary
  # format is owned by the native extension:
  # `Scrapetor::Native::Document#serialize_to_file` writes it and
  # `Scrapetor::Native::Document.load_from_file` reads it back.
  #
  # Designed for:
  #   - CI / test suites looping the same fixture HTML across boots
  #   - Batch jobs that restart (cron, sidekiq workers)
  #   - A/B parser comparisons over a corpus
  #
  # Storage layout: SCRAP_CACHE_DIR/<first-2-hex-chars>/<sha256>.arena
  # Files are content-addressed so identical HTML inputs share one
  # cache entry regardless of caller.
  #
  # Opt-in via SCRAP_PERSISTENT_CACHE=1 or Scrapetor::PersistentCache.enable!
  # Override the cache root via SCRAP_CACHE_DIR (default
  # ~/.cache/scrapetor/parse).
  module PersistentCache
    DEFAULT_DIR = File.expand_path("~/.cache/scrapetor/parse")

    class << self
      # Explicit cache root; nil until set or until #directory / #enable!
      # resolves it from the environment.
      attr_accessor :dir

      # True when switched on via #enable!, false after #disable!;
      # otherwise decided by SCRAP_PERSISTENT_CACHE=1.
      def enabled?
        e = defined?(@enabled) ? @enabled : nil
        return e unless e.nil?

        ENV["SCRAP_PERSISTENT_CACHE"] == "1"
      end

      # Turns the cache on and creates the cache root. Returns true.
      def enable!
        @enabled = true
        @dir ||= ENV.fetch("SCRAP_CACHE_DIR", DEFAULT_DIR)
        FileUtils.mkdir_p(@dir)
        true
      end

      def disable!
        @enabled = false
      end

      # Effective cache root (explicit #dir, else SCRAP_CACHE_DIR, else
      # DEFAULT_DIR). Memoized on first call.
      def directory
        @dir ||= ENV.fetch("SCRAP_CACHE_DIR", DEFAULT_DIR)
      end

      # Load a cached parsed arena for the given HTML, or nil on miss.
      # Returns whatever Scrapetor::Native::Document.load_from_file
      # returns. Any error while loading is treated as a corrupt entry:
      # the file is deleted and nil is returned.
      def load(html)
        path = nil
        return nil unless enabled?
        return nil if html.nil? || html.empty?
        # Without the native extension there is nothing that can read the
        # file. Treat it as a plain miss — falling into the rescue below
        # would delete a perfectly valid cache entry.
        return nil unless defined?(Scrapetor::Native::Document)

        path = path_for(key_for(html))
        return nil unless File.exist?(path)

        Scrapetor::Native::Document.load_from_file(path)
      rescue StandardError
        FileUtils.rm_f(path) if path
        nil
      end

      # Persist a parsed arena to disk under its content fingerprint.
      # +native_doc+ must respond to #serialize_to_file(path). Returns
      # the cache key on success (or when the entry already exists), nil
      # when disabled, given blank input, or serialization reports
      # failure. Exceptions raised by the serializer propagate.
      def store(html, native_doc)
        return nil unless enabled?
        return nil if html.nil? || html.empty?
        return nil if native_doc.nil?

        key = key_for(html)
        path = path_for(key)
        return key if File.exist?(path)

        FileUtils.mkdir_p(File.dirname(path))
        # Write to a private temp name, then rename into place so readers
        # never see a half-written entry. The thread id keeps concurrent
        # writers within one process from sharing a temp file.
        tmp = "#{path}.tmp.#{Process.pid}.#{Thread.current.object_id}"
        begin
          return nil unless native_doc.serialize_to_file(tmp)

          File.rename(tmp, path)
          key
        ensure
          # No-op after a successful rename; removes the partial file when
          # serialization failed or raised.
          FileUtils.rm_f(tmp)
        end
      end

      # SHA-256 hex digest of the HTML, used as the cache key.
      def key_for(html)
        Digest::SHA256.hexdigest(html)
      end

      # Pre-warm the cache from fixture files. Accepts one path/glob or
      # an Array of them; directories matched by a glob are skipped.
      # Returns the number of files processed (0 when disabled).
      def warm(paths_or_globs)
        return 0 unless enabled?

        n = 0
        Array(paths_or_globs).each do |entry|
          Dir.glob(entry).each do |path|
            next unless File.file?(path)

            html = File.read(path)
            doc = Scrapetor.parse(html)
            store(html, doc.backing.native)
            n += 1
          end
        end
        n
      end

      # Total size in bytes of all cache entries.
      def disk_usage
        return 0 unless File.directory?(directory)

        Dir.glob(File.join(directory, "*", "*.arena")).sum { |p| File.size(p) }
      end

      # Deletes every cache entry; returns how many were removed.
      def clear!
        return 0 unless File.directory?(directory)

        Dir.glob(File.join(directory, "*", "*.arena")).each(&File.method(:delete)).size
      end

      private

      # Entries are sharded by the first two hex characters of the key.
      def path_for(key)
        File.join(directory, key[0, 2], "#{key}.arena")
      end
    end

    enable! if enabled?
  end
end