scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # HTML named-entity decoder.
  #
  # The C native engine handles the minimal set (`&amp; &lt; &gt;
  # &quot; &apos; &nbsp; &#N; &#xH;`) inline during extraction. This
  # module is the broader Ruby table — useful when post-processing raw
  # strings, or when running the Ruby fallback path against HTML with
  # uncommon entities.
  #
  # The table here covers the ~140 most frequent named entities from
  # the HTML5 spec — enough to handle real-world text.
  module Entities
    # Entity name (without `&` / `;`) => replacement text.
    # Invisible characters (no-break space, soft hyphen, en/em/thin space,
    # zero-width joiners) are spelled as \u escapes so that editors and
    # renderers cannot silently normalise them.
    TABLE = {
      "amp" => "&", "lt" => "<", "gt" => ">",
      "quot" => '"', "apos" => "'", "nbsp" => "\u00A0",
      "copy" => "©", "reg" => "®", "trade" => "™",
      "mdash" => "—", "ndash" => "–", "hellip" => "…",
      "ldquo" => "“", "rdquo" => "”", "lsquo" => "‘",
      "rsquo" => "’", "laquo" => "«", "raquo" => "»",
      "lsaquo" => "‹", "rsaquo" => "›", "sbquo" => "‚",
      "bdquo" => "„", "times" => "×", "divide" => "÷",
      "plusmn" => "±", "deg" => "°", "sect" => "§",
      "para" => "¶", "middot" => "·", "bull" => "•",
      "dagger" => "†", "Dagger" => "‡", "permil" => "‰",
      "prime" => "′", "Prime" => "″", "ne" => "≠",
      "le" => "≤", "ge" => "≥", "asymp" => "≈",
      "equiv" => "≡", "infin" => "∞", "sum" => "∑",
      "prod" => "∏", "int" => "∫", "radic" => "√",
      "part" => "∂", "nabla" => "∇", "minus" => "−",
      "plus" => "+", "lowast" => "∗", "frasl" => "⁄",
      "larr" => "←", "rarr" => "→", "uarr" => "↑",
      "darr" => "↓", "harr" => "↔", "crarr" => "↵",
      "lArr" => "⇐", "rArr" => "⇒", "uArr" => "⇑",
      "dArr" => "⇓", "hArr" => "⇔", "spades" => "♠",
      "clubs" => "♣", "hearts" => "♥", "diams" => "♦",
      "loz" => "◊", "Aacute" => "Á", "aacute" => "á",
      "Acirc" => "Â", "acirc" => "â", "Agrave" => "À",
      "agrave" => "à", "Aring" => "Å", "aring" => "å",
      "Atilde" => "Ã", "atilde" => "ã", "Auml" => "Ä",
      "auml" => "ä", "AElig" => "Æ", "aelig" => "æ",
      "Ccedil" => "Ç", "ccedil" => "ç", "Eacute" => "É",
      "eacute" => "é", "Ecirc" => "Ê", "ecirc" => "ê",
      "Egrave" => "È", "egrave" => "è", "Euml" => "Ë",
      "euml" => "ë", "Iacute" => "Í", "iacute" => "í",
      "Icirc" => "Î", "icirc" => "î", "Igrave" => "Ì",
      "igrave" => "ì", "Iuml" => "Ï", "iuml" => "ï",
      "Ntilde" => "Ñ", "ntilde" => "ñ", "Oacute" => "Ó",
      "oacute" => "ó", "Ocirc" => "Ô", "ocirc" => "ô",
      "Ograve" => "Ò", "ograve" => "ò", "Oslash" => "Ø",
      "oslash" => "ø", "Otilde" => "Õ", "otilde" => "õ",
      "Ouml" => "Ö", "ouml" => "ö", "Uacute" => "Ú",
      "uacute" => "ú", "Ucirc" => "Û", "ucirc" => "û",
      "Ugrave" => "Ù", "ugrave" => "ù", "Uuml" => "Ü",
      "uuml" => "ü", "Yacute" => "Ý", "yacute" => "ý",
      "yuml" => "ÿ", "szlig" => "ß",
      "iexcl" => "¡", "iquest" => "¿", "cent" => "¢",
      "pound" => "£", "yen" => "¥", "euro" => "€",
      "curren" => "¤", "shy" => "\u00AD",
      "frac12" => "½", "frac14" => "¼", "frac34" => "¾",
      "alpha" => "α", "beta" => "β", "gamma" => "γ",
      "delta" => "δ", "epsilon" => "ε", "pi" => "π",
      "sigma" => "σ", "omega" => "ω",
      "ensp" => "\u2002", "emsp" => "\u2003", "thinsp" => "\u2009",
      "zwnj" => "\u200C", "zwj" => "\u200D"
    }.freeze

    # Capture 1: hex digits of `&#x…;`, capture 2: decimal digits of `&#…;`,
    # capture 3: entity name of `&name;`. The trailing `;` is mandatory.
    ENTITY_RE = /&(?:#(?:x([0-9A-Fa-f]+)|(\d+))|([a-zA-Z][a-zA-Z0-9]+));/.freeze

    # Substituted for numeric references that do not name a usable character.
    REPLACEMENT_CHARACTER = "\uFFFD"

    # Decode a string containing HTML entities into plain UTF-8 text.
    #
    # Named references missing from TABLE are left untouched. Numeric
    # references to NUL, to a surrogate (U+D800..U+DFFF) or to anything above
    # U+10FFFF become U+FFFD instead of producing invalid UTF-8 or raising
    # RangeError from Array#pack.
    #
    # @param s [String, nil, #to_s] text to decode
    # @return [String, nil] decoded text; `s` itself when it is nil or empty
    def self.decode(s)
      return s if s.nil?

      text = s.to_s
      return s if text.empty?

      text.gsub(ENTITY_RE) do
        m = Regexp.last_match
        if m[1]
          codepoint_to_utf8(m[1].to_i(16))
        elsif m[2]
          codepoint_to_utf8(m[2].to_i)
        else
          TABLE.fetch(m[3]) { m[0] }
        end
      end
    end

    # Encodes one code point as UTF-8, mapping unusable values to U+FFFD.
    def self.codepoint_to_utf8(codepoint)
      unusable = codepoint.zero? || codepoint > 0x10FFFF ||
                 (codepoint >= 0xD800 && codepoint <= 0xDFFF)
      unusable ? REPLACEMENT_CHARACTER : [codepoint].pack("U")
    end
    private_class_method :codepoint_to_utf8
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Root of the Scrapetor exception hierarchy.
  class Error < StandardError
  end

  # Signals that a field the schema marks as required could not be found.
  class ExtractionError < Error
  end

  # Signals a schema descriptor that is not valid for the native engine.
  class SchemaError < Error
  end
end
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Scrapetor
  # Schema execution. The hot path operates on raw Nokolexbor nodes and
  # inlines coercion — no Scrapetor::Node allocations per emitted field.
  module Extractor
    # Evaluates every top-level field and group of +schema+ against +scope+.
    #
    # @param doc [Object] source document; when it responds to #base_url that
    #   value is handed to URL coercion, otherwise nil is used
    # @param scope [#css, #at_css] node the selectors are evaluated against
    # @param schema [#fields, #groups] schema descriptor
    # @return [Hash] field/group name => extracted value
    # @raise [ExtractionError] when a required field yields nothing
    def self.run(doc, scope, schema)
      result = {}
      base_url = doc.respond_to?(:base_url) ? doc.base_url : nil
      schema.fields.each do |f|
        result[f.name] = extract_field(scope, f, base_url)
      end
      schema.groups.each do |g|
        result[g.name] = run_group(scope, g, base_url)
      end
      result
    end

    # Extracts one Hash per node matched by +group.selector+, recursing into
    # nested groups.
    #
    # @return [Array<Hash>] one entry per matched node, in match order
    def self.run_group(scope, group, base_url)
      out = []
      scope.css(group.selector).each do |sub|
        inner = {}
        group.fields.each { |f| inner[f.name] = extract_field(sub, f, base_url) }
        group.groups.each { |gg| inner[gg.name] = run_group(sub, gg, base_url) }
        out << inner
      end
      out
    end

    # Extracts a single field: select, coerce, apply default, enforce
    # `required`, then run the optional transform.
    #
    # @param f [Object] field descriptor (reads #selector, #multi, #default,
    #   #required, #transform, #name)
    # @return [Object, nil] the coerced (and transformed) value
    # @raise [ExtractionError] when the field is required and still missing
    #   after the default has been applied
    def self.extract_field(scope, f, base_url)
      selectors = f.selector.is_a?(Array) ? f.selector : [f.selector]
      value =
        if f.multi
          extract_multi(scope, selectors, f, base_url)
        else
          extract_single(scope, selectors, f, base_url)
        end

      # default + required
      missing = value.nil? || (f.multi && value.empty?)
      value = f.default if missing && !f.default.nil?

      if f.required && (value.nil? || (f.multi && value.respond_to?(:empty?) && value.empty?))
        raise ExtractionError, "required field `#{f.name}` not found"
      end

      # transform last (after coerce + default)
      value = f.transform.call(value) if f.transform && !value.nil?
      value
    end

    # Tries each selector in order and returns the coerced value of the first
    # one that matches a node with a non-nil raw value. A nil selector means
    # "use the scope node itself".
    def self.extract_single(scope, selectors, f, base_url)
      selectors.each do |sel|
        n = sel ? scope.at_css(sel) : scope
        next if n.nil?

        raw = extract_raw(n, f)
        next if raw.nil?

        return coerce(raw, f, base_url, n)
      end
      nil
    end

    # Collects coerced values for every node matched by the first selector
    # that produces at least one non-nil value; later selectors are fallbacks.
    #
    # @return [Array] possibly empty
    def self.extract_multi(scope, selectors, f, base_url)
      out = []
      selectors.each do |sel|
        nodes = sel ? scope.css(sel) : [scope]
        nodes.each do |n|
          raw = extract_raw(n, f)
          next if raw.nil?

          v = coerce(raw, f, base_url, n)
          out << v unless v.nil?
        end
        break unless out.empty?
      end
      out
    end

    # Reads the raw value from a node: inner HTML for :html fields without an
    # attribute, the named attribute when one is configured, else the text.
    def self.extract_raw(node, f)
      return node.inner_html if f.type == :html && f.attr_str.nil?

      if f.attr_str
        node[f.attr_str]
      else
        node.text
      end
    end

    # Converts a raw value according to +f.type+, after optional cleaning.
    # Unknown types pass the value through unchanged.
    #
    # @return [Object, nil] nil when +raw+ is nil or the coercion fails
    def self.coerce(raw, f, base_url, _node)
      return nil if raw.nil?

      v = raw
      v = Cleaner.clean(v) if f.clean
      case f.type
      when :text then v
      when :integer then int_coerce(v)
      when :float then float_coerce(v)
      when :money then Money.parse(v)
      when :url then f.normalize_url ? URL.absolute(v, base_url) : v
      when :date then date_coerce(v)
      when :json then json_coerce(v)
      when :boolean then bool_coerce(v)
      when :list then list_coerce(v, f.delimiter)
      when :html then v # already inner_html
      else v
      end
    end

    # Strips everything but digits and hyphens, then parses an Integer.
    #
    # @return [Integer, nil] nil when no digit survives the stripping, so
    #   digit-less text such as "--" no longer coerces to 0
    def self.int_coerce(v)
      s = v.to_s.gsub(/[^\d\-]/, "")
      s.match?(/\d/) ? s.to_i : nil
    end

    # Strips everything but digits, dots and hyphens, then parses a Float.
    #
    # @return [Float, nil] nil when no digit survives the stripping, so
    #   digit-less text such as ".." or "-." no longer coerces to 0.0
    def self.float_coerce(v)
      s = v.to_s.gsub(/[^\d.\-]/, "")
      s.match?(/\d/) ? s.to_f : nil
    end

    # @return [Date, nil] nil when the text cannot be parsed as a date
    def self.date_coerce(v)
      require "date" # loaded lazily; only :date fields need it
      ::Date.parse(v.to_s)
    rescue ::ArgumentError, ::TypeError
      nil
    end

    # @return [Object, nil] parsed JSON, or nil on a parse error
    def self.json_coerce(v)
      JSON.parse(v.to_s)
    rescue JSON::ParserError
      nil
    end

    TRUTHY_STRINGS = %w[true yes 1 on enabled].freeze
    FALSY_STRINGS = %w[false no 0 off disabled].freeze
    private_constant :TRUTHY_STRINGS, :FALSY_STRINGS

    # Case-insensitive, whitespace-tolerant boolean parsing.
    #
    # @return [true, false, nil] nil when the text is neither a known truthy
    #   nor a known falsy word
    def self.bool_coerce(v)
      s = v.to_s.strip.downcase
      return true if TRUTHY_STRINGS.include?(s)
      return false if FALSY_STRINGS.include?(s)

      nil
    end

    # Splits on +delimiter+, trims each piece and drops empty pieces.
    #
    # @return [Array<String>]
    def self.list_coerce(v, delimiter)
      v.to_s.split(delimiter).map(&:strip).reject(&:empty?)
    end
  end
end
@@ -0,0 +1,390 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Native HTTP/2-capable fetch layer. Wraps the libcurl-backed
5
+ # Scrapetor::Native::Http module. Distinct from Scrapetor::HTTP /
6
+ # Scrapetor.fetch, which is the Net::HTTP-based fallback used by
7
+ # tests, the CLI, and environments without libcurl.
8
+ #
9
+ # Capabilities depend on the libcurl your gem links against. Inspect
10
+ # Scrapetor::Fetcher.features to see what's actually wired up
11
+ # (HTTP/2 / brotli / zstd / libz). HTTP/2 + gzip is the typical
12
+ # baseline on macOS; brotli/zstd need a libcurl rebuilt with them.
13
+ #
14
+ # The connection cache lives on a per-OS-thread libcurl easy handle —
15
+ # repeated fetches to the same host inside a single thread reuse the
16
+ # TLS session and HTTP/2 stream. The fetch itself drops the GVL, so
17
+ # background Ruby threads keep running during the round-trip.
18
+ #
19
+ # resp = Scrapetor::Fetcher.get("https://api.example.com/items")
20
+ # resp[:status] # => 200
21
+ # resp[:http_version] # => "2"
22
+ # resp[:headers] # => {"content-type" => "application/json", ...}
23
+ # resp[:body] # => "..."
24
+ #
25
+ # doc = Scrapetor::Fetcher.fetch("https://example.com/")
26
+ # # => Scrapetor::Document parsed from the response body, base_url
27
+ # # set to the final URL after redirects.
28
+ module Fetcher
29
# Raised by ensure_available! when the native libcurl layer is not usable.
class NotAvailableError < StandardError
end
30
# Error that carries the HTTP status and the raw response hash it was
# constructed with (both default to nil).
class FetchError < StandardError
  attr_reader :status, :response

  # @param msg [String] exception message
  # @param status [Integer, nil] status to expose via #status
  # @param response [Hash, nil] response to expose via #response
  def initialize(msg, status: nil, response: nil)
    @status, @response = status, response
    super(msg)
  end
end
38
+
39
# User-Agent used whenever the caller does not pass :user_agent.
# Interpolated string literals are not frozen by the `frozen_string_literal`
# magic comment (Ruby >= 3.0), so freeze explicitly: the same object is
# handed to every request's option hash.
DEFAULT_USER_AGENT = "scrapetor/#{Scrapetor::VERSION} (libcurl)".freeze
40
+
41
+ # Status codes worth retrying. 408 (timeout), 425 (early), 429
42
+ # (rate-limit), 500 and 502–504 (transient upstream). 501 and 5xx >= 505 are
43
+ # protocol-level rejections; they don't usually heal on retry.
44
+ DEFAULT_RETRY_STATUSES = [408, 425, 429, 500, 502, 503, 504].freeze # used when the caller passes no :retry_on; 501 is not in the list
45
+
46
+ # Compute the backoff delay for attempt N (1-indexed). Exponential
47
+ # with full jitter — the AWS-style 'random between 0 and 2^n * base'
48
+ # variant — capped at max_backoff.
49
# @param attempt [Integer] 1-indexed attempt number
# @param base [Numeric] base delay in seconds
# @param max [Numeric] upper bound for the returned delay
# @param retry_after [Numeric, nil] server-requested wait; when positive it
#   wins over the jittered value (still capped at +max+)
# @return [Float] seconds to sleep
def self.backoff_for(attempt, base: 0.3, max: 10.0, retry_after: nil)
  if retry_after
    server_wait = retry_after.to_f
    return [server_wait, max].min if server_wait > 0
  end

  ceiling = [base * (2.0**(attempt - 1)), max].min
  rand * ceiling
end
54
+
55
# @param resp [Hash] response; only resp[:status] is read
# @param retry_statuses [Enumerable<Integer>] statuses considered transient
# @return [Boolean] whether the response status is one of +retry_statuses+
def self.retryable_response?(resp, retry_statuses)
  retry_statuses.each do |code|
    return true if code == resp[:status]
  end
  false
end
58
+
59
# Reads a Retry-After header as whole seconds.
#
# @param headers [Hash, nil] looked up under "retry-after" and "Retry-After"
# @return [Integer, nil] seconds, or nil when the header is absent or is not
#   a plain run of digits
def self.parse_retry_after(headers)
  return nil unless headers

  raw = headers["retry-after"] || headers["Retry-After"]
  return nil unless raw

  # Either integer seconds or HTTP-date. We only honour the
  # integer form (the date form is rare and parsing it adds a
  # dependency on Time.httpdate that the caller may not need).
  return nil unless raw.to_s.strip.match?(/\A\d+\z/)

  raw.to_i
end
67
+
68
# @return [Object, nil] the value of Scrapetor::Native::Http::AVAILABLE, or
#   nil when that constant is not defined
def self.available?
  return nil unless defined?(Scrapetor::Native::Http::AVAILABLE)

  Scrapetor::Native::Http::AVAILABLE
end
72
+
73
# @return [Object] whatever Scrapetor::Native::Http.features reports
# @raise [NotAvailableError] via ensure_available! when the native layer is missing
def self.features
  ensure_available!
  native_http = Scrapetor::Native::Http
  native_http.features
end
77
+
78
+ # Single GET with optional retry + exponential backoff.
79
+ #
80
+ # retry: 0 - try once and return whatever happens (default).
81
+ # retry: N - retry up to N times on transient failure.
82
+ # backoff: - base backoff in seconds (default 0.3).
83
+ # max_backoff: - cap on a single sleep (default 10.0).
84
+ # retry_on: - statuses to retry (default [408, 425, 429, 500, 502, 503, 504]).
85
+ #
86
+ # Network failures (IOError from libcurl: connect refused, DNS,
87
+ # TLS, timeout) are also retried. The wait between attempts honours
88
+ # Retry-After response headers (numeric form) when present and
89
+ # otherwise uses exponential backoff with full jitter.
90
# @param url [#to_s] target URL
# @param opts [Hash] :retry, :backoff, :max_backoff and :retry_on are consumed
#   here; everything else (plus a default :user_agent) goes to the native call
# @return [Hash] the response of the last attempt
# @raise [IOError] when the native call keeps failing after the last retry
def self.get(url, **opts)
  ensure_available!
  retries = opts.delete(:retry) || 0
  base = opts.delete(:backoff) || 0.3
  max_backoff = opts.delete(:max_backoff) || 10.0
  retry_on = opts.delete(:retry_on) || DEFAULT_RETRY_STATUSES
  opts[:user_agent] ||= DEFAULT_USER_AGENT

  attempt = 0
  loop do
    begin
      resp = Scrapetor::Native::Http.get(url.to_s, opts)
      out_of_retries = attempt >= retries
      return resp if out_of_retries || !retryable_response?(resp, retry_on)

      server_wait = parse_retry_after(resp[:headers])
      sleep backoff_for(attempt + 1, base: base, max: max_backoff,
                                     retry_after: server_wait)
    rescue IOError
      # Retries exhausted: let the failure of this attempt propagate as-is.
      raise if attempt >= retries

      sleep backoff_for(attempt + 1, base: base, max: max_backoff)
    end
    attempt += 1
  end
end
117
+
118
+ # Fetch + parse. Raises FetchError when the status is < 200 or >= 400 (3xx passes) by default;
119
+ # pass raise_for_status: false to inspect non-2xx responses.
120
# @param url [#to_s] target URL
# @param raise_for_status [Boolean] when true, a status below 200 or at/above
#   400 raises instead of being parsed
# @param opts [Hash] forwarded to .get
# @return [Object] result of Scrapetor.parse on the body, with base_url set
#   to the response's :final_url
# @raise [FetchError] see +raise_for_status+
def self.fetch(url, raise_for_status: true, **opts)
  resp = get(url, **opts)
  if raise_for_status
    status = resp[:status]
    if status < 200 || status >= 400
      message = "Scrapetor::Fetcher.fetch #{url} -> HTTP #{status}"
      raise FetchError.new(message, status: status, response: resp)
    end
  end
  Scrapetor.parse(resp[:body], base_url: resp[:final_url])
end
130
+
131
# Guard used by the public entry points.
#
# @return [nil] when available? is truthy
# @raise [NotAvailableError] otherwise
def self.ensure_available!
  unless available?
    raise NotAvailableError,
          "Scrapetor::Fetcher requires libcurl at build time. " \
          "Reinstall after `brew install curl` / `apt-get install libcurl4-openssl-dev`."
  end
end
137
+
138
+ # N concurrent GETs across pthread workers, each with a persistent
139
+ # libcurl handle (per-OS-thread connection cache). The full batch
140
+ # runs under one GVL release — other Ruby threads stay live
141
+ # throughout.
142
+ #
143
+ # results = Scrapetor::Fetcher.parallel_get(urls, threads: 8,
144
+ # timeout_ms: 5_000)
145
+ # # results is Array<Hash>; successful entries carry
146
+ # # :status, :headers, :body, :final_url, :http_version
147
+ # # failed entries carry { error: { url:, error: } } only.
148
+ # N concurrent GETs with optional retry. The native batch returns
149
+ # all results in one pass; failed entries (transient status or
150
+ # network error) get a second batch dispatch under retry, with
151
+ # the previous-attempt's wait honoured globally — i.e. one sleep
152
+ # between attempts rather than per-URL — so the pool keeps moving.
153
+ #
154
+ # Per-URL Retry-After headers are read on retryable responses and
155
+ # the maximum of them governs the next inter-attempt sleep, so a
156
+ # rate-limited host pulls the whole pool to its backoff rather
157
+ # than thrashing the rest in parallel.
158
# @param urls [Array<#to_s>, #to_s] one or many URLs
# @param opts [Hash] :retry, :backoff, :max_backoff and :retry_on are consumed
#   here; the rest (plus a default :user_agent) goes to the native batch call
# @return [Array<Hash>] one result per input URL, in input order
def self.parallel_get(urls, **opts)
  ensure_available!
  urls = Array(urls).map(&:to_s)
  return [] if urls.empty?

  retries = opts.delete(:retry) || 0
  base = opts.delete(:backoff) || 0.3
  max_backoff = opts.delete(:max_backoff) || 10.0
  retry_on = opts.delete(:retry_on) || DEFAULT_RETRY_STATUSES
  opts[:user_agent] ||= DEFAULT_USER_AGENT

  results = Array.new(urls.size)
  pending = (0...urls.size).to_a # indices into urls still waiting for a final result
  attempt = 0
  loop do
    batch_res = Scrapetor::Native::Http.parallel_fetch(pending.map { |i| urls[i] }, opts)
    may_retry = attempt < retries
    longest_wait = nil
    still_pending = []
    pending.each_with_index do |orig_i, pos|
      r = batch_res[pos]
      unless may_retry && retry_eligible?(r, retry_on)
        results[orig_i] = r
        next
      end

      # Keep the largest Retry-After seen in this round.
      wait = r[:headers] ? parse_retry_after(r[:headers]) : nil
      longest_wait = wait if wait && (longest_wait.nil? || wait > longest_wait)
      still_pending << orig_i
    end
    return results if still_pending.empty?

    attempt += 1
    sleep backoff_for(attempt, base: base, max: max_backoff,
                               retry_after: longest_wait)
    pending = still_pending
  end
end
195
+
196
# @param r [Hash] one batch result; reads r[:error] and r[:status]
# @param retry_on [Enumerable<Integer>] statuses considered transient
# @return [Boolean, nil] true for an error entry or a listed status; the
#   falsy status itself when the entry has no status
def self.retry_eligible?(r, retry_on)
  return true if r[:error] # network-level

  status = r[:status]
  return status unless status

  retry_on.any? { |code| code == status }
end
200
+
201
+ # Single-thread curl_multi bulk fetch — one driver thread, one
202
+ # multi handle, N concurrent transfers multiplexed via
203
+ # curl_multi_perform. Complements parallel_get:
204
+ #
205
+ # parallel_get - N pthread workers, each running blocking easy.
206
+ # Best when each transfer has CPU work after the
207
+ # fetch (decode + parse) since the GVL is released
208
+ # across the full batch and CPU scales with cores.
209
+ #
210
+ # multi_get - one driver thread, N concurrent in-flight.
211
+ # Best for I/O-dominated high-fan-out (hundreds of
212
+ # URLs across many hosts) where pthread setup
213
+ # overhead outweighs the in-flight count.
214
+ #
215
+ # Both share the same global CURLSH so connections / DNS / TLS
216
+ # sessions pool across them.
217
# @param urls [Array<#to_s>, #to_s] one or many URLs
# @param opts [Hash] forwarded to the native multi_fetch, with a default :user_agent
# @return [Array] native results, or [] for an empty URL list
def self.multi_get(urls, **opts)
  ensure_available!
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  opts[:user_agent] = DEFAULT_USER_AGENT unless opts[:user_agent]
  Scrapetor::Native::Http.multi_fetch(targets, opts)
end
224
+
225
+ # Bulk-revalidate cached entries. Issues HEAD with
226
+ # If-None-Match / If-Modified-Since for every URL whose cache
227
+ # entry exists; the server's 304 / 200 verdict classifies each:
228
+ #
229
+ # :fresh - server said 304; cache still valid.
230
+ # :changed - server returned a new 2xx; cache rewritten.
231
+ # :missing - server returned a status >= 400 (e.g. gone / not found).
232
+ # :error - transport failure.
233
+ #
234
+ # Returns a Hash[url => Symbol]. Optimal for crawls of N pages
235
+ # over moderate intervals: HEAD round-trips are cheap and run
236
+ # all-concurrent under curl_multi.
237
# Classifies each URL after a HEAD round-trip through the native multi_fetch.
#
# Precedence per result: :error (result has :error), :fresh (status 304 or
# an "x-scrapetor-cache" => "hit" header), :missing (any status >= 400),
# otherwise :changed. A bare 304 without the cache header is reported as
# :fresh rather than falling through to :changed.
#
# @param urls [Array<#to_s>, #to_s] URLs to check
# @param cache_dir [Object] passed to the native call as :cache_dir
# @param opts [Hash] further native options; :method is forced to :head
# @return [Hash{String => Symbol}] url => classification ({} for no URLs);
#   duplicate URLs collapse into one key
def self.revalidate(urls, cache_dir:, **opts)
  ensure_available!
  urls = Array(urls).map(&:to_s)
  return {} if urls.empty?

  opts[:user_agent] ||= DEFAULT_USER_AGENT
  results = Scrapetor::Native::Http.multi_fetch(
    urls, opts.merge(method: :head, cache_dir: cache_dir)
  )
  out = {}
  results.each_with_index do |r, i|
    cache_hit = r[:headers] && r[:headers]["x-scrapetor-cache"] == "hit"
    out[urls[i]] =
      if r[:error]
        :error
      elsif cache_hit || r[:status] == 304
        :fresh
      elsif r[:status] && r[:status] >= 400
        :missing
      else
        :changed
      end
  end
  out
end
260
+
261
+ # multi_get + per-response parse, all under one no-GVL window.
262
+ # Returns Array<Scrapetor::Document | nil>, in input order. Failed
263
+ # entries are nil. Best for high-fan-out crawls where you want
264
+ # parsed Documents back and the I/O outweighs the per-page
265
+ # CPU cost.
266
# @param urls [Array<#to_s>, #to_s] one or many URLs
# @param opts [Hash] forwarded to the native multi_fetch with parse: true added
# @return [Array] per URL: nil for an error entry, a Scrapetor::Document
#   wrapping the native :document when present, otherwise the result of
#   Scrapetor.parse on the body
def self.multi_fetch(urls, **opts)
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  ensure_available!
  opts[:user_agent] ||= DEFAULT_USER_AGENT
  responses = Scrapetor::Native::Http.multi_fetch(targets, opts.merge(parse: true))
  responses.map do |r|
    if r[:error]
      nil
    elsif (dom = r[:document])
      Scrapetor::Document.new("", base_url: r[:final_url], native: dom)
    else
      Scrapetor.parse(r[:body], base_url: r[:final_url])
    end
  end
end
279
+
280
+ # Streaming variant of multi_get: yields each response as it
281
+ # completes (in completion order, not input order), so the caller
282
+ # can start processing while other transfers are still in flight.
283
+ # Pass parse: true to also parse the body in the worker thread.
284
+ #
285
+ # Scrapetor::Fetcher.multi_each(urls, threads: 8) do |r|
286
+ # puts r[:final_url], r[:status]
287
+ # # later transfers may still be on the wire here
288
+ # end
289
# @param urls [Array<#to_s>, #to_s] one or many URLs
# @param opts [Hash] forwarded to MultiBatch.new, with a default :user_agent
# @yield [r] each value returned by MultiBatch#next until it returns nil/false
# @return [Enumerator, nil] an Enumerator when no block is given, else nil
def self.multi_each(urls, **opts)
  return enum_for(:multi_each, urls, **opts) unless block_given?

  ensure_available!
  targets = Array(urls).map(&:to_s)
  return if targets.empty?

  opts[:user_agent] ||= DEFAULT_USER_AGENT
  batch = Scrapetor::Native::Http::MultiBatch.new(targets, opts)
  r = batch.next
  while r
    yield r
    r = batch.next
  end
end
300
+
301
+ # Method shorthands. Each is just a `.get` invocation with the
302
+ # corresponding method, plus the body sugar that POST/PUT/PATCH
303
+ # almost always need.
304
# @param url [#to_s] target URL
# @param body [String, nil] raw body
# @param form [Object, nil] form data handed to build_body
# @param json [Object, nil] JSON payload handed to build_body
# @param multipart [Object, nil] when given, it is sent as :multipart and
#   body/form/json are not used
# @param opts [Hash] forwarded to .get
# @return [Hash] the response from .get
def self.post(url, body: nil, form: nil, json: nil, multipart: nil, **opts)
  unless multipart
    payload, request_opts = build_body(body, form, json, opts)
    return get(url, **request_opts, method: :post, body: payload)
  end

  get(url, **opts, multipart: multipart, method: :post)
end
313
+
314
+ # Convenience constructors for multipart values.
315
# @param path [#to_s] file path, stored as a String under :path
# @param filename [String, nil] included only when truthy
# @param content_type [String, nil] included only when truthy
# @return [Hash] multipart value descriptor
def self.upload_file(path, filename: nil, content_type: nil)
  part = { path: path.to_s, filename: filename, content_type: content_type }
  part.select { |key, value| key == :path || value }
end
321
+
322
# @param bytes [String] in-memory payload, stored under :data
# @param filename [String] stored under :filename
# @param content_type [String] stored under :content_type
# @return [Hash] multipart value descriptor
def self.upload_bytes(bytes, filename:, content_type: "application/octet-stream")
  part = {}
  part[:data] = bytes
  part[:filename] = filename
  part[:content_type] = content_type
  part
end
325
+
326
# PUT shorthand: builds the body via build_body, then delegates to .get
# with method: :put.
#
# @return [Hash] the response from .get
def self.put(url, body: nil, form: nil, json: nil, **opts)
  payload, request_opts = build_body(body, form, json, opts)
  get(url, **request_opts, method: :put, body: payload)
end
330
+
331
# PATCH shorthand: builds the body via build_body, then delegates to .get
# with method: :patch.
#
# @return [Hash] the response from .get
def self.patch(url, body: nil, form: nil, json: nil, **opts)
  payload, request_opts = build_body(body, form, json, opts)
  get(url, **request_opts, method: :patch, body: payload)
end
335
+
336
# DELETE shorthand: delegates to .get with method: :delete (overriding any
# :method the caller passed).
def self.delete(url, **opts)
  get(url, **opts, method: :delete)
end
339
+
340
# HEAD shorthand: delegates to .get with method: :head (overriding any
# :method the caller passed).
def self.head(url, **opts)
  get(url, **opts, method: :head)
end
343
+
344
+ # Build the request body from one of :body / :form / :json.
345
+ # Returns [body_string, opts_with_content_type_header_set].
346
# Serialises the payload and picks a matching Content-Type default.
#
# `json` wins over `form`; when neither is given `body` is passed through.
# The default Content-Type is only added when the caller has not already
# supplied that header under any letter case, so a caller-provided
# "content-type" no longer results in two Content-Type headers.
#
# @param body [String, nil] raw body, used when json and form are both nil
# @param form [Object, nil] data for URI.encode_www_form
# @param json [Object, nil] data for JSON.generate
# @param opts [Hash] request options; opts[:headers] is replaced by an
#   updated copy (the caller's headers hash itself is never mutated)
# @return [Array(String, Hash)] the body and the (same) opts hash
def self.build_body(body, form, json, opts)
  headers = (opts[:headers] || {}).dup
  default_type =
    if json
      require "json"
      body = JSON.generate(json)
      "application/json"
    elsif form
      require "uri"
      body = URI.encode_www_form(form)
      "application/x-www-form-urlencoded"
    end

  if default_type
    already_set = headers.any? { |name, value| value && name.to_s.casecmp?("content-type") }
    headers["Content-Type"] = default_type unless already_set
  end

  opts[:headers] = headers unless headers.empty?
  [body, opts]
end
360
+
361
+ # Convenience: parallel_get + parse each successful response into
362
+ # a Scrapetor::Document. Failed entries return nil.
363
+ #
364
+ # The parse runs INSIDE the same no-GVL pthread worker that did
365
+ # the fetch (parse: true on the C side), so the whole batch —
366
+ # network + decode + transcode + dom_parse + index build — runs
367
+ # multi-core under a single GVL release. The main thread only
368
+ # wraps already-parsed documents.
369
# @param urls [Array<#to_s>, #to_s] one or many URLs
# @param opts [Hash] forwarded to the native parallel_fetch with parse: true added
# @return [Array] per URL: nil for an error entry, a Scrapetor::Document
#   wrapping the native :document when present, otherwise the result of
#   Scrapetor.parse on the body
def self.parallel_fetch(urls, **opts)
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  ensure_available!
  opts[:user_agent] ||= DEFAULT_USER_AGENT
  responses = Scrapetor::Native::Http.parallel_fetch(targets, opts.merge(parse: true))
  responses.map do |r|
    if r[:error]
      nil
    elsif (dom = r[:document])
      Scrapetor::Document.new("", base_url: r[:final_url], native: dom)
    else
      Scrapetor.parse(r[:body], base_url: r[:final_url])
    end
  end
end
382
+ end
383
+
384
+ # Top-level shorthand for the libcurl path. Distinct from
385
+ # Scrapetor.fetch (Net::HTTP) so callers can opt-in to HTTP/2 +
386
+ # connection reuse where it's actually available.
387
+ def self.fetch_http2(url, **opts) # takes the same arguments as Fetcher.fetch
388
+ Fetcher.fetch(url, **opts) # pure delegation: returns and raises exactly what Fetcher.fetch does
389
+ end
390
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Structural fingerprint of a DOM subtree.
  # Phase 1: tag-bigram rolling hash over the top `depth` levels.
  # Phase 2+: tag bigrams + attribute-presence hash + child-shape hash.
  module Fingerprint
    MASK = 0xFFFFFFFFFFFFFFFF

    # 64-bit FNV-1a parameters used to hash tag names.
    FNV_OFFSET_BASIS = 0xCBF29CE484222325
    FNV_PRIME = 0x100000001B3
    private_constant :FNV_OFFSET_BASIS, :FNV_PRIME

    # Folds the tag names of the element descendants of +node+ (document
    # order, at most +depth+ levels below it) into one 64-bit integer.
    #
    # Tag names are hashed with FNV-1a rather than String#hash: String#hash
    # is seeded per Ruby process, which made the fingerprint of the same
    # markup differ from one run to the next.
    #
    # @param node [Object] a node, or a wrapper responding to #backing_node
    # @param depth [Integer] number of levels below +node+ to include
    # @return [Integer] value in 0..MASK; 0 when no element is visited
    def self.structural(node, depth: 4)
      backing = node.respond_to?(:backing_node) ? node.backing_node : node
      h = 0
      walk(backing, depth) do |tag|
        h = (h * 1_315_423_911 + stable_hash(tag)) & MASK
      end
      h
    end

    # Depth-first, pre-order traversal yielding the name of every element
    # child; non-element children and nodes without #children are skipped.
    #
    # @param nlx [Object] node to descend from (its own name is not yielded)
    # @param depth [Integer] remaining levels; nothing is yielded at <= 0
    # @yield [String] element name
    def self.walk(nlx, depth, &block)
      return if depth <= 0
      return unless nlx.respond_to?(:children)

      nlx.children.each do |c|
        next unless c.respond_to?(:element?) && c.element?

        block.call(c.name)
        walk(c, depth - 1, &block)
      end
    end

    # Process-independent 64-bit FNV-1a hash of a tag name's bytes.
    def self.stable_hash(tag)
      tag.to_s.each_byte.reduce(FNV_OFFSET_BASIS) do |acc, byte|
        ((acc ^ byte) * FNV_PRIME) & MASK
      end
    end
    private_class_method :stable_hash
  end
end
+ end