scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# HTML named-entity decoder.
|
|
5
|
+
#
|
|
6
|
+
# The C native engine handles the minimal set (`&amp; &lt; &gt;
|
|
7
|
+
# &quot; &apos; &#N; &#xH;`) inline during extraction. This
|
|
8
|
+
# module is the broader Ruby table — useful when post-processing raw
|
|
9
|
+
# strings, or when running the Ruby fallback path against HTML with
|
|
10
|
+
# uncommon entities.
|
|
11
|
+
#
|
|
12
|
+
# The table here covers the ~140 most frequent named entities from
|
|
13
|
+
# the HTML5 spec — enough to handle real-world text.
|
|
14
|
+
module Entities
|
|
15
|
+
TABLE = {
|
|
16
|
+
"amp" => "&", "lt" => "<", "gt" => ">",
|
|
17
|
+
"quot" => '"', "apos" => "'", "nbsp" => " ",
|
|
18
|
+
"copy" => "©", "reg" => "®", "trade" => "™",
|
|
19
|
+
"mdash" => "—", "ndash" => "–", "hellip" => "…",
|
|
20
|
+
"ldquo" => "“", "rdquo" => "”", "lsquo" => "‘",
|
|
21
|
+
"rsquo" => "’", "laquo" => "«", "raquo" => "»",
|
|
22
|
+
"lsaquo" => "‹", "rsaquo" => "›", "sbquo" => "‚",
|
|
23
|
+
"bdquo" => "„", "times" => "×", "divide" => "÷",
|
|
24
|
+
"plusmn" => "±", "deg" => "°", "sect" => "§",
|
|
25
|
+
"para" => "¶", "middot" => "·", "bull" => "•",
|
|
26
|
+
"dagger" => "†", "Dagger" => "‡", "permil" => "‰",
|
|
27
|
+
"prime" => "′", "Prime" => "″", "ne" => "≠",
|
|
28
|
+
"le" => "≤", "ge" => "≥", "asymp" => "≈",
|
|
29
|
+
"equiv" => "≡", "infin" => "∞", "sum" => "∑",
|
|
30
|
+
"prod" => "∏", "int" => "∫", "radic" => "√",
|
|
31
|
+
"part" => "∂", "nabla" => "∇", "minus" => "−",
|
|
32
|
+
"plus" => "+", "lowast" => "∗", "frasl" => "⁄",
|
|
33
|
+
"larr" => "←", "rarr" => "→", "uarr" => "↑",
|
|
34
|
+
"darr" => "↓", "harr" => "↔", "crarr" => "↵",
|
|
35
|
+
"lArr" => "⇐", "rArr" => "⇒", "uArr" => "⇑",
|
|
36
|
+
"dArr" => "⇓", "hArr" => "⇔", "spades" => "♠",
|
|
37
|
+
"clubs" => "♣", "hearts" => "♥", "diams" => "♦",
|
|
38
|
+
"loz" => "◊", "Aacute" => "Á", "aacute" => "á",
|
|
39
|
+
"Acirc" => "Â", "acirc" => "â", "Agrave" => "À",
|
|
40
|
+
"agrave" => "à", "Aring" => "Å", "aring" => "å",
|
|
41
|
+
"Atilde" => "Ã", "atilde" => "ã", "Auml" => "Ä",
|
|
42
|
+
"auml" => "ä", "AElig" => "Æ", "aelig" => "æ",
|
|
43
|
+
"Ccedil" => "Ç", "ccedil" => "ç", "Eacute" => "É",
|
|
44
|
+
"eacute" => "é", "Ecirc" => "Ê", "ecirc" => "ê",
|
|
45
|
+
"Egrave" => "È", "egrave" => "è", "Euml" => "Ë",
|
|
46
|
+
"euml" => "ë", "Iacute" => "Í", "iacute" => "í",
|
|
47
|
+
"Icirc" => "Î", "icirc" => "î", "Igrave" => "Ì",
|
|
48
|
+
"igrave" => "ì", "Iuml" => "Ï", "iuml" => "ï",
|
|
49
|
+
"Ntilde" => "Ñ", "ntilde" => "ñ", "Oacute" => "Ó",
|
|
50
|
+
"oacute" => "ó", "Ocirc" => "Ô", "ocirc" => "ô",
|
|
51
|
+
"Ograve" => "Ò", "ograve" => "ò", "Oslash" => "Ø",
|
|
52
|
+
"oslash" => "ø", "Otilde" => "Õ", "otilde" => "õ",
|
|
53
|
+
"Ouml" => "Ö", "ouml" => "ö", "Uacute" => "Ú",
|
|
54
|
+
"uacute" => "ú", "Ucirc" => "Û", "ucirc" => "û",
|
|
55
|
+
"Ugrave" => "Ù", "ugrave" => "ù", "Uuml" => "Ü",
|
|
56
|
+
"uuml" => "ü", "Yacute" => "Ý", "yacute" => "ý",
|
|
57
|
+
"yuml" => "ÿ", "szlig" => "ß",
|
|
58
|
+
"iexcl" => "¡", "iquest" => "¿", "cent" => "¢",
|
|
59
|
+
"pound" => "£", "yen" => "¥", "euro" => "€",
|
|
60
|
+
"curren" => "¤", "shy" => "",
|
|
61
|
+
"frac12" => "½", "frac14" => "¼", "frac34" => "¾",
|
|
62
|
+
"alpha" => "α", "beta" => "β", "gamma" => "γ",
|
|
63
|
+
"delta" => "δ", "epsilon" => "ε", "pi" => "π",
|
|
64
|
+
"sigma" => "σ", "omega" => "ω",
|
|
65
|
+
"ensp" => " ", "emsp" => " ", "thinsp" => " ",
|
|
66
|
+
"zwnj" => "", "zwj" => ""
|
|
67
|
+
}.freeze
|
|
68
|
+
|
|
69
|
+
ENTITY_RE = /&(?:#(?:x([0-9A-Fa-f]+)|(\d+))|([a-zA-Z][a-zA-Z0-9]+));/.freeze
|
|
70
|
+
|
|
71
|
+
# Decode a string containing HTML entities into plain UTF-8 text.
#
# Handles hexadecimal (`&#xH;`) and decimal (`&#N;`) numeric references
# plus the named entities listed in TABLE. Named entities that are not in
# TABLE are left exactly as written.
#
# Numeric references that do not denote a Unicode scalar value (zero, the
# surrogate range U+D800..U+DFFF, anything above U+10FFFF) are replaced by
# U+FFFD, following the HTML parsing rules. Without this, huge values make
# Array#pack raise RangeError and surrogates produce invalid UTF-8.
#
# @param s [String, nil] text to decode
# @return [String, nil] +s+ itself when nil or empty, otherwise a new String
def self.decode(s)
  return s if s.nil? || s.empty?

  s.to_s.gsub(ENTITY_RE) do
    match = Regexp.last_match
    hex_digits, dec_digits, name = match.captures
    if name
      TABLE[name] || match[0]
    else
      # ENTITY_RE guarantees exactly one capture is set, so this is numeric.
      code_point = hex_digits ? hex_digits.to_i(16) : dec_digits.to_i
      scalar = code_point.positive? &&
               code_point <= 0x10FFFF &&
               !(0xD800..0xDFFF).cover?(code_point)
      [scalar ? code_point : 0xFFFD].pack("U")
    end
  end
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
  # Common ancestor of the error classes declared below.
  class Error < StandardError
  end

  # Raised when a required field in the schema can't be found.
  class ExtractionError < Error
  end

  # Raised when a schema descriptor isn't valid for the native engine.
  class SchemaError < Error
  end
end
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Scrapetor
|
|
6
|
+
# Schema execution. The hot path operates on raw Nokolexbor nodes and
|
|
7
|
+
# inlines coercion — no Scrapetor::Node allocations per emitted field.
|
|
8
|
+
module Extractor
|
|
9
|
+
# Execute +schema+ against +scope+.
#
# Every entry of +schema.fields+ is resolved through extract_field and every
# entry of +schema.groups+ through run_group; results are keyed by the
# entry's +name+. +doc.base_url+ is forwarded when +doc+ exposes it,
# otherwise nil is used.
#
# @return [Hash] name => extracted value
def self.run(doc, scope, schema)
  base_url = (doc.base_url if doc.respond_to?(:base_url))
  extracted = {}
  schema.fields.each { |field| extracted[field.name] = extract_field(scope, field, base_url) }
  schema.groups.each { |group| extracted[group.name] = run_group(scope, group, base_url) }
  extracted
end
|
|
20
|
+
|
|
21
|
+
# Build one Hash per node matched by +group.selector+ beneath +scope+.
# Each Hash contains the group's own fields plus, recursively, one Array
# per nested group. Returns an empty Array when nothing matches.
def self.run_group(scope, group, base_url)
  rows = []
  scope.css(group.selector).each do |match|
    row = {}
    group.fields.each do |field|
      row[field.name] = extract_field(match, field, base_url)
    end
    group.groups.each do |nested|
      row[nested.name] = run_group(match, nested, base_url)
    end
    rows.push(row)
  end
  rows
end
|
|
31
|
+
|
|
32
|
+
# Resolve a single schema field against +scope+.
#
# Steps, in order:
#   1. extract (multi or single, trying each selector of the field),
#   2. substitute +f.default+ when nothing was found and a default exists,
#   3. raise ExtractionError when the field is required and still missing,
#   4. apply +f.transform+ to any non-nil value.
#
# @raise [ExtractionError] required field without a value
def self.extract_field(scope, f, base_url)
  selectors = f.selector.is_a?(Array) ? f.selector : [f.selector]
  value = f.multi ? extract_multi(scope, selectors, f, base_url) : extract_single(scope, selectors, f, base_url)

  # A multi field counts as missing when its result list is empty.
  absent = value.nil? || (f.multi && value.empty?)
  value = f.default if absent && !f.default.nil?

  if f.required
    still_absent = value.nil? || (f.multi && value.respond_to?(:empty?) && value.empty?)
    raise ExtractionError, "required field `#{f.name}` not found" if still_absent
  end

  # The transform runs last, after coercion and defaulting.
  return value if value.nil? || !f.transform

  f.transform.call(value)
end
|
|
53
|
+
|
|
54
|
+
# Return the coerced value of the first selector that yields a raw value.
# A nil selector means "use +scope+ itself". Selectors whose node or raw
# value is nil are skipped; nil is returned when none produce a value.
def self.extract_single(scope, selectors, f, base_url)
  selectors.each do |selector|
    node = selector ? scope.at_css(selector) : scope
    raw = node.nil? ? nil : extract_raw(node, f)
    return coerce(raw, f, base_url, node) unless raw.nil?
  end
  nil
end
|
|
64
|
+
|
|
65
|
+
# Collect the coerced values of every node matched by the first selector
# that produces at least one non-nil value. A nil selector means "use
# +scope+ itself". Returns an Array, empty when no selector produced
# anything.
def self.extract_multi(scope, selectors, f, base_url)
  collected = []
  selectors.each do |selector|
    candidates = selector ? scope.css(selector) : [scope]
    candidates.each do |node|
      raw = extract_raw(node, f)
      value = raw.nil? ? nil : coerce(raw, f, base_url, node)
      collected << value unless value.nil?
    end
    # Later selectors are fallbacks only; stop at the first one that hit.
    break unless collected.empty?
  end
  collected
end
|
|
79
|
+
|
|
80
|
+
# Pull the uncoerced value for field +f+ out of +node+:
# the named attribute when +f.attr_str+ is set, the node's inner HTML for
# :html fields without an attribute, and the node's text otherwise.
def self.extract_raw(node, f)
  attribute = f.attr_str
  return node[attribute] if attribute

  attribute.nil? && f.type == :html ? node.inner_html : node.text
end
|
|
88
|
+
|
|
89
|
+
# Convert a raw extracted value to the type declared by field +f+.
#
# The value is first passed through Cleaner.clean when +f.clean+ is set,
# then dispatched on +f.type+. :text, :html and unrecognised types are
# returned unchanged. +_node+ is accepted but not used.
#
# @return [Object, nil] nil when +raw+ is nil or the coercion fails
def self.coerce(raw, f, base_url, _node)
  return nil if raw.nil?

  value = f.clean ? Cleaner.clean(raw) : raw
  case f.type
  when :integer then int_coerce(value)
  when :float   then float_coerce(value)
  when :money   then Money.parse(value)
  when :date    then date_coerce(value)
  when :json    then json_coerce(value)
  when :boolean then bool_coerce(value)
  when :list    then list_coerce(value, f.delimiter)
  when :url
    f.normalize_url ? URL.absolute(value, base_url) : value
  else
    # :text, :html (already inner_html) and unknown types pass through.
    value
  end
end
|
|
107
|
+
|
|
108
|
+
# Coerce free-form text to an Integer.
#
# Everything except ASCII digits and "-" is discarded before parsing, so
# "1,234 items" becomes 1234. Returns nil when no digit survives; this
# covers "", "-" and dash-only leftovers such as "a-b-c", which would
# otherwise parse as 0.
#
# @param v [#to_s]
# @return [Integer, nil]
def self.int_coerce(v)
  digits = v.to_s.gsub(/[^\d\-]/, "")
  digits.match?(/\d/) ? digits.to_i : nil
end
|
|
112
|
+
|
|
113
|
+
# Coerce free-form text to a Float.
#
# Everything except ASCII digits, "." and "-" is discarded before parsing,
# so "$1,234.50" becomes 1234.5. Returns nil when no digit survives; this
# covers "", "-", "." and punctuation-only leftovers such as "..." or
# "-.", which would otherwise parse as 0.0.
#
# @param v [#to_s]
# @return [Float, nil]
def self.float_coerce(v)
  numeric = v.to_s.gsub(/[^\d.\-]/, "")
  numeric.match?(/\d/) ? numeric.to_f : nil
end
|
|
117
|
+
|
|
118
|
+
# Parse +v+ (via #to_s) with Date.parse.
# Returns a ::Date, or nil when parsing raises ArgumentError or TypeError.
# The date library is loaded lazily on first use.
def self.date_coerce(v)
  require "date"
  begin
    ::Date.parse(v.to_s)
  rescue ::ArgumentError, ::TypeError
    nil
  end
end
|
|
124
|
+
|
|
125
|
+
# Parse +v+ (via #to_s) as JSON.
# Returns the decoded structure, or nil when JSON::ParserError is raised.
def self.json_coerce(v)
  text = v.to_s
  begin
    JSON.parse(text)
  rescue JSON::ParserError
    nil
  end
end
|
|
130
|
+
|
|
131
|
+
TRUTHY_STRINGS = %w[true yes 1 on enabled].freeze
|
|
132
|
+
FALSY_STRINGS = %w[false no 0 off disabled].freeze
|
|
133
|
+
private_constant :TRUTHY_STRINGS, :FALSY_STRINGS
|
|
134
|
+
|
|
135
|
+
# Coerce text to a boolean.
#
# The input is stringified, stripped and downcased, then looked up in
# TRUTHY_STRINGS / FALSY_STRINGS. Anything else yields nil.
# The former trailing `s == "yes"` check was unreachable, because "yes" is
# already a member of TRUTHY_STRINGS; it has been removed.
#
# @param v [#to_s]
# @return [true, false, nil]
def self.bool_coerce(v)
  normalized = v.to_s.strip.downcase
  return true if TRUTHY_STRINGS.include?(normalized)
  return false if FALSY_STRINGS.include?(normalized)

  nil
end
|
|
142
|
+
|
|
143
|
+
# Split +v+ (via #to_s) on +delimiter+, strip each piece and drop the
# pieces that end up empty. Returns an Array of Strings.
def self.list_coerce(v, delimiter)
  pieces = v.to_s.split(delimiter)
  trimmed = pieces.map { |piece| piece.strip }
  trimmed.reject { |piece| piece.empty? }
end
|
|
146
|
+
end
|
|
147
|
+
end
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# Native HTTP/2-capable fetch layer. Wraps the libcurl-backed
|
|
5
|
+
# Scrapetor::Native::Http module. Distinct from Scrapetor::HTTP /
|
|
6
|
+
# Scrapetor.fetch, which is the Net::HTTP-based fallback used by
|
|
7
|
+
# tests, the CLI, and environments without libcurl.
|
|
8
|
+
#
|
|
9
|
+
# Capabilities depend on the libcurl your gem links against. Inspect
|
|
10
|
+
# Scrapetor::Fetcher.features to see what's actually wired up
|
|
11
|
+
# (HTTP/2 / brotli / zstd / libz). HTTP/2 + gzip is the typical
|
|
12
|
+
# baseline on macOS; brotli/zstd need a libcurl rebuilt with them.
|
|
13
|
+
#
|
|
14
|
+
# The connection cache lives on a per-OS-thread libcurl easy handle —
|
|
15
|
+
# repeated fetches to the same host inside a single thread reuse the
|
|
16
|
+
# TLS session and HTTP/2 stream. The fetch itself drops the GVL, so
|
|
17
|
+
# background Ruby threads keep running during the round-trip.
|
|
18
|
+
#
|
|
19
|
+
# resp = Scrapetor::Fetcher.get("https://api.example.com/items")
|
|
20
|
+
# resp[:status] # => 200
|
|
21
|
+
# resp[:http_version] # => "2"
|
|
22
|
+
# resp[:headers] # => {"content-type" => "application/json", ...}
|
|
23
|
+
# resp[:body] # => "..."
|
|
24
|
+
#
|
|
25
|
+
# doc = Scrapetor::Fetcher.fetch("https://example.com/")
|
|
26
|
+
# # => Scrapetor::Document parsed from the response body, base_url
|
|
27
|
+
# # set to the final URL after redirects.
|
|
28
|
+
module Fetcher
|
|
29
|
+
# Raised by ensure_available! when the native HTTP layer is not usable.
class NotAvailableError < StandardError
end

# Raised by Fetcher.fetch for responses it rejects. Carries the HTTP status
# and the full response Hash so callers can inspect what came back.
class FetchError < StandardError
  # HTTP status supplied at construction (nil when not given).
  attr_reader :status
  # Response object supplied at construction (nil when not given).
  attr_reader :response

  def initialize(msg, status: nil, response: nil)
    @status = status
    @response = response
    super(msg)
  end
end
|
|
38
|
+
|
|
39
|
+
DEFAULT_USER_AGENT = "scrapetor/#{Scrapetor::VERSION} (libcurl)"
|
|
40
|
+
|
|
41
|
+
# Status codes worth retrying. 408 (timeout), 425 (early), 429
|
|
42
|
+
# (rate-limit), 500–504 (transient upstream). 5xx >= 505 are
|
|
43
|
+
# protocol-level rejections; they don't usually heal on retry.
|
|
44
|
+
DEFAULT_RETRY_STATUSES = [408, 425, 429, 500, 502, 503, 504].freeze
|
|
45
|
+
|
|
46
|
+
# Delay, in seconds, to wait before retry number +attempt+ (1-indexed).
#
# A positive +retry_after+ wins: it is converted with #to_f and limited to
# +max+. Otherwise the delay is drawn at random from
# [0, min(base * 2**(attempt - 1), max)), i.e. exponential backoff with
# full jitter.
def self.backoff_for(attempt, base: 0.3, max: 10.0, retry_after: nil)
  if retry_after
    server_delay = retry_after.to_f
    return [server_delay, max].min if server_delay > 0
  end

  ceiling = [base * (2.0**(attempt - 1)), max].min
  ceiling * rand
end
|
|
54
|
+
|
|
55
|
+
# True when the response's :status matches an entry of +retry_statuses+.
#
# Entries are matched with ===, so a list may mix plain Integers with
# Ranges such as 500..504. For Integer entries this is identical to ==.
#
# @param resp [Hash] response carrying :status
# @param retry_statuses [Array<Integer, Range>]
# @return [Boolean]
def self.retryable_response?(resp, retry_statuses)
  status = resp[:status]
  retry_statuses.any? { |candidate| candidate === status }
end
|
|
58
|
+
|
|
59
|
+
# Read a Retry-After value from +headers+ (looked up as "retry-after",
# then "Retry-After").
#
# Only the integer-seconds form is honoured; an HTTP-date value, or any
# other non-digit text, yields nil. Returns nil as well when +headers+ is
# nil or the header is absent.
def self.parse_retry_after(headers)
  return nil unless headers

  raw = headers["retry-after"] || headers["Retry-After"]
  return nil unless raw
  return nil unless /\A\d+\z/.match?(raw.to_s.strip)

  raw.to_i
end
|
|
67
|
+
|
|
68
|
+
# Whether the native HTTP layer can be used: the value of
# Scrapetor::Native::Http::AVAILABLE, or nil when that constant is not
# defined at all.
def self.available?
  return nil unless defined?(Scrapetor::Native::Http::AVAILABLE)

  Scrapetor::Native::Http::AVAILABLE
end
|
|
72
|
+
|
|
73
|
+
# Feature report of the native HTTP layer, exactly as returned by
# Scrapetor::Native::Http.features. Raises NotAvailableError (through
# ensure_available!) when the native layer is missing.
def self.features
  ensure_available!
  native_http = Scrapetor::Native::Http
  native_http.features
end
|
|
77
|
+
|
|
78
|
+
# Single request through the native layer, with optional retry and
# exponential backoff. The other verb helpers route through here by
# passing +method:+.
#
#   retry: 0       - try once and return whatever happens (default).
#   retry: N       - retry up to N times on transient failure.
#   backoff:       - base backoff in seconds (default 0.3).
#   max_backoff:   - cap on a single sleep (default 10.0).
#   retry_on:      - statuses to retry (default DEFAULT_RETRY_STATUSES).
#
# Any remaining options, plus a default :user_agent, are handed to
# Scrapetor::Native::Http.get unchanged.
#
# An IOError raised by the native call is retried as well. The wait between
# attempts honours a numeric Retry-After response header when present and
# otherwise uses exponential backoff with full jitter (see backoff_for).
#
# @return [Hash] the native response; when retries run out on a retryable
#   status, that last response is returned rather than raised
# @raise [IOError] the last transport error once retries are exhausted
# @raise [NotAvailableError] when the native layer is missing
def self.get(url, **opts)
  ensure_available!
  retries     = opts.delete(:retry) || 0
  base        = opts.delete(:backoff) || 0.3
  max_backoff = opts.delete(:max_backoff) || 10.0
  retry_on    = opts.delete(:retry_on) || DEFAULT_RETRY_STATUSES
  opts[:user_agent] ||= DEFAULT_USER_AGENT

  attempt = 0
  loop do
    retry_after = nil
    begin
      resp = Scrapetor::Native::Http.get(url.to_s, opts)
      return resp unless attempt < retries && retryable_response?(resp, retry_on)

      retry_after = parse_retry_after(resp[:headers])
    rescue IOError
      # Out of retries: let the error in flight propagate untouched.
      raise unless attempt < retries
    end
    attempt += 1
    sleep backoff_for(attempt, base: base, max: max_backoff, retry_after: retry_after)
  end
end
|
|
117
|
+
|
|
118
|
+
# Fetch + parse. Performs get(url, **opts) and hands the response body to
# Scrapetor.parse with base_url set to the response's :final_url.
#
# By default a FetchError is raised when the status is below 200 or is 400
# and above (3xx statuses are accepted). Pass raise_for_status: false to
# parse whatever came back regardless of status.
def self.fetch(url, raise_for_status: true, **opts)
  resp = get(url, **opts)
  if raise_for_status
    status = resp[:status]
    unless status >= 200 && status < 400
      raise FetchError.new(
        "Scrapetor::Fetcher.fetch #{url} -> HTTP #{status}",
        status: status, response: resp
      )
    end
  end
  Scrapetor.parse(resp[:body], base_url: resp[:final_url])
end
|
|
130
|
+
|
|
131
|
+
# Guard used by every public entry point: raises NotAvailableError with
# build instructions unless available? is truthy. Returns nil otherwise.
def self.ensure_available!
  unless available?
    raise NotAvailableError,
          "Scrapetor::Fetcher requires libcurl at build time. " \
          "Reinstall after `brew install curl` / `apt-get install libcurl4-openssl-dev`."
  end

  nil
end
|
|
137
|
+
|
|
138
|
+
# Batch GET with optional retry. Each round hands the still-pending URLs to
# Scrapetor::Native::Http.parallel_fetch in one call.
#
#   results = Scrapetor::Fetcher.parallel_get(urls, threads: 8,
#                                             timeout_ms: 5_000)
#
# Returns an Array of response Hashes in input order. Entries with an
# :error key, or with a status listed in +retry_on+, are re-dispatched
# together in the next round while retries remain; everything else is
# final. There is one sleep between rounds, not one per URL.
#
# Retry-After headers are read from the responses being retried and the
# largest value governs the next sleep, so a rate-limited host slows the
# whole batch down to its pace. Without any Retry-After, the sleep comes
# from backoff_for.
#
# Accepts the same retry:, backoff:, max_backoff: and retry_on: options as
# get; remaining options go to the native call unchanged.
#
# NOTE(review): the original notes describe the native side as pthread
# workers running under a single GVL release, with failed entries shaped
# like { error: { url:, error: } }. Neither is visible from this file.
def self.parallel_get(urls, **opts)
  ensure_available!
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  retries     = opts.delete(:retry) || 0
  base        = opts.delete(:backoff) || 0.3
  max_backoff = opts.delete(:max_backoff) || 10.0
  retry_on    = opts.delete(:retry_on) || DEFAULT_RETRY_STATUSES
  opts[:user_agent] ||= DEFAULT_USER_AGENT

  results = Array.new(targets.size)
  pending = (0...targets.size).to_a
  attempt = 0
  loop do
    responses = Scrapetor::Native::Http.parallel_fetch(pending.map { |index| targets[index] }, opts)
    retry_indexes = []
    longest_wait = nil
    pending.each_with_index do |target_index, position|
      response = responses[position]
      unless attempt < retries && retry_eligible?(response, retry_on)
        results[target_index] = response
        next
      end

      headers = response[:headers]
      wait = headers ? parse_retry_after(headers) : nil
      longest_wait = [longest_wait, wait].compact.max
      retry_indexes << target_index
    end
    break if retry_indexes.empty?

    pending = retry_indexes
    attempt += 1
    sleep backoff_for(attempt, base: base, max: max_backoff, retry_after: longest_wait)
  end
  results
end
|
|
195
|
+
|
|
196
|
+
# Whether a batch result +r+ should be retried: always when it carries an
# :error (transport-level failure), otherwise when its :status matches an
# entry of +retry_on+.
#
# Entries are matched with ===, so +retry_on+ may mix Integers with Ranges
# such as 500..504. For Integer entries this is identical to ==.
#
# @return [Boolean, nil] nil when the result has neither :error nor :status
def self.retry_eligible?(r, retry_on)
  return true if r[:error] # network-level

  status = r[:status]
  status && retry_on.any? { |candidate| candidate === status }
end
|
|
200
|
+
|
|
201
|
+
# Bulk fetch through Scrapetor::Native::Http.multi_fetch. URLs are
# stringified, a default :user_agent is filled in, and the native result is
# returned as-is. An empty URL list returns [] without touching the native
# layer.
#
# Complements parallel_get. Per the original author's notes (the native
# implementation is not visible from here):
#
#   parallel_get - N pthread workers, each running a blocking transfer.
#                  Suited to batches with CPU work after the fetch
#                  (decode + parse).
#
#   multi_get    - one driver thread multiplexing N in-flight transfers
#                  via curl_multi. Suited to I/O-dominated, high-fan-out
#                  batches (hundreds of URLs across many hosts).
#
# Both are described as sharing one connection / DNS / TLS-session pool.
def self.multi_get(urls, **opts)
  ensure_available!
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  opts[:user_agent] = DEFAULT_USER_AGENT unless opts[:user_agent]
  Scrapetor::Native::Http.multi_fetch(targets, opts)
end
|
|
224
|
+
|
|
225
|
+
# Bulk-revalidate cached entries. Sends every URL through
# Scrapetor::Native::Http.multi_fetch with method: :head and the given
# +cache_dir+, then classifies each result:
#
#   :error   - the result carries an :error (transport failure).
#   :fresh   - the "x-scrapetor-cache" response header equals "hit".
#   :missing - the status is 400 or above (5xx included).
#   :changed - anything else.
#
# Returns a Hash[url => Symbol]; {} for an empty URL list.
#
# NOTE(review): the original notes say the native layer issues conditional
# requests (If-None-Match / If-Modified-Since), marks a 304 as a cache hit
# and rewrites the cache on a new 2xx. None of that is visible from this
# file.
def self.revalidate(urls, cache_dir:, **opts)
  ensure_available!
  targets = Array(urls).map(&:to_s)
  return {} if targets.empty?

  opts[:user_agent] ||= DEFAULT_USER_AGENT
  request_opts = opts.merge(method: :head, cache_dir: cache_dir)
  responses = Scrapetor::Native::Http.multi_fetch(targets, request_opts)

  verdicts = {}
  responses.each_with_index do |response, index|
    headers = response[:headers]
    status = response[:status]
    verdicts[targets[index]] =
      if response[:error] then :error
      elsif headers && headers["x-scrapetor-cache"] == "hit" then :fresh
      elsif status && status >= 400 then :missing
      else :changed
      end
  end
  verdicts
end
|
|
260
|
+
|
|
261
|
+
# multi_get + per-response parse. Calls the native multi_fetch with
# parse: true and returns Array<Scrapetor::Document | nil> mapped
# one-to-one from the native results. Entries carrying :error become nil.
#
# When the native result includes a pre-parsed :document it is wrapped
# directly; otherwise the :body is parsed with Scrapetor.parse. Either way
# base_url is the response's :final_url.
#
# NOTE(review): the original notes state the native parse happens in the
# same no-GVL window as the transfers; not verifiable from this file.
def self.multi_fetch(urls, **opts)
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  ensure_available!
  opts[:user_agent] ||= DEFAULT_USER_AGENT
  responses = Scrapetor::Native::Http.multi_fetch(targets, opts.merge(parse: true))
  responses.map do |response|
    if response[:error]
      nil
    elsif (native = response[:document])
      Scrapetor::Document.new("", base_url: response[:final_url], native: native)
    else
      Scrapetor.parse(response[:body], base_url: response[:final_url])
    end
  end
end
|
|
279
|
+
|
|
280
|
+
# Streaming variant of multi_get: builds a
# Scrapetor::Native::Http::MultiBatch and yields each response it hands
# back from #next until #next returns a falsy value. Without a block an
# Enumerator is returned instead.
#
# Per the original notes, responses arrive in completion order rather than
# input order, so the caller can start processing while other transfers
# are still in flight, and parse: true makes the worker parse the body too
# (native behaviour, not visible from this file).
#
#   Scrapetor::Fetcher.multi_each(urls, threads: 8) do |r|
#     puts r[:final_url], r[:status]
#   end
def self.multi_each(urls, **opts)
  return enum_for(:multi_each, urls, **opts) unless block_given?

  ensure_available!
  targets = Array(urls).map(&:to_s)
  return if targets.empty?

  opts[:user_agent] ||= DEFAULT_USER_AGENT
  batch = Scrapetor::Native::Http::MultiBatch.new(targets, opts)
  response = batch.next
  while response
    yield response
    response = batch.next
  end
end
|
|
300
|
+
|
|
301
|
+
# POST shorthand: a get call with method: :post, plus the body sugar that
# POST almost always needs.
#
# With +multipart+, the value is forwarded as the :multipart option and
# +body+ / +form+ / +json+ are ignored. Otherwise the request body and
# headers are prepared by build_body from whichever of +body+, +form+ or
# +json+ was given.
def self.post(url, body: nil, form: nil, json: nil, multipart: nil, **opts)
  return get(url, **opts, multipart: multipart, method: :post) if multipart

  payload, request_opts = build_body(body, form, json, opts)
  get(url, **request_opts, method: :post, body: payload)
end
|
|
313
|
+
|
|
314
|
+
# Convenience constructor for a multipart value backed by a file on disk.
# Returns { path: } with the path stringified, plus :filename and
# :content_type only when those were supplied.
def self.upload_file(path, filename: nil, content_type: nil)
  { path: path.to_s, filename: filename, content_type: content_type }
    .select { |key, value| key == :path || value }
end
|
|
321
|
+
|
|
322
|
+
# Convenience constructor for a multipart value backed by in-memory bytes.
# +filename+ is mandatory; +content_type+ defaults to
# "application/octet-stream".
def self.upload_bytes(bytes, filename:, content_type: "application/octet-stream")
  part = { data: bytes }
  part[:filename] = filename
  part[:content_type] = content_type
  part
end
|
|
325
|
+
|
|
326
|
+
# PUT shorthand: prepares the body and headers with build_body (from
# +body+, +form+ or +json+) and issues the request through get with
# method: :put.
def self.put(url, body: nil, form: nil, json: nil, **opts)
  payload, request_opts = build_body(body, form, json, opts)
  get(url, **request_opts, method: :put, body: payload)
end
|
|
330
|
+
|
|
331
|
+
# PATCH shorthand: prepares the body and headers with build_body (from
# +body+, +form+ or +json+) and issues the request through get with
# method: :patch.
def self.patch(url, body: nil, form: nil, json: nil, **opts)
  payload, request_opts = build_body(body, form, json, opts)
  get(url, **request_opts, method: :patch, body: payload)
end
|
|
335
|
+
|
|
336
|
+
# DELETE shorthand: get with method: :delete forced on top of +opts+.
def self.delete(url, **opts)
  get(url, **opts, method: :delete)
end
|
|
339
|
+
|
|
340
|
+
# HEAD shorthand: get with method: :head forced on top of +opts+.
def self.head(url, **opts)
  get(url, **opts, method: :head)
end
|
|
343
|
+
|
|
344
|
+
# Build the request body from one of :body / :form / :json.
#
# +json+ takes precedence over +form+, which takes precedence over a raw
# +body+. +json+ is serialised with JSON.generate, +form+ with
# URI.encode_www_form; a raw +body+ is passed through untouched.
#
# A matching Content-Type header is added only when the caller has not
# already supplied one. The check ignores the case of the header name, so
# an existing "content-type" entry is respected and no second, conflicting
# "Content-Type" is added.
#
# The caller's :headers Hash is never mutated (a copy is stored back into
# +opts+); +opts+ itself is updated in place.
#
# @return [Array(String|nil, Hash)] [body_string, opts_with_headers_set]
def self.build_body(body, form, json, opts)
  headers = (opts[:headers] || {}).dup
  # A header present with a nil/false value counts as absent, as before.
  has_content_type = headers.any? do |name, value|
    value && name.to_s.casecmp?("content-type")
  end

  if json
    require "json"
    body = JSON.generate(json)
    headers["Content-Type"] = "application/json" unless has_content_type
  elsif form
    require "uri"
    body = URI.encode_www_form(form)
    headers["Content-Type"] = "application/x-www-form-urlencoded" unless has_content_type
  end
  opts[:headers] = headers unless headers.empty?
  [body, opts]
end
|
|
360
|
+
|
|
361
|
+
# Convenience: batch fetch + parse. Calls the native parallel_fetch with
# parse: true and returns Array<Scrapetor::Document | nil> mapped
# one-to-one from the native results. Entries carrying :error become nil.
#
# When the native result includes a pre-parsed :document it is wrapped
# directly, so the Ruby side only wraps already-parsed documents.
# Otherwise the :body is parsed with Scrapetor.parse. Either way base_url
# is the response's :final_url.
#
# NOTE(review): the original notes state that the parse runs inside the
# same no-GVL worker that did the fetch, so the whole batch runs
# multi-core under a single GVL release; not verifiable from this file.
def self.parallel_fetch(urls, **opts)
  targets = Array(urls).map(&:to_s)
  return [] if targets.empty?

  ensure_available!
  opts[:user_agent] ||= DEFAULT_USER_AGENT
  responses = Scrapetor::Native::Http.parallel_fetch(targets, opts.merge(parse: true))
  responses.map do |response|
    if response[:error]
      nil
    elsif (native = response[:document])
      Scrapetor::Document.new("", base_url: response[:final_url], native: native)
    else
      Scrapetor.parse(response[:body], base_url: response[:final_url])
    end
  end
end
|
|
382
|
+
end
|
|
383
|
+
|
|
384
|
+
# Top-level shorthand for Fetcher.fetch (the libcurl path); every argument
# is forwarded untouched. Kept separate from the Net::HTTP-based
# Scrapetor.fetch so callers opt in to the native layer explicitly.
def self.fetch_http2(url, **opts)
  Fetcher.fetch(url, **opts)
end
|
|
390
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Scrapetor
|
|
4
|
+
# Structural fingerprint of a DOM subtree.
#
# The fingerprint is a 64-bit rolling hash over the tag names of the
# element descendants of a node, visited depth-first in document order and
# limited to the top `depth` levels. The node's own tag, text nodes and
# attributes do not take part.
#
# Tag names are hashed with FNV-1a instead of String#hash. Ruby seeds
# String#hash differently in every process, which would make fingerprints
# from separate runs incomparable. With FNV-1a the same structure always
# yields the same number.
module Fingerprint
  MASK = 0xFFFFFFFFFFFFFFFF

  # FNV-1a 64-bit offset basis and prime.
  FNV_OFFSET_BASIS = 0xcbf29ce484222325
  FNV_PRIME = 0x100000001b3
  private_constant :FNV_OFFSET_BASIS, :FNV_PRIME

  # Fingerprint of +node+'s element descendants.
  #
  # @param node [Object] a node responding to #children, or a wrapper
  #   exposing such a node through #backing_node
  # @param depth [Integer] number of levels below +node+ to include
  # @return [Integer] value in 0..MASK; 0 when no element was visited
  def self.structural(node, depth: 4)
    backing = node.respond_to?(:backing_node) ? node.backing_node : node
    h = 0
    walk(backing, depth) do |tag|
      h = (h * 1_315_423_911 + tag_hash(tag)) & MASK
    end
    h
  end

  # Depth-first traversal: yields the name of every element child of +nlx+,
  # then descends into it, for at most +depth+ levels. Objects without
  # #children, and children that are not elements, are skipped.
  def self.walk(nlx, depth, &block)
    return if depth <= 0
    return unless nlx.respond_to?(:children)
    nlx.children.each do |c|
      next unless c.respond_to?(:element?) && c.element?
      block.call(c.name)
      walk(c, depth - 1, &block)
    end
  end

  # Process-independent 64-bit FNV-1a hash of a tag name.
  def self.tag_hash(tag)
    tag.to_s.each_byte.reduce(FNV_OFFSET_BASIS) do |acc, byte|
      ((acc ^ byte) * FNV_PRIME) & MASK
    end
  end
  private_class_method :tag_hash
end
|
|
29
|
+
end
|