fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "cgi"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "nokogiri"
|
|
6
|
+
require "uri"
|
|
7
|
+
|
|
8
|
+
module FetchUtil
|
|
9
|
+
class RawDocsFallback
|
|
10
|
+
DEFAULT_HEADERS = {
|
|
11
|
+
"User-Agent" => Browser::DEFAULT_USER_AGENT,
|
|
12
|
+
"Accept-Language" => Browser::DEFAULT_ACCEPT_LANGUAGE
|
|
13
|
+
}.freeze
|
|
14
|
+
|
|
15
|
+
BLOCK_ELEMENTS = %w[h1 h2 h3 h4 h5 h6 p pre ul ol li table tr].freeze
|
|
16
|
+
BLOCK_SELECTOR = BLOCK_ELEMENTS.join(", ").freeze
|
|
17
|
+
DROP_SELECTORS = [
|
|
18
|
+
"script",
|
|
19
|
+
"style",
|
|
20
|
+
"nav",
|
|
21
|
+
"aside",
|
|
22
|
+
"footer",
|
|
23
|
+
".toc",
|
|
24
|
+
".sidebar",
|
|
25
|
+
".breadcrumbs",
|
|
26
|
+
"[aria-label*='breadcrumb']",
|
|
27
|
+
".headerlink",
|
|
28
|
+
".copybutton",
|
|
29
|
+
"button"
|
|
30
|
+
].freeze
|
|
31
|
+
DOCS_ROOT_SELECTORS = [
|
|
32
|
+
"main article",
|
|
33
|
+
"main",
|
|
34
|
+
"article",
|
|
35
|
+
"[role='main']",
|
|
36
|
+
".content",
|
|
37
|
+
".resource-container"
|
|
38
|
+
].freeze
|
|
39
|
+
PRUNED_TEXT_PATTERN = /\A(?:on this page|table of contents|edit this page|copy page|copy item path|search|settings|help|expand description)\z/i
|
|
40
|
+
|
|
41
|
+
# Creates a fallback fetcher.
#
# @param timeout [#to_i] converted with +to_i+ and stored; #fetch_html passes
#   it to Net::HTTP as both the open and the read timeout
def initialize(timeout: 20)
  @timeout = timeout.to_i
end
|
|
44
|
+
|
|
45
|
+
# Downloads +url+ and converts the HTML into a payload hash.
#
# This is a best-effort fallback: transport, TLS, and URL failures are
# reported as +nil+ rather than raised.
#
# @param url [String] address to download
# @return [Array(String, Hash), nil] the final URL (after redirects) and the
#   payload built by #payload_from_html, or nil when the download fails or no
#   usable payload could be built
def fetch(url)
  final_url, html = fetch_html(url)
  payload = payload_from_html(html, requested_url: url, final_url: final_url)
  return nil unless payload

  [final_url, payload]
rescue Error, SocketError, SystemCallError, Timeout::Error, IOError, OpenSSL::SSL::SSLError, URI::Error
  # IOError covers EOFError from a dropped connection; OpenSSL::SSL::SSLError
  # covers failed HTTPS handshakes (OpenSSL is made available by net/http);
  # URI::Error covers InvalidURIError as well as BadURIError from URI.join.
  nil
end
|
|
54
|
+
|
|
55
|
+
# Parses +html+ and builds the payload hash describing its main content.
#
# @param html [String] raw HTML document
# @param requested_url [String] URL originally asked for (only used as the
#   default for +final_url+)
# @param final_url [String] URL the document was actually served from; its
#   fragment, if any, selects the section to extract
# @return [Hash, nil] payload with string keys, or nil when no root node is
#   found or the generated markdown is shorter than 40 characters
def payload_from_html(html, requested_url:, final_url: requested_url)
  document = Nokogiri::HTML(html)
  root = fragment_root(document, final_url) || docs_root(document)
  return nil unless root

  prune!(root)
  raw_title = fragment_title(document, final_url) || first_heading(root) || meta_title(document) || document.title
  title = clean_text(raw_title)
  markdown = markdown_from_root(root, title)
  return nil if clean_text(markdown).length < 40

  byline = meta_value(document, "author")
  excerpt = first_paragraph(root)
  site_name = meta_value(document, "og:site_name", attr: "property") || safe_host(final_url)
  published = meta_value(document, "article:published_time", attr: "property") || meta_value(document, "publish-date")
  canonical = canonical_url(document, final_url)
  language = document.at_css("html")&.[]("lang") || "en"

  {
    "title" => title,
    "byline" => byline,
    "excerpt" => excerpt,
    "siteName" => site_name,
    "publishedTime" => published,
    "canonicalUrl" => canonical,
    "language" => language,
    "html" => root.to_html,
    "markdown" => markdown,
    "readerMode" => false,
    "contentType" => "article",
    "suspect" => false,
    "warnings" => []
  }
end
|
|
81
|
+
|
|
82
|
+
private
|
|
83
|
+
|
|
84
|
+
# Reads the +content+ attribute of the first <meta> tag whose +attr+
# attribute equals +name+.
#
# @return [String, nil] the content value, or nil when no such tag exists
def meta_value(document, name, attr: "name")
  tag = document.at_css(%(meta[#{attr}="#{name}"]))
  tag && tag["content"]
end
|
|
87
|
+
|
|
88
|
+
# Resolves the document's <link rel="canonical"> against +final_url+.
#
# @param document [#at_css] parsed document
# @param final_url [String] URL the document was served from
# @return [String] the absolute canonical URL, or +final_url+ without its
#   fragment when no canonical link exists or it cannot be resolved
def canonical_url(document, final_url)
  href = document.at_css('link[rel="canonical"]')&.[]("href")
  return strip_fragment(final_url) if href.nil? || href.empty?

  URI.join(final_url, href).to_s
rescue URI::Error
  # URI::Error also covers URI::BadURIError, which URI.join raises when the
  # base URL is relative; InvalidURIError alone let that case escape.
  strip_fragment(final_url)
end
|
|
96
|
+
|
|
97
|
+
# Extracts the percent-decoded fragment (the part after "#") from +url+.
#
# @param url [String]
# @return [String] decoded fragment; "" when there is none or the URL does
#   not parse
def fragment_id(url)
  fragment = URI.parse(url).fragment.to_s
  # CGI.unescape performs form decoding, which turns "+" into a space. In a
  # URL fragment "+" is a literal character (e.g. "#c++"), so it is protected
  # before decoding.
  CGI.unescape(fragment.gsub("+", "%2B"))
rescue URI::InvalidURIError
  ""
end
|
|
103
|
+
|
|
104
|
+
# Picks the subtree that the URL fragment points at.
#
# For a legacy <a name="..."> anchor, the siblings that follow it (up to the
# next named anchor) are copied into a fresh <div>; that div is used when it
# holds at least 40 characters of text. Otherwise the method walks up from the
# target node until an ancestor has at least 80 characters of text or carries
# the fragment id itself.
#
# @return [Nokogiri::XML::Node, nil] a detached copy, or nil when the URL has
#   no fragment or no node matches it
def fragment_root(document, url)
  id = fragment_id(url)
  return nil if id.empty?

  node = fragment_node(document, id)
  return nil unless node

  if node.name == "a" && node["name"] == id
    section = Nokogiri::XML::Node.new("div", document)
    following = node.next_sibling
    until following.nil? || (following.element? && following.name == "a" && following["name"])
      section.add_child(following.dup)
      following = following.next_sibling
    end
    return section unless clean_text(section.text).length < 40
  end

  ancestor = node
  loop do
    break unless ancestor&.element?
    return ancestor.dup if clean_text(ancestor.text).length >= 80 || ancestor["id"] == id

    ancestor = ancestor.parent
  end

  node.dup
end
|
|
132
|
+
|
|
133
|
+
# Finds a heading for the section that the URL fragment points at.
#
# @return [String, nil] cleaned heading text; nil when the URL has no
#   fragment, no node matches it, or no non-blank heading is found. Returning
#   nil (not "") lets callers fall back with +||+.
def fragment_title(document, url)
  id = fragment_id(url)
  return nil if id.empty?

  node = fragment_node(document, id)
  return nil unless node

  heading =
    if node.name.match?(/\Ah[1-6]\z/) # anchored: only real h1..h6 elements
      node
    elsif node.name == "a" && node["name"] == id
      node.at_xpath("following-sibling::*[1][self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]") ||
        node.at_xpath("following-sibling::*[1]//strong[1]")
    else
      node.at_css("h1, h2, h3, h4, h5, h6")
    end

  title = clean_text(heading&.text)
  title unless title.nil? || title.empty?
end
|
|
150
|
+
|
|
151
|
+
# Chooses the main content node of a documentation page.
#
# Tries DOCS_ROOT_SELECTORS in order and takes the first match holding at
# least 120 characters of cleaned text; falls back to <body>.
#
# @return [Object, nil] a +dup+ of the chosen node, or nil when the document
#   has no <body> either
def docs_root(document)
  chosen = DOCS_ROOT_SELECTORS.lazy
                              .map { |selector| document.at_css(selector) }
                              .find { |node| node && clean_text(node.text).length >= 120 }
  (chosen || document.at_css("body"))&.dup
end
|
|
160
|
+
|
|
161
|
+
# Strips page chrome from +root+ in place.
#
# First removes everything matching DROP_SELECTORS, then removes every
# remaining element whose entire cleaned text matches PRUNED_TEXT_PATTERN.
def prune!(root)
  DROP_SELECTORS.each do |selector|
    root.css(selector).remove
  end

  root.css("*").each do |element|
    element.remove if clean_text(element.text).match?(PRUNED_TEXT_PATTERN)
  end
end
|
|
168
|
+
|
|
169
|
+
# Returns the Open Graph title (<meta property="og:title">) when present,
# otherwise the document's own title.
def meta_title(document)
  og_title = meta_value(document, "og:title", attr: "property")
  og_title || document.title
end
|
|
172
|
+
|
|
173
|
+
# Returns the cleaned text of the first h1/h2/h3 under +root+.
#
# @return [String, nil] heading text, or nil when there is no heading or its
#   text is blank. Returning nil (not "") lets callers fall back with +||+.
def first_heading(root)
  heading = clean_text(root.at_css("h1, h2, h3")&.text)
  heading unless heading.nil? || heading.empty?
end
|
|
176
|
+
|
|
177
|
+
# Returns the cleaned text of the first <p> under +root+ that is at least 30
# characters long, or nil when no paragraph qualifies.
def first_paragraph(root)
  root.css("p").each do |paragraph|
    candidate = clean_text(paragraph.text)
    return candidate if candidate.length >= 30
  end
  nil
end
|
|
180
|
+
|
|
181
|
+
# Renders the block-level elements under +root+ as simple markdown.
#
# Headings become "#"-prefixed lines, paragraphs plain text, <pre> fenced
# code, <li> and table rows "- " bullets (row cells joined with ": ").
# Other elements selected by BLOCK_SELECTOR produce no output of their own.
# +title+ is prepended as an h1 unless the markdown already starts with it.
#
# @return [String]
def markdown_from_root(root, title)
  blocks = root.css(BLOCK_SELECTOR).map do |node|
    text = clean_text(node.text)
    next if text.empty?

    case node.name
    when /h([1-6])/
      "#{"#" * Regexp.last_match(1).to_i} #{text}"
    when "p"
      text
    when "pre"
      # Raw node text is used here so code keeps its original line breaks.
      "```\n#{node.text.rstrip}\n```"
    when "li"
      "- #{text}"
    when "tr"
      cells = node.css("th, td").map { |cell| clean_text(cell.text) }.reject(&:empty?)
      "- #{cells.join(": ")}" unless cells.empty?
    end
  end

  body = blocks.compact.join("\n\n").gsub(/\n{3,}/, "\n\n").strip
  heading = "# #{title}"
  title && !body.start_with?(heading) ? "#{heading}\n\n#{body}" : body
end
|
|
207
|
+
|
|
208
|
+
# Delegates to FetchUtil.normalize_whitespace (defined outside this file);
# exists so the class has one short name for text clean-up.
def clean_text(text)
  FetchUtil.normalize_whitespace(text)
end
|
|
211
|
+
|
|
212
|
+
# Returns +url+ without its "#fragment" part; an unparsable URL is returned
# unchanged.
def strip_fragment(url)
  URI.parse(url).tap { |parsed| parsed.fragment = nil }.to_s
rescue URI::InvalidURIError
  url
end
|
|
219
|
+
|
|
220
|
+
# Returns the host component of +url+, or nil when the URL cannot be parsed.
def safe_host(url)
  parsed = URI.parse(url)
  parsed.host
rescue URI::InvalidURIError
  nil
end
|
|
225
|
+
|
|
226
|
+
# Locates the node a fragment refers to: first any element with that +id+
# attribute, then an <a> whose +name+ attribute matches.
#
# @return [Object, nil] the first match, or nil
def fragment_node(document, id)
  literal = xpath_literal(id)
  ["//*[@id=#{literal}]", "//a[@name=#{literal}]"].each do |query|
    found = document.at_xpath(query)
    return found if found
  end
  nil
end
|
|
229
|
+
|
|
230
|
+
# Quotes +value+ as an XPath 1.0 string literal.
#
# XPath has no escape sequences, so a value containing both quote characters
# is expressed as concat('...', "'", '...').
#
# @param value [String]
# @return [String] XPath expression evaluating to +value+
def xpath_literal(value)
  return %('#{value}') unless value.include?("'")
  return %("#{value}") unless value.include?('"')

  # The -1 limit keeps trailing empty pieces. Without it a value ending in an
  # apostrophe lost that apostrophe and could collapse to a one-argument
  # concat(), which is not valid XPath.
  parts = value.split("'", -1).map { |part| %('#{part}') }
  %(concat(#{parts.join(%q(, "'", ))}))
end
|
|
237
|
+
|
|
238
|
+
# Performs a GET for +url+, following up to +limit+ redirects.
#
# @param url [String] absolute http(s) URL
# @param limit [Integer] remaining number of requests allowed
# @return [Array(String, String)] the URL finally served and the response body
# @raise [URI::InvalidURIError] when the redirect budget is exhausted, the URL
#   does not parse, or it is not an http(s) URL with a host
# @raise [Error] for non-success responses and redirects without a Location
def fetch_html(url, limit: 5)
  raise URI::InvalidURIError, "too many redirects" if limit <= 0

  uri = URI.parse(url)
  # Reject other schemes and host-less URLs up front; Net::HTTP would
  # otherwise raise ArgumentError, which #fetch does not rescue.
  unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
    raise URI::InvalidURIError, "unsupported url: #{url}"
  end

  request = Net::HTTP::Get.new(uri)
  DEFAULT_HEADERS.each { |key, value| request[key] = value }

  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https", open_timeout: @timeout, read_timeout: @timeout) do |http|
    http.request(request)
  end

  case response
  when Net::HTTPSuccess
    [uri.to_s, response.body]
  when Net::HTTPRedirection
    location = response["location"].to_s.strip
    # A 3xx without Location (e.g. 304) has nowhere to go.
    raise Error, "HTTP #{response.code} without Location" if location.empty?

    fetch_html(URI.join(uri, location).to_s, limit: limit - 1)
  else
    raise Error, "HTTP #{response.code}"
  end
end
|
|
259
|
+
end
|
|
260
|
+
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module CacheStore
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Read-through cache: returns the cached payload for +key+ when one is
# available, otherwise yields, stores the block's result, and returns it.
def cache_fetch(key)
  path = cache_file_path(key)
  hit = read_cache(path)
  return hit if hit

  yield.tap { |fresh| write_cache(path, fresh) }
end
|
|
17
|
+
|
|
18
|
+
# Cached fetch of a remote record.
#
# On a cache miss the first usable response for +uri+ is passed to the block
# as (body, response) and the block's result is cached; when no response is
# usable, +fallback+ is cached and returned instead.
def fetch_record(key, uri, fallback: nil, require_success: true)
  cache_fetch(key) do
    response = record_response(uri, require_success: require_success)
    next fallback unless response

    yield(response.body, response)
  end
end
|
|
24
|
+
|
|
25
|
+
# Tries each candidate URL in order and returns the first acceptable response.
#
# A response is acceptable when it exists and, if +require_success+ is set,
# its status is in 200..299.
#
# @param uri [Object, Array] one candidate or a list of candidates
# @return [Object, nil] the response, or nil when no candidate qualifies
def record_response(uri, require_success:)
  Array(uri).lazy
            .map { |candidate| safe_get(candidate) }
            .find { |response| response && (!require_success || response.status&.between?(200, 299)) }
end
|
|
35
|
+
|
|
36
|
+
# Maps a cache key to its file: the SHA-256 hex digest of
# "v<CACHE_VERSION>:<key>" with a .json extension, under +cache_path+.
def cache_file_path(key)
  versioned_key = "v#{CACHE_VERSION}:#{key}"
  File.join(cache_path, "#{Digest::SHA256.hexdigest(versioned_key)}.json")
end
|
|
40
|
+
|
|
41
|
+
# Loads a cache entry from +path+.
#
# @return [Object, nil] the stored payload, or nil when the file is missing,
#   unreadable as a cache entry, or older than CACHE_TTL seconds
def read_cache(path)
  return nil unless File.exist?(path)

  entry = JSON.parse(File.read(path))
  age = Time.now.utc - Time.parse(entry.fetch("cached_at"))
  age > CACHE_TTL ? nil : entry["payload"]
rescue Errno::ENOENT, JSON::ParserError, KeyError, TypeError, ArgumentError
  # Any malformed or vanished entry is treated as a cache miss.
  nil
end
|
|
52
|
+
|
|
53
|
+
# Persists +payload+ at +path+ as JSON, creating parent directories first.
# The entry records the current UTC time under "cached_at".
def write_cache(path, payload)
  FileUtils.mkdir_p(File.dirname(path))
  envelope = { "cached_at" => Time.now.utc.iso8601, "payload" => json_safe(payload) }
  File.write(path, JSON.generate(envelope))
end
|
|
57
|
+
|
|
58
|
+
# GETs +url+ through the client, turning the listed transport, TLS, and
# argument errors into a nil result.
#
# @return [Object, nil] the client's response, or nil on a rescued failure
def safe_get(url)
  client.get(url)
rescue ArgumentError, IOError, SocketError, Timeout::Error,
       FetchUtil::Error, SystemCallError, OpenSSL::SSL::SSLError
  nil
end
|
|
65
|
+
|
|
66
|
+
# Returns an independent copy of +value+ by round-tripping it through JSON
# (after #json_safe), so the result contains only JSON-representable types.
def deep_copy(value)
  serialized = JSON.generate(json_safe(value))
  JSON.parse(serialized)
end
|
|
69
|
+
|
|
70
|
+
# Lists every hop of a fetch: the response's recorded redirects followed by
# the response itself. A nil response gives an empty list.
def response_chain(response)
  return [] if response.nil?

  Array(response.redirects) + [response]
end
|
|
73
|
+
|
|
74
|
+
# Recursively prepares +value+ for JSON generation: strings are transcoded to
# UTF-8 with invalid or undefined bytes replaced, arrays and hashes (keys and
# values) are processed element-wise, and anything else is returned as is.
def json_safe(value)
  if value.is_a?(String)
    value.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
  elsif value.is_a?(Array)
    value.map { |item| json_safe(item) }
  elsif value.is_a?(Hash)
    value.each_with_object({}) do |(key, item), safe|
      safe[json_safe(key)] = json_safe(item)
    end
  else
    value
  end
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Directives
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Converts a robots directive list (X-Robots-Tag or meta content) to signals.
#
# A "<agent>: <directive>" token scopes itself and every later unprefixed
# token to that agent. For unprefixed tokens a +meta_name+ other than
# "robots" takes precedence over the agent remembered from an earlier prefix.
#
# @return [Array] signals built by #directive_signals, in token order
def extract_robot_directive_signals(value, path:, meta_name: nil)
  active_agent = nil
  signals = []

  tokenize_robot_directives(value).each do |token|
    agent, scoped_directive = robot_directive_prefix(token)
    if agent
      active_agent = agent
      signals.concat(directive_signals(scoped_directive, path: path, user_agent: agent))
    else
      scoped_to_meta = meta_name && !meta_name.casecmp?("robots")
      signals.concat(directive_signals(token, path: path, user_agent: scoped_to_meta ? meta_name : active_agent))
    end
  end

  signals
end
|
|
20
|
+
|
|
21
|
+
# Splits a comma-separated directive list into trimmed, non-empty tokens.
#
# The comma inside an "unavailable_after: Wed, 25 Jun 2010 15:00:00 PST"
# style date is shielded with a placeholder before splitting, so the date
# stays in a single token.
def tokenize_robot_directives(value)
  placeholder = "__FETCH_UTIL_COMMA__"
  dated = /(unavailable_after\s*:\s*[A-Za-z]{3}),\s*(\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[A-Za-z+-]+)/i
  shielded = value.to_s.gsub(dated) { "#{Regexp.last_match(1)}#{placeholder}#{Regexp.last_match(2)}" }

  shielded.split(",").each_with_object([]) do |piece, tokens|
    token = piece.gsub(placeholder, ",").strip
    tokens << token unless token.empty?
  end
end
|
|
29
|
+
|
|
30
|
+
# Splits a "<user-agent>: <directive>" token.
#
# @return [Array(String, String), Array(nil, nil)] the agent and the
#   directive; [nil, nil] when the token has no "name: rest" shape or the name
#   is itself listed in Robots::ROBOT_DIRECTIVES (e.g. "max-snippet: 10")
def robot_directive_prefix(token)
  captures = /\A([A-Za-z0-9*_.-]+)\s*:\s*(.+)\z/.match(token.to_s)
  return [nil, nil] if captures.nil?

  agent, directive = captures.captures
  known_directive = FetchUtil::Regulatory::Robots::ROBOT_DIRECTIVES.include?(agent.downcase)
  known_directive ? [nil, nil] : [agent, directive]
end
|
|
40
|
+
|
|
41
|
+
# Translates one robots directive ("noindex", "max-snippet: 20", ...) into
# allow/disallow signals.
#
# The directive name is compared case-insensitively; an optional value after
# the first ":" is trimmed. Unknown directives produce no signals.
#
# @param directive [#to_s] single directive token
# @param path [Object] passed through to #build_signal
# @param user_agent [String, nil] agent the directive is scoped to, if any
# @return [Array] zero, one, or two signals
def directive_signals(directive, path:, user_agent: nil)
  label, argument = directive.to_s.split(":", 2)
  label = label.to_s.strip.downcase
  argument = argument.to_s.strip
  conditions = directive_conditions(user_agent)

  allow = ->(target, scope = conditions) { build_signal("allow", target, path: path, conditions: scope) }
  deny = ->(target, scope = conditions) { build_signal("disallow", target, path: path, conditions: scope) }

  case label
  when "all" then [allow.call("index"), allow.call("follow")]
  when "follow" then [allow.call("follow")]
  when "index" then [allow.call("index")]
  when "indexifembedded" then [allow.call("index", conditions.merge("if-embedded" => true))]
  when "max-image-preview" then max_image_preview_signal(path: path, conditions: conditions, value: argument)
  when "max-snippet" then [allow.call("snippet", conditions.merge("max-chars" => integer_or_value(argument)))]
  when "max-video-preview" then [allow.call("video-preview", conditions.merge("max-seconds" => integer_or_value(argument)))]
  when "noai" then [deny.call("ai-training")]
  when "noarchive", "nocache" then [deny.call("archive")]
  when "nofollow" then [deny.call("follow")]
  when "noimageai" then [deny.call("image-ai-training")]
  when "noimageindex" then [deny.call("image-index")]
  when "noindex" then [deny.call("index")]
  when "none" then [deny.call("index"), deny.call("follow")]
  when "nosnippet" then [deny.call("snippet")]
  when "notranslate" then [deny.call("translate")]
  when "unavailable_after" then [deny.call("index", conditions.merge("after" => argument))]
  else []
  end
end
|
|
92
|
+
|
|
93
|
+
# Builds the signal for a "max-image-preview" directive: the value "none"
# (any case) disallows image previews; any other value allows them with that
# value recorded under the "max" condition.
#
# @return [Array] a one-element list
def max_image_preview_signal(path:, conditions:, value:)
  if value.casecmp?("none")
    [build_signal("disallow", "image-preview", path: path, conditions: conditions)]
  else
    [build_signal("allow", "image-preview", path: path, conditions: conditions.merge("max" => value))]
  end
end
|
|
98
|
+
|
|
99
|
+
# Builds the conditions hash for a directive: empty when +user_agent+ is
# blank, otherwise a "user-agent" entry holding the agent's glob form.
def directive_conditions(user_agent)
  blank = user_agent.to_s.strip.empty?
  blank ? {} : { "user-agent" => robot_user_agent_glob(user_agent) }
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module FetchRecords
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Normalizes a path pattern for output: a blank value or a bare "/" becomes
# the catch-all "/*"; anything else is returned trimmed.
def normalize_output_path(value)
  trimmed = value.to_s.strip
  ["", "/"].include?(trimmed) ? "/*" : trimmed
end
|
|
14
|
+
|
|
15
|
+
# Returns the path of +uri+ ("/" when empty), followed by "?query" when a
# query string is present.
def request_target(uri)
  path = uri.path.to_s
  path = "/" if path.empty?
  query = uri.query.to_s
  query.empty? ? path : "#{path}?#{query}"
end
|
|
22
|
+
|
|
23
|
+
# True when +uri+ has neither a path nor a query string.
def origin_query?(uri)
  return false unless uri.path.to_s.empty?

  uri.query.to_s.empty?
end
|
|
26
|
+
|
|
27
|
+
# Returns the request target of the record's "final_url", or +fallback+ when
# the record has none.
#
# @raise [ArgumentError] from #parse_http_uri when the stored URL is unusable
def page_query_target(record, fallback:)
  final_url = record["final_url"].to_s
  final_url.empty? ? fallback : request_target(parse_http_uri(final_url))
end
|
|
33
|
+
|
|
34
|
+
# Location of the origin's robots.txt.
def robots_uri(uri)
  base_origin(uri) + "/robots.txt"
end

# Location of the origin's TDM reservation file (/.well-known/tdmrep.json).
def tdmrep_uri(uri)
  base_origin(uri) + "/.well-known/tdmrep.json"
end

# Location of the origin's root-level trust.txt.
def trusttxt_uri(uri)
  base_origin(uri) + "/trust.txt"
end

# Location of the origin's /.well-known/trust.txt.
def trusttxt_well_known_uri(uri)
  base_origin(uri) + "/.well-known/trust.txt"
end
|
|
49
|
+
|
|
50
|
+
# Builds "scheme://host[:port]" for +uri+. The port is left out only for
# https on 443 and http on 80.
def base_origin(uri)
  default_port = [["https", 443], ["http", 80]].include?([uri.scheme, uri.port])
  suffix = default_port ? nil : ":#{uri.port}"
  "#{uri.scheme}://#{uri.host}#{suffix}"
end
|
|
58
|
+
|
|
59
|
+
# Key identifying the origin of +uri+; currently the same string as
# #base_origin.
def origin_key(uri)
  base_origin(uri)
end
|
|
62
|
+
|
|
63
|
+
# Parses +value+ (trimmed) as an http(s) URL.
#
# @return [URI::HTTP] the parsed URI (URI::HTTPS is a subclass)
# @raise [ArgumentError] when the value does not parse, is not http(s), or
#   has no host
def parse_http_uri(value)
  parsed =
    begin
      URI.parse(value.to_s.strip)
    rescue URI::InvalidURIError
      raise ArgumentError, "unsupported url: #{value}"
    end
  return parsed if parsed.is_a?(URI::HTTP) && parsed.host

  raise ArgumentError, "unsupported url: #{value}"
end
|
|
73
|
+
|
|
74
|
+
# Returns the first value stored under +name+ in +headers+, trimmed; "" when
# the header is absent.
def first_header_value(headers, name)
  values = Array(headers[name])
  values.first.to_s.strip
end
|
|
77
|
+
|
|
78
|
+
# Returns every value stored under +name+ in +headers+ as an Array.
#
# Values are coerced with Array(), matching #first_header_value: a scalar
# String becomes a one-element list and a missing or nil entry becomes [].
# Callers iterate the result with +flat_map+, so it must always be a list.
#
# @param headers [Hash] header name => value(s)
# @param name [String] header name, in the same case as the hash keys
# @return [Array]
def header_values(headers, name)
  Array(headers[name])
end
|
|
81
|
+
|
|
82
|
+
# Decides whether a response looks like HTML.
#
# The Content-Type header is checked first. Media types are case-insensitive,
# so the header is lowercased before comparing. Without an HTML content type,
# the body is sniffed for a leading doctype or <html> tag, also ignoring case
# so the common lowercase "<!doctype html>" is recognised.
#
# @param headers [Hash] lowercase header name => values
# @param body [#to_s] response body
# @return [Boolean]
def html_content?(headers, body)
  content_type = first_header_value(headers, "content-type").downcase
  return true if content_type.include?("text/html") || content_type.include?("application/xhtml+xml")

  head = body.to_s.lstrip
  # casecmp is used instead of a regexp so bodies with broken encodings do
  # not raise; it returns nil for incompatible encodings, hence "== 0".
  ["<!doctype html", "<html"].any? do |prefix|
    head[0, prefix.length].to_s.casecmp(prefix) == 0
  end
end
|
|
88
|
+
|
|
89
|
+
# Extracts the attributes of every <meta ...> tag in +body+ with regular
# expressions (no HTML parser involved).
#
# @return [Array<Hash>] one hash per tag; keys are lowercased attribute
#   names, values are HTML-unescaped. A repeated attribute keeps its last value.
def parse_meta_tags(body)
  attribute_pattern = /([A-Za-z_:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/

  body.to_s.scan(/<meta\b[^>]*>/im).map do |tag|
    tag.scan(attribute_pattern).each_with_object({}) do |(name, quoted, single, bare), attributes|
      attributes[name.downcase] = CGI.unescapeHTML(quoted || single || bare || "")
    end
  end
end
|
|
98
|
+
|
|
99
|
+
# Decides whether a response looks like JSON: either the Content-Type
# mentions application/json or application/ld+json, or the body (ignoring
# leading whitespace) starts with "{" or "[".
def json_like_response?(headers, body)
  content_type = first_header_value(headers, "content-type")
  json_types = ["application/json", "application/ld+json"]
  return true if json_types.any? { |type| content_type.include?(type) }

  body.to_s.lstrip.start_with?("{", "[")
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Headers
|
|
6
|
+
# Collects the signals expressed by every "x-robots-tag" header value.
def extract_x_robot_signals(headers, path:)
  signals = []
  header_values(headers, "x-robots-tag").each do |raw|
    signals.concat(extract_robot_directive_signals(raw, path: path))
  end
  signals
end
|
|
11
|
+
|
|
12
|
+
# Collects the signals expressed by every "content-usage" header value.
def extract_content_usage_header_signals(headers, path:)
  signals = []
  header_values(headers, "content-usage").each do |statement|
    signals.concat(extract_content_usage_statement_signals(statement, path: path))
  end
  signals
end
|
|
17
|
+
|
|
18
|
+
# Collects robots signals from parsed <meta> tags.
#
# A tag with http-equiv="x-robots-tag" is handled like the header. Otherwise
# the tag must have a name that is "robots" or contains "bot" (any case);
# the tdm-reservation and tdm-policy tags are skipped here.
#
# @param meta_tags [Array<Hash>] attribute hashes with lowercase keys
# @return [Array] signals in document order
def extract_meta_robot_signals(meta_tags, path:)
  meta_tags.flat_map do |attributes|
    content = attributes["content"]
    next extract_robot_directive_signals(content, path: path) if attributes["http-equiv"].to_s.casecmp?("x-robots-tag")

    name = attributes["name"].to_s.strip
    excluded = name.empty? || name.casecmp?("tdm-reservation") || name.casecmp?("tdm-policy")
    relevant = !excluded && (name.casecmp?("robots") || name.match?(/bot/i))
    relevant ? extract_robot_directive_signals(content, path: path, meta_name: name) : []
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
|
|
5
|
+
module FetchUtil
|
|
6
|
+
class Regulatory
|
|
7
|
+
class HttpClient
|
|
8
|
+
REDIRECT_LIMIT = 5
|
|
9
|
+
|
|
10
|
+
# @param timeout [#to_f] stored as a Float; used for both the open and the
#   read timeout of each request
# @param user_agent [#to_s] stored trimmed; sent as User-Agent unless empty
def initialize(timeout:, user_agent:)
  @user_agent = user_agent.to_s.strip
  @timeout = timeout.to_f
end
|
|
14
|
+
|
|
15
|
+
# GETs +url+, following at most +limit+ redirects.
#
# @raise [ArgumentError] when +url+ is not an http(s) URL with a host
# @raise [FetchUtil::Error] when the redirect limit is exceeded
def get(url, limit: REDIRECT_LIMIT)
  fetch(parse_http_uri(url), limit, [])
end
|
|
19
|
+
|
|
20
|
+
private
|
|
21
|
+
|
|
22
|
+
attr_reader :timeout, :user_agent
|
|
23
|
+
|
|
24
|
+
# Requests +uri+ and follows redirects recursively.
#
# @param uri [URI::HTTP] address to request
# @param limit [Integer] redirects that may still be followed
# @param redirects [Array] responses of the redirect hops taken so far
# @return [Object] result of #build_response for the final response; a
#   redirect without a Location header is returned as the final response
# @raise [FetchUtil::Error] when the redirect limit is exhausted, or when a
#   Location header cannot be parsed or does not lead to an http(s) URL with
#   a host
def fetch(uri, limit, redirects)
  response = request(uri)
  return build_response(uri, response, redirects: redirects) unless response.is_a?(Net::HTTPRedirection)

  raise FetchUtil::Error, "too many redirects for #{uri}" if limit <= 0

  location = response["location"].to_s.strip
  return build_response(uri, response, redirects: redirects) if location.empty?

  # Location is server-controlled input. A malformed value makes URI#merge
  # raise URI::InvalidURIError, so it is converted to the project error here.
  target =
    begin
      uri.merge(location)
    rescue URI::Error => e
      raise FetchUtil::Error, "invalid redirect location #{location.inspect} for #{uri}: #{e.message}"
    end
  # Only http(s) targets are followed, mirroring the check made for the
  # initial URL.
  unless target.is_a?(URI::HTTP) && !target.host.to_s.empty?
    raise FetchUtil::Error, "unsupported redirect location #{location.inspect} for #{uri}"
  end

  fetch(target, limit - 1, redirects + [build_response(uri, response)])
end
|
|
36
|
+
|
|
37
|
+
# Sends one GET request for +uri+ (redirects are not followed here).
#
# TLS is enabled for https URLs; the configured timeout applies to both
# connecting and reading. The User-Agent header is only sent when one was
# configured.
#
# @return [Net::HTTPResponse]
def request(uri)
  connection = Net::HTTP.new(uri.host, uri.port)
  connection.use_ssl = uri.scheme == "https"
  connection.open_timeout = timeout
  connection.read_timeout = timeout

  target = uri.request_uri
  target = "/" if target.empty?

  get_request = Net::HTTP::Get.new(target)
  get_request["Accept"] = "text/html,application/json,text/plain,*/*"
  get_request["User-Agent"] = user_agent unless user_agent.empty?

  connection.request(get_request)
end
|
|
47
|
+
|
|
48
|
+
# Wraps a Net::HTTP response in a FetchUtil::Regulatory::Response.
#
# The status becomes an Integer, header names are lowercased, and a nil body
# becomes "".
def build_response(uri, response, redirects: [])
  attributes = {
    url: uri.to_s,
    status: response.code.to_i,
    headers: response.to_hash.transform_keys(&:downcase),
    body: response.body.to_s,
    redirects: redirects
  }
  FetchUtil::Regulatory::Response.new(**attributes)
end
|
|
57
|
+
|
|
58
|
+
# Parses +url+ as an http(s) URL.
#
# @return [URI::HTTP] the parsed URI (URI::HTTPS is a subclass)
# @raise [ArgumentError] when the value does not parse, is not http(s), or
#   has no host
def parse_http_uri(url)
  parsed =
    begin
      URI.parse(url.to_s)
    rescue URI::InvalidURIError
      raise ArgumentError, "unsupported url: #{url}"
    end
  return parsed if parsed.is_a?(URI::HTTP) && parsed.host

  raise ArgumentError, "unsupported url: #{url}"
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|