fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,260 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "net/http"
5
+ require "nokogiri"
6
+ require "uri"
7
+
8
+ module FetchUtil
9
+ class RawDocsFallback
10
# Headers sent with every raw fetch; values are shared with the browser fetcher.
DEFAULT_HEADERS = {
  "User-Agent" => Browser::DEFAULT_USER_AGENT,
  "Accept-Language" => Browser::DEFAULT_ACCEPT_LANGUAGE
}.freeze

# Elements considered when rendering Markdown, and the CSS selector built from them.
BLOCK_ELEMENTS = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "pre", "ul", "ol", "li", "table", "tr"].freeze
BLOCK_SELECTOR = BLOCK_ELEMENTS.join(", ").freeze

# Selectors whose matches are removed from the content root before rendering.
DROP_SELECTORS = %w(
  script
  style
  nav
  aside
  footer
  .toc
  .sidebar
  .breadcrumbs
  [aria-label*='breadcrumb']
  .headerlink
  .copybutton
  button
).freeze

# Candidate content containers, tried in this order.
DOCS_ROOT_SELECTORS = [
  "main article",
  "main",
  "article",
  "[role='main']",
  ".content",
  ".resource-container"
].freeze

# Elements whose entire normalized text matches this pattern are pruned.
PRUNED_TEXT_PATTERN = /\A(?:on this page|table of contents|edit this page|copy page|copy item path|search|settings|help|expand description)\z/i
40
+
41
# @param timeout [#to_i] seconds used for both the open and the read timeout
def initialize(timeout: 20)
  seconds = timeout.to_i
  @timeout = seconds
end
44
+
45
# Downloads +url+ without a browser and converts the HTML into a payload.
#
# This is a best-effort fallback: any transport, TLS, or URL problem yields
# nil rather than an exception.
#
# @param url [String] page to download
# @return [Array(String, Hash), nil] final URL and payload, or nil when the
#   page could not be fetched or produced no usable content
def fetch(url)
  final_url, html = fetch_html(url)
  payload = payload_from_html(html, requested_url: url, final_url: final_url)
  return nil unless payload

  [final_url, payload]
rescue Error, SocketError, SystemCallError, Timeout::Error, URI::Error,
       ArgumentError, IOError, OpenSSL::SSL::SSLError
  # ArgumentError: Net::HTTP rejects non-HTTP URIs; IOError covers EOFError on
  # truncated responses; URI::Error also covers a malformed redirect target.
  nil
end
54
+
55
# Builds the extraction payload for an HTML document that is already in memory.
#
# @param html [String] raw HTML
# @param requested_url [String] URL originally asked for (used as the default final URL)
# @param final_url [String] URL the HTML was served from; its fragment, if any,
#   narrows extraction to that anchor
# @return [Hash, nil] payload hash, or nil when no content root is found or the
#   rendered Markdown has fewer than 40 characters of text
def payload_from_html(html, requested_url:, final_url: requested_url)
  document = Nokogiri::HTML(html)
  root = fragment_root(document, final_url) || docs_root(document)
  return nil unless root

  prune!(root)

  # Walk the title sources in priority order and stop at the first one that is
  # non-blank after cleaning. A plain `||` chain would stop at a blank string
  # (for example a heading with no text) and never reach the later fallbacks.
  title_sources = [
    -> { fragment_title(document, final_url) },
    -> { first_heading(root) },
    -> { meta_title(document) },
    -> { document.title }
  ]
  title = nil
  title_sources.each do |source|
    title = clean_text(source.call)
    break unless title.to_s.empty?
  end

  markdown = markdown_from_root(root, title)
  return nil if clean_text(markdown).length < 40

  {
    "title" => title,
    "byline" => meta_value(document, "author"),
    "excerpt" => first_paragraph(root),
    "siteName" => meta_value(document, "og:site_name", attr: "property") || safe_host(final_url),
    "publishedTime" => meta_value(document, "article:published_time", attr: "property") || meta_value(document, "publish-date"),
    "canonicalUrl" => canonical_url(document, final_url),
    "language" => document.at_css("html")&.[]("lang") || "en",
    "html" => root.to_html,
    "markdown" => markdown,
    "readerMode" => false,
    "contentType" => "article",
    "suspect" => false,
    "warnings" => []
  }
end
81
+
82
+ private
83
+
84
# Reads the +content+ attribute of the first <meta> tag whose +attr+ equals +name+.
#
# @return [String, nil] nil when no such tag exists
def meta_value(document, name, attr: "name")
  tag = document.at_css(%(meta[#{attr}="#{name}"]))
  tag && tag["content"]
end
87
+
88
# Resolves the page's <link rel="canonical"> against +final_url+.
#
# Falls back to +final_url+ without its fragment when the link is missing,
# empty, or cannot be joined into a valid URI.
def canonical_url(document, final_url)
  link = document.at_css('link[rel="canonical"]')
  href = link && link["href"]
  return strip_fragment(final_url) if !href || href.empty?

  URI.join(final_url, href).to_s
rescue URI::InvalidURIError
  strip_fragment(final_url)
end
96
+
97
# Extracts the percent-decoded fragment (the part after "#") from +url+.
#
# @param url [String]
# @return [String] decoded fragment, or "" when there is none or the URL is invalid
def fragment_id(url)
  fragment = URI.parse(url).fragment.to_s
  # CGI.unescape performs form decoding, which turns "+" into a space. In a
  # URI fragment "+" is a literal character, so shield it before decoding.
  CGI.unescape(fragment.gsub("+", "%2B"))
rescue URI::InvalidURIError
  ""
end
103
+
104
# Picks the content root for a URL that points at an in-page anchor.
#
# For a legacy <a name="..."> anchor, the following siblings up to the next
# named anchor are copied into a new <div>; that container is used when it
# holds at least 40 characters of text. Otherwise the method walks up from the
# anchor node and returns a copy of the first element that either carries the
# id itself or holds at least 80 characters of text.
#
# @return [Nokogiri::XML::Node, nil] nil when the URL has no fragment or no
#   matching node exists
def fragment_root(document, url)
  id = fragment_id(url)
  return nil if id.empty?

  anchor = fragment_node(document, id)
  return nil unless anchor

  if anchor.name == "a" && anchor["name"] == id
    section = Nokogiri::XML::Node.new("div", document)
    cursor = anchor.next_sibling
    # Stop at the end of the sibling list or at the next named anchor.
    until cursor.nil? || (cursor.element? && cursor.name == "a" && cursor["name"])
      section.add_child(cursor.dup)
      cursor = cursor.next_sibling
    end
    return section if clean_text(section.text).length >= 40
  end

  ancestor = anchor
  while ancestor&.element?
    text = clean_text(ancestor.text)
    return ancestor.dup if ancestor["id"] == id || text.length >= 80

    ancestor = ancestor.parent
  end

  anchor.dup
end
132
+
133
# Derives a title for the anchor that +url+'s fragment points at.
#
# The heading is the anchor node itself when it is an h1-h6, the element right
# after a legacy <a name> anchor (a heading, or the first <strong> inside it),
# or otherwise the first heading inside the anchor node.
#
# @return [String, nil] cleaned heading text; nil when there is no fragment,
#   no matching node, no heading, or the heading text is blank, so that
#   callers can fall through to another title source with `||`
def fragment_title(document, url)
  id = fragment_id(url)
  return nil if id.empty?

  node = fragment_node(document, id)
  return nil unless node

  heading =
    if node.name.match?(/\Ah[1-6]\z/) # anchored: only real heading elements
      node
    elsif node.name == "a" && node["name"] == id
      node.at_xpath("following-sibling::*[1][self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]") ||
        node.at_xpath("following-sibling::*[1]//strong[1]")
    else
      node.at_css("h1, h2, h3, h4, h5, h6")
    end
  return nil unless heading

  title = clean_text(heading.text)
  title.to_s.empty? ? nil : title
end
150
+
151
# Chooses the main content container of a documentation page.
#
# Tries DOCS_ROOT_SELECTORS in order and returns a copy of the first match
# with at least 120 characters of text; otherwise a copy of <body>.
#
# @return [Nokogiri::XML::Node, nil] nil when the document has no <body>
def docs_root(document)
  preferred = DOCS_ROOT_SELECTORS.lazy
                                 .map { |selector| document.at_css(selector) }
                                 .find { |match| match && clean_text(match.text).length >= 120 }
  return preferred.dup if preferred

  document.at_css("body")&.dup
end
160
+
161
# Removes page chrome from +root+ in place.
#
# First drops everything matching DROP_SELECTORS, then every remaining element
# whose whole normalized text matches PRUNED_TEXT_PATTERN.
def prune!(root)
  DROP_SELECTORS.each do |selector|
    root.css(selector).remove
  end

  root.css("*").each do |element|
    label = clean_text(element.text)
    element.remove if label.match?(PRUNED_TEXT_PATTERN)
  end
end
168
+
169
# Title from the og:title meta property, falling back to the document's <title>.
def meta_title(document)
  og_title = meta_value(document, "og:title", attr: "property")
  og_title || document.title
end
172
+
173
# Cleaned text of the first h1/h2/h3 under +root+ (cleaning is applied to nil
# when there is no such heading).
def first_heading(root)
  heading = root.at_css("h1, h2, h3")
  clean_text(heading&.text)
end
176
+
177
# Returns the cleaned text of the first <p> holding at least 30 characters,
# or nil when no paragraph is that long.
def first_paragraph(root)
  root.css("p").each do |paragraph|
    text = clean_text(paragraph.text)
    return text if text.length >= 30
  end
  nil
end
180
+
181
# Renders the block-level elements under +root+ as Markdown.
#
# Headings become "#"-prefixed lines, paragraphs plain text, <pre> fenced code,
# list items "- item", and table rows "- cell: cell". Blocks are separated by
# blank lines. Container elements (ul, ol, table) are selected but emit
# nothing themselves.
#
# @param root [#css] content root
# @param title [String, nil] page title; prepended as an H1 unless it is blank
#   or the Markdown already starts with it
# @return [String]
def markdown_from_root(root, title)
  sections = []
  root.css(BLOCK_SELECTOR).each do |node|
    text = clean_text(node.text)
    next if text.empty?

    case node.name
    when /\Ah([1-6])\z/
      level = Regexp.last_match(1).to_i
      sections << "#{"#" * level} #{text}"
    when "p"
      sections << text
    when "pre"
      # Keep the raw text so code indentation survives.
      sections << ["```", node.text.rstrip, "```"].join("\n")
    when "li"
      sections << "- #{text}"
    when "tr"
      cells = node.css("th, td").map { |cell| clean_text(cell.text) }.reject(&:empty?)
      sections << "- #{cells.join(": ")}" unless cells.empty?
    end
  end

  markdown = sections.join("\n\n").gsub(/\n{3,}/, "\n\n").strip
  # A blank title would otherwise produce a bare "# " heading line.
  return markdown if title.to_s.empty?

  markdown.start_with?("# #{title}") ? markdown : "# #{title}\n\n#{markdown}"
end
207
+
208
# Normalizes whitespace in +text+ by delegating to FetchUtil.normalize_whitespace.
def clean_text(text)
  FetchUtil.normalize_whitespace(text)
end
211
+
212
# Returns +url+ without its "#fragment"; an unparsable URL is returned unchanged.
def strip_fragment(url)
  URI.parse(url).tap { |parsed| parsed.fragment = nil }.to_s
rescue URI::InvalidURIError
  url
end
219
+
220
# Host component of +url+, or nil when the URL cannot be parsed.
def safe_host(url)
  parsed = URI.parse(url)
  parsed.host
rescue URI::InvalidURIError
  nil
end
225
+
226
+ def fragment_node(document, id)
227
+ document.at_xpath(%(//*[@id=#{xpath_literal(id)}])) || document.at_xpath(%(//a[@name=#{xpath_literal(id)}]))
228
+ end
229
+
230
# Quotes +value+ as an XPath 1.0 string literal.
#
# XPath has no escape syntax, so a value containing both quote characters is
# expressed as concat('part', "'", 'part', ...).
#
# @param value [String]
# @return [String] XPath expression evaluating to +value+
def xpath_literal(value)
  return %('#{value}') unless value.include?("'")
  return %("#{value}") unless value.include?('"')

  # limit -1 keeps trailing empty fields. Without it a value ending in "'"
  # loses that quote and can collapse to a one-argument concat(), which is
  # not valid XPath.
  parts = value.split("'", -1).map { |part| %('#{part}') }
  %(concat(#{parts.join(%q(, "'", ))}))
end
237
+
238
# Performs a plain HTTP GET, following up to +limit+ redirects.
#
# @param url [String] absolute http(s) URL
# @param limit [Integer] remaining redirect budget
# @return [Array(String, String)] the URL that answered and its body
# @raise [URI::InvalidURIError] for unparsable or non-HTTP URLs, or when the
#   redirect budget is exhausted
# @raise [Error] for non-success responses and redirects without a Location
def fetch_html(url, limit: 5)
  raise URI::InvalidURIError, "too many redirects" if limit <= 0

  uri = URI.parse(url)
  # Reject mailto:, ftp:, relative paths and the like here; otherwise
  # Net::HTTP raises an ArgumentError that the public entry point never rescued.
  unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
    raise URI::InvalidURIError, "unsupported url: #{url}"
  end

  request = Net::HTTP::Get.new(uri)
  DEFAULT_HEADERS.each { |key, value| request[key] = value }

  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https", open_timeout: @timeout, read_timeout: @timeout) do |http|
    http.request(request)
  end

  case response
  when Net::HTTPSuccess
    [uri.to_s, response.body]
  when Net::HTTPRedirection
    # 3xx responses such as 304 carry no Location; joining nil would raise.
    location = response["location"].to_s.strip
    raise Error, "HTTP #{response.code} without Location" if location.empty?

    fetch_html(URI.join(uri, location).to_s, limit: limit - 1)
  else
    raise Error, "HTTP #{response.code}"
  end
end
259
+ end
260
+ end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Regulatory
5
+ module CacheStore
6
+ private
7
+
8
# Returns the cached payload for +key+, or computes it with the block, stores
# it, and returns it. A nil/false cache read counts as a miss.
def cache_fetch(key)
  path = cache_file_path(key)
  hit = read_cache(path)
  return hit if hit

  yield.tap { |fresh| write_cache(path, fresh) }
end
17
+
18
# Cached fetch of a record: downloads +uri+ (or the first usable candidate of
# several) and yields the body and response to the block, whose result is
# cached under +key+. When no response is usable, +fallback+ is cached instead.
def fetch_record(key, uri, fallback: nil, require_success: true)
  cache_fetch(key) do
    response = record_response(uri, require_success: require_success)
    next fallback unless response

    yield(response.body, response)
  end
end
24
+
25
# Tries each candidate URL in order and returns the first acceptable response.
#
# With +require_success+ a response is acceptable only when its status is 2xx;
# otherwise any response is. Returns nil when no candidate qualifies.
def record_response(uri, require_success:)
  Array(uri).lazy
            .map { |candidate| safe_get(candidate) }
            .find { |response| response && (!require_success || response.status&.between?(200, 299)) }
end
35
+
36
# File path for a cache entry: the SHA-256 of the version-prefixed key,
# stored as "<digest>.json" under +cache_path+.
def cache_file_path(key)
  versioned_key = "v#{CACHE_VERSION}:#{key}"
  File.join(cache_path, "#{Digest::SHA256.hexdigest(versioned_key)}.json")
end
40
+
41
# Reads a cache entry written by write_cache.
#
# The cache is best-effort, so every failure mode is treated as a miss.
#
# @param path [String] cache file path
# @return [Object, nil] the stored payload, or nil when the file is missing,
#   unreadable, malformed, or older than CACHE_TTL
def read_cache(path)
  return nil unless File.exist?(path)

  parsed = JSON.parse(File.read(path))
  # A file holding a JSON scalar such as `null` parses fine but is not an envelope.
  return nil unless parsed.is_a?(Hash)

  cached_at = Time.parse(parsed.fetch("cached_at"))
  return nil if Time.now.utc - cached_at > CACHE_TTL

  parsed["payload"]
rescue SystemCallError, JSON::ParserError, KeyError, TypeError, ArgumentError
  # SystemCallError covers ENOENT (file removed after the exist? check) as well
  # as permission and is-a-directory errors.
  nil
end
52
+
53
# Persists +payload+ at +path+ as JSON, wrapped with the current UTC timestamp.
# Creates the parent directory when needed.
def write_cache(path, payload)
  FileUtils.mkdir_p(File.dirname(path))
  envelope = { "cached_at" => Time.now.utc.iso8601, "payload" => json_safe(payload) }
  File.write(path, JSON.generate(envelope))
end
57
+
58
# GETs +url+ through the shared client, turning the listed argument,
# transport, TLS and FetchUtil errors into nil.
def safe_get(url)
  client.get(url)
rescue ArgumentError, IOError, SocketError, Timeout::Error,
       FetchUtil::Error, SystemCallError, OpenSSL::SSL::SSLError
  nil
end
65
+
66
# Deep-copies +value+ by round-tripping it through JSON, so the result holds
# only JSON types (for example, symbol keys come back as strings).
def deep_copy(value)
  serialized = JSON.generate(json_safe(value))
  JSON.parse(serialized)
end
69
+
70
# All responses of a request in order: the recorded redirects followed by the
# final response. A nil response gives an empty list.
def response_chain(response)
  earlier_hops = Array(response&.redirects)
  response.nil? ? earlier_hops : [*earlier_hops, response]
end
73
+
74
# Recursively prepares +value+ for JSON serialization: every String (hash keys
# included) is transcoded to UTF-8 with invalid or undefined bytes replaced;
# arrays and hashes are rebuilt; anything else is returned untouched.
def json_safe(value)
  case value
  when String
    value.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
  when Array
    value.map { |element| json_safe(element) }
  when Hash
    value.each_with_object({}) do |(key, item), safe|
      safe[json_safe(key)] = json_safe(item)
    end
  else
    value
  end
end
90
+ end
91
+ end
92
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Regulatory
5
+ module Directives
6
+ private
7
+
8
# Turns a robots directive list (X-Robots-Tag header value or meta content)
# into signals.
#
# A token of the form "agent: directive" scopes itself and the tokens after it
# to that agent. For unprefixed tokens, a +meta_name+ other than "robots"
# takes precedence over the agent remembered from an earlier prefix.
def extract_robot_directive_signals(value, path:, meta_name: nil)
  scoped_agent = nil

  tokenize_robot_directives(value).flat_map do |token|
    prefix, directive = robot_directive_prefix(token)

    if prefix
      scoped_agent = prefix
      next directive_signals(directive, path: path, user_agent: prefix)
    end

    agent =
      if meta_name && !meta_name.casecmp?("robots")
        meta_name
      else
        scoped_agent
      end
    directive_signals(token, path: path, user_agent: agent)
  end
end
20
+
21
# Splits a comma-separated directive list into trimmed, non-empty tokens.
#
# The comma inside an "unavailable_after: Wed, 25 Jun 2025 ..." date is
# swapped for a placeholder before splitting and restored afterwards, so the
# date stays in one token.
def tokenize_robot_directives(value)
  shielded = value.to_s.gsub(
    /(unavailable_after\s*:\s*[A-Za-z]{3}),\s*(\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[A-Za-z+-]+)/i,
    '\\1__FETCH_UTIL_COMMA__\\2'
  )

  shielded.split(",").each_with_object([]) do |piece, tokens|
    token = piece.gsub("__FETCH_UTIL_COMMA__", ",").strip
    tokens << token unless token.empty?
  end
end
29
+
30
# Splits "agent: directive" into its two parts.
#
# @return [Array(String, String), Array(nil, nil)] [nil, nil] when the token
#   has no "name: value" shape, or when the name is itself a known robots
#   directive (for example "max-snippet: 20")
def robot_directive_prefix(token)
  match = /\A([A-Za-z0-9*_.-]+)\s*:\s*(.+)\z/.match(token.to_s)
  return [nil, nil] if match.nil?

  prefix, directive = match.captures
  known_directive = FetchUtil::Regulatory::Robots::ROBOT_DIRECTIVES.include?(prefix.downcase)
  known_directive ? [nil, nil] : [prefix, directive]
end
40
+
41
# Maps one robots directive ("noindex", "max-snippet: 20", ...) to a list of
# allow/disallow signals for +path+.
#
# The directive name is matched case-insensitively. A +user_agent+ adds a
# "user-agent" condition to every signal. Unknown directives give [].
def directive_signals(directive, path:, user_agent: nil)
  raw_name, raw_value = directive.to_s.split(":", 2)
  name = raw_name.to_s.strip.downcase
  value = raw_value.to_s.strip
  conditions = directive_conditions(user_agent)

  # Builds one signal, optionally adding extra conditions to the shared ones.
  signal = lambda do |verdict, usage, extra = nil|
    build_signal(verdict, usage, path: path, conditions: extra ? conditions.merge(extra) : conditions)
  end

  case name
  when "all" then [signal.call("allow", "index"), signal.call("allow", "follow")]
  when "follow" then [signal.call("allow", "follow")]
  when "index" then [signal.call("allow", "index")]
  when "indexifembedded" then [signal.call("allow", "index", { "if-embedded" => true })]
  when "max-image-preview" then max_image_preview_signal(path: path, conditions: conditions, value: value)
  when "max-snippet" then [signal.call("allow", "snippet", { "max-chars" => integer_or_value(value) })]
  when "max-video-preview" then [signal.call("allow", "video-preview", { "max-seconds" => integer_or_value(value) })]
  when "noai" then [signal.call("disallow", "ai-training")]
  when "noarchive", "nocache" then [signal.call("disallow", "archive")]
  when "nofollow" then [signal.call("disallow", "follow")]
  when "noimageai" then [signal.call("disallow", "image-ai-training")]
  when "noimageindex" then [signal.call("disallow", "image-index")]
  when "noindex" then [signal.call("disallow", "index")]
  when "none" then [signal.call("disallow", "index"), signal.call("disallow", "follow")]
  when "nosnippet" then [signal.call("disallow", "snippet")]
  when "notranslate" then [signal.call("disallow", "translate")]
  when "unavailable_after" then [signal.call("disallow", "index", { "after" => value })]
  else []
  end
end
92
+
93
# Signal list for "max-image-preview": a value of "none" (any case) disallows
# image previews; any other value allows them with that value recorded as the
# "max" condition.
def max_image_preview_signal(path:, conditions:, value:)
  signal =
    if value.casecmp?("none")
      build_signal("disallow", "image-preview", path: path, conditions: conditions)
    else
      build_signal("allow", "image-preview", path: path, conditions: conditions.merge("max" => value))
    end
  [signal]
end
98
+
99
# Base conditions for a directive: empty when no user agent applies, otherwise
# a "user-agent" entry holding the agent converted by robot_user_agent_glob.
def directive_conditions(user_agent)
  agent_blank = user_agent.to_s.strip.empty?
  agent_blank ? {} : { "user-agent" => robot_user_agent_glob(user_agent) }
end
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Regulatory
5
+ module FetchRecords
6
+ private
7
+
8
# Normalizes a path for output: blank or "/" becomes the catch-all "/*";
# anything else is returned trimmed.
def normalize_output_path(value)
  path = value.to_s.strip
  ["", "/"].include?(path) ? "/*" : path
end
14
+
15
# Path plus query of +uri+ ("/path?query"). An empty path becomes "/" and an
# empty query is left off.
def request_target(uri)
  path = uri.path.to_s
  path = "/" if path.empty?
  query = uri.query.to_s
  query.empty? ? path : "#{path}?#{query}"
end
22
+
23
# True when +uri+ has neither a path nor a query, i.e. it names a bare origin.
def origin_query?(uri)
  [uri.path, uri.query].all? { |part| part.to_s.empty? }
end
26
+
27
# Request target (path + query) of the record's "final_url", or +fallback+
# when the record has no final URL.
def page_query_target(record, fallback:)
  final_url = record["final_url"].to_s
  final_url.empty? ? fallback : request_target(parse_http_uri(final_url))
end
33
+
34
# URL of the robots.txt file at +uri+'s origin.
def robots_uri(uri)
  origin = base_origin(uri)
  "#{origin}/robots.txt"
end
37
+
38
# URL of the /.well-known/tdmrep.json file at +uri+'s origin.
def tdmrep_uri(uri)
  origin = base_origin(uri)
  "#{origin}/.well-known/tdmrep.json"
end
41
+
42
# URL of the root-level trust.txt file at +uri+'s origin.
def trusttxt_uri(uri)
  origin = base_origin(uri)
  "#{origin}/trust.txt"
end
45
+
46
# URL of the /.well-known/trust.txt file at +uri+'s origin.
def trusttxt_well_known_uri(uri)
  origin = base_origin(uri)
  "#{origin}/.well-known/trust.txt"
end
49
+
50
# Origin of +uri+ as "scheme://host[:port]". The port is left out only when it
# is 443 for https or 80 for http.
def base_origin(uri)
  standard_port = { "https" => 443, "http" => 80 }[uri.scheme]
  port_suffix = standard_port && uri.port == standard_port ? nil : ":#{uri.port}"
  "#{uri.scheme}://#{uri.host}#{port_suffix}"
end
58
+
59
# Key identifying +uri+'s origin; currently the same string base_origin returns.
def origin_key(uri)
  base_origin(uri)
end
62
+
63
# Parses +value+ into an absolute http(s) URI.
#
# @param value [#to_s] URL text; surrounding whitespace is ignored
# @return [URI::HTTP]
# @raise [ArgumentError] when the text is not a valid URI, is not http/https,
#   or has no host
def parse_http_uri(value)
  uri = URI.parse(value.to_s.strip)
  # "http:///path" can parse with an empty-string host, which is truthy, so
  # test for emptiness rather than presence.
  unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
    raise ArgumentError, "unsupported url: #{value}"
  end

  uri
rescue URI::InvalidURIError
  raise ArgumentError, "unsupported url: #{value}"
end
73
+
74
# First value of header +name+, trimmed; "" when the header is absent.
# Accepts both a single value and a list of values.
def first_header_value(headers, name)
  values = Array(headers[name])
  values.first.to_s.strip
end
77
+
78
# All values of header +name+ as an array.
#
# Mirrors first_header_value in tolerating a missing header, a nil value, or a
# single non-array value, so callers can always iterate the result.
#
# @param headers [Hash] header values keyed by name
# @param name [String]
# @return [Array]
def header_values(headers, name)
  Array(headers[name])
end
81
+
82
# Decides whether a response is HTML.
#
# The Content-Type header wins; without an HTML media type the start of the
# body is sniffed for a doctype or an <html> tag. Both checks ignore case:
# media types are case-insensitive and "<!doctype html>" is the usual
# lowercase spelling.
#
# @param headers [Hash] header values keyed by lowercase name
# @param body [String, nil]
# @return [Boolean]
def html_content?(headers, body)
  # .b gives a binary copy so downcasing is ASCII-only and cannot raise on
  # invalid byte sequences.
  content_type = first_header_value(headers, "content-type").b.downcase
  return true if content_type.include?("text/html") || content_type.include?("application/xhtml+xml")

  opening = body.to_s.lstrip[0, 32].to_s.b.downcase
  opening.start_with?("<!doctype html", "<html")
end
88
+
89
# Extracts every <meta ...> tag from +body+ with regular expressions.
#
# @return [Array<Hash{String=>String}>] one hash per tag, keyed by lowercase
#   attribute name, with HTML entities in the values decoded. Quoted and
#   unquoted attribute values are both supported.
def parse_meta_tags(body)
  body.to_s.scan(/<meta\b[^>]*>/im).map do |tag|
    pairs = tag.scan(/([A-Za-z_:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/)
    pairs.each_with_object({}) do |(name, quoted, single, bare), attributes|
      attributes[name.downcase] = CGI.unescapeHTML(quoted || single || bare || "")
    end
  end
end
98
+
99
# Decides whether a response looks like JSON.
#
# True for a JSON or JSON-LD Content-Type (compared case-insensitively, since
# media types are case-insensitive), or when the body starts with "{" or "["
# after leading whitespace.
#
# @param headers [Hash] header values keyed by lowercase name
# @param body [String, nil]
# @return [Boolean]
def json_like_response?(headers, body)
  # .b gives a binary copy so downcasing is ASCII-only and cannot raise on
  # invalid byte sequences.
  content_type = first_header_value(headers, "content-type").b.downcase
  return true if content_type.include?("application/json") || content_type.include?("application/ld+json")

  body.to_s.lstrip.start_with?("{", "[")
end
106
+ end
107
+ end
108
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Regulatory
5
+ module Headers
6
# Signals from every X-Robots-Tag header value, concatenated in header order.
def extract_x_robot_signals(headers, path:)
  tag_values = header_values(headers, "x-robots-tag")
  tag_values.flat_map { |tag_value| extract_robot_directive_signals(tag_value, path: path) }
end
11
+
12
# Signals from every Content-Usage header value, concatenated in header order.
def extract_content_usage_header_signals(headers, path:)
  statements = header_values(headers, "content-usage")
  statements.flat_map { |statement| extract_content_usage_statement_signals(statement, path: path) }
end
17
+
18
# Signals from robots-related <meta> tags.
#
# Tags handled:
# * http-equiv="x-robots-tag": parsed like the header (no meta name passed)
# * name="robots", or any name containing "bot": parsed with the name passed
#   along as +meta_name+
# Tags without a name, and the tdm-reservation / tdm-policy tags, are skipped.
#
# @param meta_tags [Array<Hash>] attribute hashes keyed by lowercase name
def extract_meta_robot_signals(meta_tags, path:)
  meta_tags.flat_map do |attributes|
    if attributes["http-equiv"].to_s.casecmp?("x-robots-tag")
      next extract_robot_directive_signals(attributes["content"], path: path)
    end

    name = attributes["name"].to_s.strip
    next [] if name.empty?
    next [] if name.casecmp?("tdm-reservation") || name.casecmp?("tdm-policy")
    next [] unless name.casecmp?("robots") || name.match?(/bot/i)

    extract_robot_directive_signals(attributes["content"], path: path, meta_name: name)
  end
end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+
5
+ module FetchUtil
6
+ class Regulatory
7
+ class HttpClient
8
+ REDIRECT_LIMIT = 5
9
+
10
# @param timeout [#to_f] seconds, applied to both the open and read timeouts
# @param user_agent [#to_s] User-Agent header value; surrounding whitespace is
#   trimmed and a blank value means the header is not sent
def initialize(timeout:, user_agent:)
  @user_agent = user_agent.to_s.strip
  @timeout = timeout.to_f
end
14
+
15
# GETs +url+, following at most +limit+ redirects.
#
# @raise [ArgumentError] when +url+ is not a usable http(s) URL
def get(url, limit: REDIRECT_LIMIT)
  fetch(parse_http_uri(url), limit, [])
end
19
+
20
+ private
21
+
22
+ attr_reader :timeout, :user_agent
23
+
24
# Requests +uri+ and follows redirects recursively.
#
# @param uri [URI::HTTP] URL to request
# @param limit [Integer] redirects still allowed
# @param redirects [Array] responses of the redirects followed so far
# @return the response built by build_response for the final hop, carrying
#   the earlier redirect responses
# @raise [FetchUtil::Error] when the redirect budget is exhausted, or when a
#   redirect points at a malformed or non-HTTP location
def fetch(uri, limit, redirects)
  response = request(uri)
  return build_response(uri, response, redirects: redirects) unless response.is_a?(Net::HTTPRedirection)

  raise FetchUtil::Error, "too many redirects for #{uri}" if limit <= 0

  location = response["location"].to_s.strip
  # A redirect status without Location (for example 304) is treated as final.
  return build_response(uri, response, redirects: redirects) if location.empty?

  # Validate the target before requesting it. A malformed Location raises a
  # URI error and a non-HTTP scheme would fail later with NoMethodError; turn
  # both into FetchUtil::Error, which this method already raises.
  target = begin
    parse_http_uri(uri.merge(location))
  rescue URI::Error, ArgumentError
    raise FetchUtil::Error, "unsupported redirect from #{uri} to #{location}"
  end

  fetch(target, limit - 1, redirects + [build_response(uri, response)])
end
36
+
37
# Sends a single GET to +uri+ (no redirect handling) and returns the raw
# Net::HTTP response. TLS is used for https; the configured timeout applies to
# both connecting and reading.
def request(uri)
  connection = Net::HTTP.new(uri.host, uri.port)
  connection.use_ssl = uri.scheme == "https"
  connection.open_timeout = timeout
  connection.read_timeout = timeout

  target = uri.request_uri.empty? ? "/" : uri.request_uri
  get_request = Net::HTTP::Get.new(target)
  get_request["Accept"] = "text/html,application/json,text/plain,*/*"
  get_request["User-Agent"] = user_agent unless user_agent.empty?

  connection.request(get_request)
end
47
+
48
# Wraps a raw response in a FetchUtil::Regulatory::Response: URL as a string,
# integer status, headers keyed by lowercase name, body as a string, and the
# given redirect history.
def build_response(uri, response, redirects: [])
  normalized_headers = response.to_hash.transform_keys(&:downcase)

  FetchUtil::Regulatory::Response.new(
    url: uri.to_s,
    status: response.code.to_i,
    headers: normalized_headers,
    body: response.body.to_s,
    redirects: redirects
  )
end
57
+
58
# Parses +url+ into an absolute http(s) URI.
#
# @param url [#to_s] URL text; surrounding whitespace is ignored
# @return [URI::HTTP]
# @raise [ArgumentError] when the text is not a valid URI, is not http/https,
#   or has no host
def parse_http_uri(url)
  uri = URI.parse(url.to_s.strip)
  # "http:///path" can parse with an empty-string host, which is truthy, so
  # test for emptiness rather than presence.
  unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
    raise ArgumentError, "unsupported url: #{url}"
  end

  uri
rescue URI::InvalidURIError
  raise ArgumentError, "unsupported url: #{url}"
end
68
+ end
69
+ end
70
+ end