fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "thor"
5
+
6
+ module FetchUtil
7
# Thor command-line front end for FetchUtil.
#
# Commands: `version`, `fetch`, `search` and `regulatory`. Every command
# shares the class-level options declared below (timeouts, output format,
# request-log path, ...).
class CLI < Thor
  # Result fields kept by #result_payload for the json / jsonl formats.
  DEFAULT_FETCH_FIELDS = %i[
    url
    final_url
    canonical_url
    title
    byline
    site_name
    published_time
    markdown
    content_type
    suspect
    warnings
  ].freeze

  class_option :log_path, type: :string, desc: "Append-only request log path"
  class_option :format, type: :string, default: "markdown", enum: %w[markdown json jsonl], desc: "Output format"
  class_option :timeout, type: :numeric, default: 20
  class_option :wait, type: :numeric, default: 0.75
  class_option :concurrency, type: :numeric, default: 4
  class_option :reader_mode, type: :boolean, default: true
  class_option :wait_for_idle, type: :boolean, default: true
  class_option :include_html, type: :boolean, default: false, desc: "Include raw html in fetch output"

  # Thor consults this hook when it handles an error itself (unknown
  # command, option value outside its enum, ...). Without it Thor prints a
  # deprecation warning and exits with status 0, which hides failures from
  # shell scripts and CI.
  #
  # @return [Boolean] always true
  def self.exit_on_failure?
    true
  end

  desc "version", "Display fetch_util version"
  # Prints the gem version.
  def version
    puts FetchUtil::VERSION
  end

  desc "fetch URL [URL...]", "Fetch one or more URLs"
  # Fetches every URL and prints the results in the selected format.
  #
  # * markdown: each result's markdown, separated by a `---` rule.
  # * json with exactly one URL: a single JSON object.
  # * anything else: a JSON array, or one JSON document per line for jsonl.
  #
  # @param urls [Array<String>] URLs given on the command line
  # @raise [ArgumentError] when no URL was given
  def fetch(*urls)
    raise ArgumentError, "at least one URL is required" if urls.empty?

    single = urls.length == 1
    results =
      if single
        [FetchUtil.fetch(urls.first, **fetch_options, request_log: request_log)]
      else
        FetchUtil.fetch_many(urls, **fetch_options, request_log: request_log, concurrency: options[:concurrency])
      end

    if options[:format] == "markdown"
      results.each_with_index do |result, index|
        puts "\n---\n\n" if index.positive?
        puts result.markdown
      end
    elsif single && options[:format] == "json"
      emit(result_payload(results.first))
    else
      emit(results.map { |result| result_payload(result) })
    end
  end

  desc "search QUERY", "Search across configured engines and aggregate results"
  option :source, type: :array, default: FetchUtil::Searcher::DEFAULT_SOURCES, desc: "Search sources"
  option :limit, type: :numeric, default: 10
  option :verbose_search, type: :boolean, default: false, desc: "Include per-result search provenance"
  # Joins the given terms into one query, runs it through Searcher and
  # prints the returned payload as JSON.
  #
  # @param terms [Array<String>] query words
  # @raise [ArgumentError] when the joined query is blank
  def search(*terms)
    query = terms.join(" ").strip
    raise ArgumentError, "query is required" if query.empty?

    sources = options[:source]
    searcher = Searcher.new(
      request_log: request_log,
      sources: sources,
      limit: options[:limit],
      # Never ask for more workers than there are sources.
      concurrency: [options[:concurrency], sources.length].min,
      verbose: options[:verbose_search],
      **fetch_options
    )

    emit(searcher.search(query))
  end

  desc "regulatory URL", "Inspect regulatory crawl, index, and TDM signals for one URL"
  option :sources, type: :string, default: "machine", desc: "Comma-separated source selectors, e.g. machine,-robotstxt or human,machine,-human"
  option :cache_path, type: :string, desc: "Structured regulatory cache directory"
  # Records the inspection in the request log, then prints the payload
  # returned by FetchUtil.regulatory as JSON.
  #
  # @param url [String] URL to inspect
  # @raise [ArgumentError] when the URL is blank
  def regulatory(url)
    raise ArgumentError, "url is required" if url.to_s.strip.empty?

    request_log.append("regulatory://#{url}?sources=#{options[:sources]}")
    payload = FetchUtil.regulatory(
      url,
      cache_path: options[:cache_path],
      sources: options[:sources],
      timeout: options[:timeout]
    )

    emit(payload)
  end

  no_commands do
    # Memoized request log. The path comes from --log-path, then the
    # FETCH_UTIL_REQUEST_LOG environment variable, then the default path.
    def request_log
      @request_log ||= RequestLog.new(path: options[:log_path] || ENV.fetch("FETCH_UTIL_REQUEST_LOG", RequestLog::DEFAULT_PATH))
    end

    # Subset of the CLI options that is forwarded to the fetch layer.
    def fetch_options
      {
        timeout: options[:timeout],
        wait: options[:wait],
        wait_for_idle: options[:wait_for_idle],
        reader_mode: options[:reader_mode]
      }
    end

    # Reduces a result to DEFAULT_FETCH_FIELDS (plus :html when
    # --include-html is set) and drops nil / empty-string values.
    def result_payload(result)
      payload = result.to_h.select { |key, _value| DEFAULT_FETCH_FIELDS.include?(key) }
      payload[:html] = result.html if options[:include_html]

      payload.reject { |_key, value| value.nil? || value == "" }
    end

    # Prints the payload as JSON: one document per array item for jsonl,
    # a single document otherwise.
    def emit(payload)
      if options[:format] == "jsonl" && payload.is_a?(Array)
        payload.each { |item| puts JSON.generate(item) }
      else
        puts JSON.generate(payload)
      end
    end
  end
end
124
+ end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module FetchUtil
6
# Runs the bundled extraction scripts inside a browser page and returns the
# payload they produce.
class Extractor
  # Scripts are injected in this order; the extraction call is made after
  # all three are loaded.
  ASSET_SCRIPTS = %w[vendor/readability.js vendor/turndown.js extract.js].freeze
  private_constant :ASSET_SCRIPTS

  # @param reader_mode [Boolean] forwarded to the in-page extract call
  # @param asset_root [String, nil] directory holding the scripts; defaults
  #   to the "assets" directory next to this file
  def initialize(reader_mode: true, asset_root: nil)
    @reader_mode = reader_mode
    @asset_root = asset_root || File.join(__dir__, "assets")
  end

  # Extracts the page and returns the payload Hash.
  #
  # @param page [Object] page object responding to add_script_tag / evaluate
  # @return [Hash]
  # @raise [ExtractionError] when the page yields something other than a
  #   Hash, or when Ferrum reports a JavaScript, status or timeout error
  def extract(page)
    payload = extract_payload(page)
    return payload if payload.is_a?(Hash)

    raise ExtractionError, "Page extraction returned no content"
  rescue Ferrum::JavaScriptError, Ferrum::StatusError, Ferrum::TimeoutError => e
    raise ExtractionError, e.message
  end

  private

  # Loads each script through a script tag pointing at the file on disk.
  def inject_assets(page)
    ASSET_SCRIPTS.each do |relative_path|
      page.add_script_tag(path: asset_path(relative_path))
    end
  end

  # Loads each script by evaluating its source text directly in the page.
  def inject_assets_inline(page)
    ASSET_SCRIPTS.each do |relative_path|
      source = File.read(asset_path(relative_path), encoding: "UTF-8")
      page.evaluate("#{source}\ntrue")
    end
  end

  # First tries script-tag injection. If that times out, stops the page
  # load and retries once with inline injection.
  def extract_payload(page)
    inject_assets(page)
    page.evaluate(extraction_call)
  rescue Ferrum::TimeoutError
    halt_loading(page)
    inject_assets_inline(page)
    page.evaluate(extraction_call)
  end

  # Best effort: a failure to stop the page must not abort the retry.
  def halt_loading(page)
    page.evaluate("window.stop && window.stop()")
  rescue Ferrum::Error
    nil
  end

  # JavaScript expression that invokes the in-page extractor.
  def extraction_call
    arguments = JSON.generate({ reader_mode: @reader_mode })
    "window.FetchUtilExtract.extract(#{arguments})"
  end

  def asset_path(relative_path)
    File.join(@asset_root, relative_path)
  end
end
56
+ end
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module FetchUtil
6
# Fetches a single URL: opens it in a browser page, runs the extractor and
# wraps the payload in a Result. When the URL looks like documentation and
# the rendered result is poor (or the browser/extractor failed), the raw-docs
# fallback is tried instead.
class Fetcher
  # Phrases that mark a root page as a news index rather than an article.
  HOMEPAGE_INDEX_PATTERN = Regexp.new(
    "top stories|breaking news|latest news|headlines|" \
    "aktuelle nachrichten|schlagzeilen|neueste nachrichten|" \
    "à la une|dernières nouvelles|actualités|últimas noticias|" \
    "noticias principales|notizie principali|ultime notizie|" \
    "najnowsze wiadomości|najważniejsze|ostatnie wiadomości|aktualności|" \
    "actualiteit|laatste nieuws|senaste nyheter|seneste nyheder|" \
    "siste nytt|tuoreimmat uutiset|aktuálně|legfrissebb|" \
    "cele mai noi știri|aktualności|најновије вести|останні новини|" \
    "τελευταία νέα|güncel haberler|son dakika|senaste nyheterna|" \
    "viktigaste nyheterna|aktualitātes|jaunākās ziņas|naujienos|" \
    "svarbiausios naujienos|главные новости|últimas notícias|" \
    "najnovšie správy|najnovije vijesti|derniers articles",
    Regexp::IGNORECASE
  ).freeze
  DOCS_PORTAL_TITLE_PATTERN = /documentation|docs|the ultimate server/i
  # Query parameters removed from result URLs (challenge tokens, trackers).
  STRIPPED_QUERY_PARAM_PATTERNS = [
    /\A(?:__goaway_|__cf_chl_)/,
    /\A(?:utm_[a-z]+|fbclid|gclid|mc_cid|mc_eid)\z/,
    /\A__gr(?:sc|ts|ua|rn)\z/
  ].freeze
  SECOND_LEVEL_COUNTRY_TLDS = /\A(co|com|org|net|gov|edu|ac)\z/
  GOOGLE_HOST_PATTERN = /\Agoogle\.[a-z.]+\z/
  # Dotted-quad hosts have no registrable domain; they are compared whole.
  IPV4_HOST_PATTERN = /\A\d{1,3}(?:\.\d{1,3}){3}\z/
  # Extractor warnings that mark a docs page as not worth keeping.
  POOR_RESULT_WARNINGS = %w[not_found_interstitial empty_extraction short_extraction].freeze

  # @param browser [Object, nil] browser wrapper; built from options when nil
  # @param extractor [Object, nil] extractor; built from options when nil
  # @param options [Hash] :timeout, :reader_mode, :raw_docs_fallback,
  #   :request_log, plus the browser keys listed in #browser_options
  def initialize(browser: nil, extractor: nil, **options)
    @timeout = options.fetch(:timeout, 20)
    @browser = browser || Browser.new(**browser_options(options))
    @extractor = extractor || Extractor.new(reader_mode: options.fetch(:reader_mode, true))
    @raw_docs_fallback = options[:raw_docs_fallback] || RawDocsFallback.new(timeout: @timeout)
    @request_log = options[:request_log]
  end

  # Shuts down the underlying browser.
  def quit
    @browser.quit
  end

  # Fetches the URL and returns a Result.
  #
  # @param url [String]
  # @return [Result]
  # @raise [BrowserError, ExtractionError] re-raised when the browser or
  #   extractor failed and no docs fallback produced a result
  def fetch(url)
    started_at = monotonic_now
    result = @browser.with_page(url) do |page|
      payload = @extractor.extract(page)
      build_result(url, page.current_url, payload)
    end
    fallback = docs_fallback_candidate?(url, result) && poor_docs_result?(result) ? @raw_docs_fallback.fetch(url) : nil
    result = fallback_result(url, fallback) if fallback
    log_request(url, started_at)
    result
  rescue BrowserError, ExtractionError
    fallback = docs_fallback_candidate?(url) ? @raw_docs_fallback.fetch(url) : nil
    if fallback
      result = fallback_result(url, fallback)
      log_request(url, started_at)
      return result
    end

    log_request(url, started_at)
    raise
  end

  private

  # Builds the Result for an extraction payload, deriving content type,
  # warnings and the suspect flag, and mirroring the descriptive fields
  # into a frozen metadata hash.
  def build_result(url, final_url, payload)
    final_url = normalized_result_url(final_url)
    canonical_url = normalized_result_url(payload["canonicalUrl"])
    homepage_like = homepage_like?(final_url)
    content_type = resolved_content_type(homepage_like, payload)
    warnings = resolved_warnings(content_type, homepage_like, payload, requested_url: url, final_url: final_url)

    metadata = {
      title: payload["title"],
      byline: payload["byline"],
      excerpt: payload["excerpt"],
      site_name: payload["siteName"],
      published_time: payload["publishedTime"],
      canonical_url: canonical_url,
      language: payload["language"],
      content_url: final_url,
      reader_mode: payload["readerMode"],
      content_type: content_type,
      suspect: warnings.any?,
      warnings: warnings,
      # A missing ratio is treated as complete.
      content_completeness_ratio: payload["contentCompletenessRatio"]&.to_f || 1.0,
      content_format: payload["contentFormat"],
      paywall_state: payload["paywallState"]
    }.freeze

    # Every metadata entry except :content_url is also a Result attribute.
    Result.new(
      url: url,
      final_url: final_url,
      html: payload["html"],
      markdown: payload["markdown"],
      metadata: metadata,
      **metadata.reject { |key, _value| key == :content_url }
    )
  end

  # The payload's content type (default "article"); an "article" on a
  # homepage-like URL whose text reads like a news index becomes "list".
  def resolved_content_type(homepage_like, payload)
    content_type = payload["contentType"] || "article"
    return content_type unless content_type == "article"
    return "list" if homepage_like && homepage_index_markdown?(payload["title"], payload["markdown"])

    content_type
  end

  # Payload warnings plus the ones derived from the URLs, de-duplicated.
  def resolved_warnings(content_type, homepage_like, payload, requested_url: nil, final_url: nil)
    warnings = Array(payload["warnings"]).dup
    warnings << "homepage_index_page" if content_type == "list" && homepage_like
    warnings << "cross_domain_redirect" if cross_domain_redirect?(requested_url, final_url)
    warnings << "aggregator_redirect_url" if aggregator_url?(requested_url)
    warnings.uniq
  end

  # True when the URL has no path or only "/"; false for unparsable URLs.
  def homepage_like?(url)
    path = URI.parse(url).path
    path.nil? || path.empty? || path == "/"
  rescue URI::InvalidURIError
    false
  end

  # True when title/markdown mention an index phrase and the markdown has
  # at least three list-item lines.
  def homepage_index_markdown?(title, markdown)
    snippet = [title, markdown].compact.join(" ")
    return false unless snippet.match?(HOMEPAGE_INDEX_PATTERN)

    markdown.to_s.each_line.count { |line| line.match?(/^\s*(?:\d+\.\s+|[-*]\s+)/) } >= 3
  end

  # The fallback value is splatted into build_result after the requested
  # URL, so it is presumably a [final_url, payload] pair - verify against
  # RawDocsFallback#fetch.
  def fallback_result(url, fallback)
    build_result(url, *fallback)
  end

  # True when the requested URL, or the result's final/canonical URL,
  # is recognised by FetchUtil.docs_like_url?.
  def docs_fallback_candidate?(requested_url, result = nil)
    candidates = [requested_url]
    candidates.push(result.final_url, result.canonical_url) if result

    candidates.compact.any? { |candidate| FetchUtil.docs_like_url?(candidate) }
  end

  # Options forwarded to Browser.new.
  def browser_options(options)
    options.slice(:timeout, :wait, :wait_for_idle, :idle_duration, :viewport,
                  :user_agent, :accept_language, :browser_path, :browser_options)
  end

  # Appends the URL and elapsed time to the request log, if one is set.
  def log_request(url, t0)
    @request_log&.append(url, duration: monotonic_now - t0)
  end

  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Heuristics for "the rendered docs page is not usable".
  def poor_docs_result?(result)
    markdown = result.markdown.to_s
    title = result.title.to_s

    return true if POOR_RESULT_WARNINGS.any? { |warning| result.warnings.include?(warning) }
    return true if markdown.include?("Interstitial: requested page is unavailable")
    return false unless title.match?(DOCS_PORTAL_TITLE_PATTERN)

    # Portal-titled pages: nearly empty text, or several top-level headings.
    FetchUtil.normalize_whitespace(markdown).length < 160 || markdown.scan(/^# /).length >= 2
  end

  # Approximates the registrable domain of the URL's host: the last two
  # labels, or the last three for hosts like "example.co.uk". IPv4 hosts
  # are returned whole; truncating them to two octets would make unrelated
  # addresses such as 10.0.0.1 and 192.168.0.1 compare equal.
  #
  # @return [String, nil] nil when the URL cannot be parsed
  def effective_domain(url)
    host = FetchUtil.strip_www_host(url)
    return host if host.match?(IPV4_HOST_PATTERN)

    parts = host.split(".")
    return host if parts.length <= 2

    if parts[-2].match?(SECOND_LEVEL_COUNTRY_TLDS) && parts[-1].length == 2
      parts.last(3).join(".")
    else
      parts.last(2).join(".")
    end
  rescue URI::InvalidURIError
    nil
  end

  # True when both URLs resolve to a domain and the domains differ.
  def cross_domain_redirect?(requested_url, final_url)
    return false if requested_url.nil? || final_url.nil?

    requested_domain = effective_domain(requested_url)
    final_domain = effective_domain(final_url)
    return false if requested_domain.nil? || final_domain.nil?

    requested_domain != final_domain
  end

  # True for Google News, AMP cache and Google "/url" redirect links.
  def aggregator_url?(url)
    return false if url.nil?

    host = FetchUtil.strip_www_host(url)
    path = URI.parse(url).path.to_s

    host == "news.google.com" ||
      host == "cdn.ampproject.org" || host.end_with?(".cdn.ampproject.org") ||
      (host.match?(GOOGLE_HOST_PATTERN) && path == "/url")
  rescue URI::InvalidURIError
    false
  end

  # Removes the parameters matching STRIPPED_QUERY_PARAM_PATTERNS from the
  # URL's query. The query is filtered segment by segment rather than
  # decoded and re-encoded, so the parameters that stay keep their exact
  # original spelling ("%20" is not turned into "+", "flag" not into
  # "flag="). nil, empty and unparsable URLs are returned unchanged.
  def normalized_result_url(url)
    return url if url.nil? || url.empty?

    uri = URI.parse(url)
    kept = uri.query.to_s.split("&").reject { |segment| segment.empty? || stripped_query_segment?(segment) }
    uri.query = kept.empty? ? nil : kept.join("&")
    uri.to_s
  rescue URI::InvalidURIError
    url
  end

  # True when the raw "key=value" segment's decoded key is on the strip list.
  def stripped_query_segment?(segment)
    key = URI.decode_www_form_component(segment.split("=", 2).first.to_s)
    STRIPPED_QUERY_PARAM_PATTERNS.any? { |pattern| key.match?(pattern) }
  rescue ArgumentError
    # Undecodable key: keep the segment untouched.
    false
  end
end
242
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
# Fetches many URLs concurrently with a fixed number of worker threads,
# each owning its own fetcher instance.
class ParallelFetcher
  # One failed job. index and url are nil when the worker itself failed
  # (fetcher construction or shutdown) rather than a specific URL.
  Failure = Struct.new(:index, :url, :error, keyword_init: true)

  # Raised when at least one job or worker failed. Carries every failure
  # and the (partially filled) results array.
  class ParallelFetchError < Error
    attr_reader :failures, :results

    # @param failures [Array<Failure>]
    # @param results [Array, nil] results collected so far; slots of failed
    #   jobs are nil
    def initialize(failures, results = nil)
      @failures = failures.freeze
      @results = results&.freeze
      super(self.class.build_message(@failures))
    end

    # @return [Array<Exception>] the underlying exceptions
    def errors
      @failures.map(&:error)
    end

    # Summarises the first three failures and counts the rest.
    def self.build_message(failures)
      preview = failures.first(3).map do |failure|
        label = failure.url || "<initialization>"
        "#{label} (#{failure.error.class}: #{failure.error.message})"
      end.join(", ")
      suffix = failures.length > 3 ? ", +#{failures.length - 3} more" : ""
      "parallel fetch failed for #{failures.length} URLs: #{preview}#{suffix}"
    end
  end

  DEFAULT_CONCURRENCY = 4

  # @param fetcher_factory [#call, nil] builds one fetcher per worker;
  #   defaults to Fetcher.new(**fetch_options)
  # @param concurrency [Integer] worker count, clamped to at least 1
  # @param fetch_options [Hash] forwarded to the default factory
  def initialize(fetcher_factory: nil, concurrency: DEFAULT_CONCURRENCY, **fetch_options)
    @fetcher_factory = fetcher_factory || -> { Fetcher.new(**fetch_options) }
    @concurrency = [concurrency.to_i, 1].max
  end

  # Fetches every non-blank URL and returns the results in input order.
  #
  # @param urls [Array<#to_s>, #to_s, nil]
  # @return [Array] one result per URL, positionally aligned with the
  #   cleaned input list
  # @raise [ParallelFetchError] when any job or worker failed; failures are
  #   ordered by input index (worker-level failures first) so the error is
  #   the same from run to run regardless of thread scheduling
  def fetch(urls)
    work = Array(urls).compact.map(&:to_s).reject(&:empty?)
    return [] if work.empty?

    jobs = Queue.new
    work.each_with_index { |url, index| jobs << [index, url] }
    # A closed queue makes pop return nil once drained instead of blocking.
    jobs.close

    failures = Queue.new
    results = Array.new(work.length)

    workers = Array.new([@concurrency, work.length].min) do
      Thread.new { run_worker(jobs, results, failures) }
    end
    workers.each(&:join)

    raise_for_failures(ordered_failures(failures), results)

    results
  end

  private

  # Body of one worker thread: build a fetcher, process jobs until the
  # queue is drained, always quit the fetcher. Per-URL errors are recorded
  # and do not stop the worker; errors from the factory or from quit are
  # recorded as worker-level failures.
  def run_worker(jobs, results, failures)
    fetcher = @fetcher_factory.call

    begin
      while (job = jobs.pop)
        index, url = job
        begin
          results[index] = fetcher.fetch(url)
        rescue StandardError => e
          failures << Failure.new(index: index, url: url, error: e)
        end
      end
    ensure
      fetcher.quit if fetcher.respond_to?(:quit)
    end
  rescue StandardError => e
    failures << Failure.new(index: nil, url: nil, error: e)
  end

  # Drained failures sorted by input index; arrival order breaks ties.
  def ordered_failures(queue)
    drain_queue(queue).each_with_index
                      .sort_by { |failure, arrival| [failure.index || -1, arrival] }
                      .map(&:first)
  end

  # Empties the queue into an array. Only called after every worker has
  # been joined, so nothing is still producing.
  def drain_queue(queue)
    items = []
    items << queue.pop(true) until queue.empty?
    items
  end

  def raise_for_failures(failures, results)
    return if failures.empty?

    raise ParallelFetchError.new(failures, results)
  end
end
97
+ end