fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Reads a site's tdmrep document and converts its rules into signals and
    # policy references.
    module TdmRep
      # Fetches the tdmrep record for the origin of +requested_uri+ through
      # fetch_record. The block turns the response body into a hash with
      # "signals" (sorted via sort_specificity_signals) and "policies"; the
      # empty record is passed to fetch_record as its fallback.
      def tdmrep_record(requested_uri)
        cache_key = "tdmrep:#{origin_key(requested_uri)}"
        source_uri = tdmrep_uri(requested_uri)
        empty_record = { "signals" => [], "policies" => [] }

        fetch_record(cache_key, source_uri, fallback: empty_record) do |body|
          found_signals, found_policies = extract_tdmrep_signals(body)
          { "signals" => sort_specificity_signals(found_signals), "policies" => found_policies }
        end
      end

      # Parses +body+ as JSON and returns [signals, policies].
      #
      # Only a top-level JSON array is accepted; entries that are not objects,
      # have a blank "location", or carry a "tdm-reservation" other than
      # "0"/"1" are skipped. Unparseable JSON yields two empty arrays.
      def extract_tdmrep_signals(body)
        document = JSON.parse(body.to_s)
        return [[], []] unless document.is_a?(Array)

        collected_signals = []
        collected_policies = []

        document.grep(Hash).each do |entry|
          location, reservation, policy_url =
            %w[location tdm-reservation tdm-policy].map { |field| string_value(entry, field) }
          next if location.empty? || !%w[0 1].include?(reservation)

          entry_signals, entry_policies = extract_tdm_value_signals(
            reservation: reservation,
            policy_url: policy_url,
            path: normalize_output_path(location)
          )
          collected_signals.concat(entry_signals)
          collected_policies.concat(entry_policies)
        end

        [collected_signals, dedupe_policy_refs(collected_policies)]
      rescue JSON::ParserError
        [[], []]
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Private helpers shared by the TDM parsers: token normalisation, tolerant
    # hash accessors, and policy-reference construction / de-duplication.
    module TdmSupport
      private

      # True when +value+ normalises (see #odrl_token) to "mine".
      def tdm_policy_action?(value)
        odrl_token(value) == "mine"
      end

      # Normalises a duty into a hyphenated lowercase name.
      #
      # +value+ is either a Hash (its "action" entry is used) or anything
      # responding to to_s. Returns nil when the action is empty.
      # "obtainconsent" becomes "obtain-consent"; any other token has runs of
      # non-alphanumerics collapsed to "-" with hyphens trimmed from BOTH ends.
      def duty_name(value)
        action = value.is_a?(Hash) ? value["action"].to_s : value.to_s
        return nil if action.empty?

        # odrl_token already lowercases, so no further downcase is needed.
        normalized = odrl_token(action)
        case normalized
        when "obtainconsent"
          "obtain-consent"
        when "compensate"
          "compensate"
        else
          # gsub (not sub): sub replaces only the first match and would leave
          # a trailing hyphen whenever a leading one was also present.
          normalized.gsub(/[^a-z0-9]+/, "-").gsub(/\A-+|-+\z/, "")
        end
      end

      # Returns "research" or "non-research" from the first constraint of
      # +permission+ whose leftOperand is "purpose", operator is "eq" and
      # rightOperand is one of those two tokens; nil when none qualifies.
      def permission_purpose(permission)
        array_value(permission, "constraint").each do |constraint|
          next unless constraint.is_a?(Hash)
          next unless odrl_token(constraint["leftOperand"]) == "purpose"
          next unless odrl_token(constraint["operator"]) == "eq"

          right_operand = odrl_token(constraint["rightOperand"])
          return "research" if right_operand == "research"
          return "non-research" if right_operand == "non-research"
        end

        nil
      end

      # Converts a policy target into a normalised path.
      #
      # HTTP(S) URLs go through request_target; strings starting with "/" are
      # normalised as-is. Blank, unparseable or other targets yield nil.
      def policy_target_path(value)
        target = value.to_s.strip
        return nil if target.empty?

        uri = URI.parse(target)
        return normalize_output_path(request_target(uri)) if uri.is_a?(URI::HTTP)
        return normalize_output_path(target) if target.start_with?("/")

        nil
      rescue URI::InvalidURIError
        nil
      end

      # hash[key] as a stripped string ("" when the key is missing or nil).
      def string_value(hash, key)
        hash[key].to_s.strip
      end

      # hash[key] coerced to an Array: arrays pass through, nil becomes [],
      # any other value is wrapped in a one-element array.
      def array_value(hash, key)
        value = hash[key]
        return value if value.is_a?(Array)
        return [] if value.nil?

        [value]
      end

      # Builds a {"url", "path"} reference for a policy URL.
      #
      # Returns nil when +url+ is blank or when parse_http_uri raises
      # ArgumentError; the parsed value itself is discarded, the call is used
      # only as a validity check.
      def policy_ref(url, path)
        candidate = url.to_s.strip
        return nil if candidate.empty?

        parse_http_uri(candidate)
        { "url" => candidate, "path" => path }
      rescue ArgumentError
        nil
      end

      # Drops nil entries and later duplicates (same "url" and "path"),
      # keeping first-seen order.
      def dedupe_policy_refs(policy_refs)
        seen = {}
        list = []
        Array(policy_refs).compact.each do |policy_ref|
          key = [policy_ref["url"], policy_ref["path"]]
          next if seen[key]

          seen[key] = true
          list << policy_ref
        end
        list
      end

      # Lowercases and strips +value+, then keeps only the part after the last
      # "#" or ":" (so "tdm:Mine" and "http://example/ns#mine" both give
      # "mine"). Returns "" for blank input.
      def odrl_token(value)
        value.to_s.strip.downcase.split(/[#:]/).last.to_s
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Reads a site's trust.txt and extracts its "datatrainingallowed"
    # preference as an "ai-training" signal.
    module TrustTxt
      # Fetches the trust.txt record for the origin of +requested_uri+ via
      # fetch_record, offering both the root and the well-known location.
      # The block maps the response body to {"signals" => [...]}.
      def trusttxt_record(requested_uri)
        fetch_record(
          "trusttxt:#{origin_key(requested_uri)}",
          [trusttxt_uri(requested_uri), trusttxt_well_known_uri(requested_uri)],
          fallback: { "signals" => [] }
        ) do |body|
          signals = extract_trusttxt_signals(body)
          { "signals" => sort_usage_preference_signals(signals) }
        end
      end

      # Scans +body+ line by line for `datatrainingallowed=<yes|no>` (key and
      # value are case-insensitive; "#" starts a comment).
      #
      # Returns a one-element array holding the signal built by build_signal,
      # or [] when no usable directive is present. The last matching line
      # decides: a later line with an unrecognised value clears an earlier
      # yes/no.
      def extract_trusttxt_signals(body)
        preference = nil

        body.to_s.gsub("\r\n", "\n").gsub("\r", "\n").each_line do |line|
          # No `\z` anchor here: each_line keeps the trailing "\n", which `.`
          # cannot cross, so an anchored pattern would never strip a comment
          # from any line but an unterminated last one.
          content = line.sub(/\s*#.*/, "").strip
          next if content.empty?

          key, raw_value = content.split("=", 2)
          next unless raw_value
          next unless key.to_s.strip.casecmp?("datatrainingallowed")

          preference = case raw_value.to_s.strip.downcase
                       when "yes"
                         "allow"
                       when "no"
                         "disallow"
                       end
        end

        return [] unless preference

        [
          build_signal(
            preference,
            "ai-training",
            path: "/*",
            conditions: { "label" => "datatrainingallowed" }
          )
        ]
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Turns comma-separated usage-preference statements (yes/no style and
    # y/n structured style) into allow/disallow signals.
    module UsagePreferences
      # Builds one signal per recognised `label=yes|no` pair in +value+,
      # always scoped to the path "/*".
      def extract_content_signal_signals(value, user_agent: nil)
        parse_yes_no_preferences(value).map do |(label, verb)|
          usage_preference_signal(label, verb, path: "/*", user_agent: user_agent)
        end
      end

      # Handles a rule that may begin with a path ("/docs train-ai=n").
      # Returns [] when the rule carries no statement; the path defaults to
      # "/*" when the rule does not start with one.
      def extract_content_usage_robot_signals(value, user_agent: nil)
        rule_path, statement = parse_content_usage_rule(value)
        return [] if statement.to_s.empty?

        scoped_path = normalize_output_path(rule_path || "/*")
        extract_content_usage_statement_signals(statement, path: scoped_path, user_agent: user_agent)
      end

      # Builds one signal per recognised `label=y|n` pair in +value+, scoped
      # to the given +path+.
      def extract_content_usage_statement_signals(value, path:, user_agent: nil)
        parse_structured_usage_preferences(value).map do |(label, verb)|
          usage_preference_signal(label, verb, path: path, user_agent: user_agent)
        end
      end

      # Orders signals by, in turn: descending path specificity, non-wildcard
      # before wildcard, noun, non-allow before allow, user-agent condition.
      def sort_usage_preference_signals(signals)
        Array(signals).sort_by do |signal|
          specificity_rank = -path_specificity(signal["path"])
          wildcard_rank = wildcard_signal?(signal) ? 1 : 0
          allow_rank = allow_signal?(signal) ? 1 : 0
          agent = signal.dig("conditions", "user-agent").to_s

          [specificity_rank, wildcard_rank, signal_noun(signal), allow_rank, agent]
        end
      end

      private

      # Builds a single signal. A "user-agent" condition is recorded unless
      # the glob is "*"; the original label is recorded under "label" only
      # when it was rewritten to a different noun.
      def usage_preference_signal(label, verb, path:, user_agent: nil)
        agent_glob = robot_user_agent_glob(user_agent)
        noun = usage_preference_noun(label)

        conditions = {}
        conditions["user-agent"] = agent_glob if agent_glob != "*"
        conditions["label"] = label unless noun == label

        build_signal(verb, noun, path: path, conditions: conditions)
      end

      # Maps the "ai-train" / "train-ai" labels onto "ai-training"; any other
      # label is returned unchanged.
      def usage_preference_noun(label)
        %w[ai-train train-ai].include?(label) ? "ai-training" : label
      end

      # Splits a rule into [path, statement]. When the stripped text does not
      # start with "/", the path is nil and the whole text is the statement.
      def parse_content_usage_rule(value)
        rule = value.to_s.strip
        return [nil, rule] unless rule.start_with?("/")

        rule_path, _gap, remainder = rule.partition(/[ \t]+/)
        [rule_path, remainder.strip]
      end

      # Parses "label=yes, other=no" into [[label, verb], ...] pairs with
      # lowercased labels. Entries without "=" or with another value are
      # ignored; a repeated label keeps its first position, last value.
      def parse_yes_no_preferences(value)
        verbs = { "yes" => "allow", "no" => "disallow" }

        collected = value.to_s.split(",").each_with_object({}) do |entry, found|
          label, answer = entry.split("=", 2)
          next if answer.nil?

          verb = verbs[answer.to_s.strip.downcase]
          found[label.to_s.strip.downcase] = verb if verb
        end
        collected.to_a
      end

      # Parses "label=y;param, other=\"n\"" into [[label, verb], ...] pairs.
      # Anything after ";" is dropped from both the label and the value, and
      # one pair of surrounding double quotes is removed from the value.
      def parse_structured_usage_preferences(value)
        verbs = { "y" => "allow", "n" => "disallow" }

        collected = value.to_s.split(",").each_with_object({}) do |entry, found|
          label_part, answer = entry.split("=", 2)
          next if answer.nil?

          label = label_part.to_s.split(";", 2).first.to_s.strip.downcase
          bare = answer.to_s.split(";", 2).first.to_s.strip
          token = bare.delete_prefix('"').delete_suffix('"').downcase

          verb = verbs[token]
          found[label] = verb if verb
        end
        collected.to_a
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "digest"
5
+ require "fileutils"
6
+ require "json"
7
+ require "openssl"
8
+ require "time"
9
+ require "timeout"
10
+ require "uri"
11
+
12
+ module FetchUtil
13
+ class Regulatory # Declares shared constants, lazily loads the helper modules below, and mixes every one of them into this class.
14
+ CACHE_TTL = 86_400 # 86_400 is the number of seconds in 24 hours; presumably a cache lifetime in seconds -- confirm against CacheStore.
15
+ CACHE_VERSION = 2 # NOTE(review): presumably bumped to invalidate entries written by older versions -- confirm against CacheStore.
16
+ DEFAULT_CACHE_PATH = File.expand_path("~/.local/state/fetch_util/regulatory-cache") # "~" is expanded once, when this file is loaded.
17
+ MACHINE_SOURCES = %w[
18
+ robotstxt
19
+ contentsignal
20
+ contentusagerobots
21
+ contentusageheader
22
+ trusttxt
23
+ xrobotstag
24
+ metarobots
25
+ tdmrep
26
+ tdmheaders
27
+ tdmmeta
28
+ tdmpolicy
29
+ ].freeze # Source names grouped under the "machine" class in SOURCE_CLASSES.
30
+ HUMAN_SOURCES = %w[human].freeze # Source names grouped under the "human" class in SOURCE_CLASSES.
31
+ SOURCE_CLASSES = {
32
+ "machine" => MACHINE_SOURCES,
33
+ "human" => HUMAN_SOURCES
34
+ }.freeze # Maps a source-class name to its list of source names.
35
+
36
+ Response = Struct.new(:url, :status, :headers, :body, :redirects, keyword_init: true) # Keyword-initialised value struct, e.g. Response.new(url: ..., status: ...).
37
+ autoload :HttpClient, "fetch_util/regulatory/http_client" # Each autoload defers requiring its file until the constant is first referenced.
38
+ autoload :Orchestration, "fetch_util/regulatory/orchestration"
39
+ autoload :SourceSelection, "fetch_util/regulatory/source_selection"
40
+ autoload :Signals, "fetch_util/regulatory/signals"
41
+ autoload :FetchRecords, "fetch_util/regulatory/fetch_records"
42
+ autoload :CacheStore, "fetch_util/regulatory/cache_store"
43
+ autoload :Robots, "fetch_util/regulatory/robots"
44
+ autoload :RobotGlobs, "fetch_util/regulatory/robot_globs"
45
+ autoload :Headers, "fetch_util/regulatory/headers"
46
+ autoload :Directives, "fetch_util/regulatory/directives"
47
+ autoload :TdmSupport, "fetch_util/regulatory/tdm_support"
48
+ autoload :TdmPage, "fetch_util/regulatory/tdm_page"
49
+ autoload :TrustTxt, "fetch_util/regulatory/trust_txt"
50
+ autoload :UsagePreferences, "fetch_util/regulatory/usage_preferences"
51
+ autoload :Page, "fetch_util/regulatory/page"
52
+ autoload :TdmRep, "fetch_util/regulatory/tdm_rep"
53
+ autoload :TdmPolicy, "fetch_util/regulatory/tdm_policy"
54
+ autoload :Human, "fetch_util/regulatory/human"
55
+
56
+ include Orchestration # The includes below reference the autoloaded constants, so all of the files are required here; a module included later sits earlier in the ancestor chain and wins on a method-name clash.
57
+ include SourceSelection
58
+ include Signals
59
+ include FetchRecords
60
+ include CacheStore
61
+ include Robots
62
+ include RobotGlobs
63
+ include Headers
64
+ include Directives
65
+ include TdmSupport
66
+ include TdmPage
67
+ include TrustTxt
68
+ include UsagePreferences
69
+ include Page
70
+ include TdmRep
71
+ include TdmPolicy
72
+ include Human
73
+ end
74
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "time"
5
+
6
module FetchUtil
  # Appends tab-separated, timestamped entries to a plain-text log file.
  class RequestLog
    DEFAULT_PATH = File.expand_path("~/.local/state/fetch_util/requests.log")

    attr_reader :path

    # +path+ defaults to the FETCH_UTIL_REQUEST_LOG environment variable and,
    # when that is unset, to DEFAULT_PATH.
    def initialize(path: ENV.fetch("FETCH_UTIL_REQUEST_LOG", DEFAULT_PATH))
      @path = path
    end

    # Writes one line: "<UTC ISO-8601 time>\t<entry>" followed, when
    # +duration+ is given, by "\t<duration to two decimals>s". The parent
    # directory is created on demand. Returns the log path.
    def append(entry, duration: nil)
      timestamp = Time.now.utc.iso8601
      columns = [timestamp, entry.to_s]
      columns << "#{format("%.2f", duration)}s" if duration
      record = columns.join("\t")

      FileUtils.mkdir_p(File.dirname(path))
      File.open(path, "a") do |log|
        log.puts(record)
      end

      path
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  # Read-only record describing one fetched page. Every field has a reader
  # of the same name, and #to_h exposes them all under symbol keys.
  class Result
    # Field names in the order they are stored and reported by #to_h.
    FIELDS = %i[
      url final_url title byline excerpt site_name published_time
      canonical_url language html markdown metadata reader_mode
      content_type suspect warnings content_completeness_ratio
      content_format paywall_state
    ].freeze
    private_constant :FIELDS

    attr_reader(*FIELDS)

    # All arguments are keywords; only the last three are optional.
    # +metadata+ and +warnings+ are frozen in place (the caller's objects are
    # not copied); +content_format+ and +paywall_state+ are frozen when
    # present.
    def initialize(url:, final_url:, title:, byline:, excerpt:, site_name:, published_time:,
                   canonical_url:, language:, html:, markdown:, metadata:, reader_mode:, content_type:, suspect:, warnings:,
                   content_completeness_ratio: 1.0, content_format: nil, paywall_state: nil)
      stored = {
        url: url,
        final_url: final_url,
        title: title,
        byline: byline,
        excerpt: excerpt,
        site_name: site_name,
        published_time: published_time,
        canonical_url: canonical_url,
        language: language,
        html: html,
        markdown: markdown,
        metadata: metadata.freeze,
        reader_mode: reader_mode,
        content_type: content_type,
        suspect: suspect,
        warnings: warnings.freeze,
        content_completeness_ratio: content_completeness_ratio,
        content_format: content_format&.freeze,
        paywall_state: paywall_state&.freeze
      }
      stored.each { |field, value| instance_variable_set("@#{field}", value) }
    end

    # Returns a new Hash of every field, keyed by symbol, read through the
    # public readers.
    def to_h
      FIELDS.each_with_object({}) { |field, snapshot| snapshot[field] = public_send(field) }
    end
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Searcher
    # Private predicates that weed out search results pointing back at a
    # search engine itself or at pages this filter treats as low value.
    #
    # Relies on host_for, path_for and compact_text being provided by the
    # including class. +title+ and +snippet+ may be nil throughout.
    module ResultFiltering
      private

      # True when the result is a search engine's own page (home, consent
      # wall, results shell) rather than a real hit. A result whose host
      # cannot be determined is never flagged.
      def search_engine_self_link?(title, url, snippet)
        host, path = result_location(url)
        text = compact_text([title, snippet].compact.join(" ")).downcase
        return false if host.empty?

        title = title.to_s # a nil title must not break the string predicates below
        return true if duckduckgo_self_link?(host, path, title, text)
        return true if google_self_link?(host, path, title, text)
        return true if search_shell_result?(host, path, title)

        false
      end

      # True when the result should be dropped as noise: PDF-like documents,
      # ad redirects, translation proxies, and listing/profile pages of a
      # few specific sites. A result whose host cannot be determined is kept.
      def low_value_result?(title, url, snippet)
        host, path = result_location(url)
        return false if host.empty?

        title = title.to_s
        return true if non_html_document_url?(url)
        return true if host == "duckduckgo.com" && path == "/y.js"
        return true if host.start_with?("translate.google.")
        return true if facebook_noise_result?(host, path, title, snippet)
        return true if pinterest_noise_result?(host, path, title)
        return true if result_host_on?(host, "threads.net") || result_host_on?(host, "threads.com")
        return true if tiktok_noise_result?(host, path, snippet)
        return true if result_host_on?(host, "walmart.com") && path.match?(%r{\A/(search|browse|c|cp|b)\b})

        false
      end

      # [host, path] for +url+; the host is "" when host_for returns nil.
      def result_location(url)
        [host_for(url).to_s, path_for(url)]
      end

      # True when +host+ is +domain+ itself or one of its subdomains.
      # A bare end_with? would also accept unrelated registrations that
      # merely share a suffix (e.g. "plumbing.com" for "bing.com").
      def result_host_on?(host, domain)
        host == domain || host.end_with?(".#{domain}")
      end

      # True when (already lowercased) +text+ contains one of the search-page
      # action phrases as whole words.
      def search_action_text?(text)
        /\b(redo search without this site|block this site from all results|go to google home|duckduckgo)\b/.match?(text)
      end

      # DuckDuckGo home page or its /html results page linking to itself.
      def duckduckgo_self_link?(host, path, title, text)
        return false unless result_host_on?(host, "duckduckgo.com")

        title = title.to_s
        (path == "/" && (title.casecmp?("DuckDuckGo") || search_action_text?(text))) ||
          (path.start_with?("/html") && search_action_text?(text))
      end

      # Google search/preferences pages on google.com, plus the home page
      # shell and /intl/ about pages on bare google.<tld> hosts.
      def google_self_link?(host, path, title, text)
        if result_host_on?(host, "google.com")
          return true if path == "/" && search_action_text?(text)
          return true if %w[/search /preferences /advanced_search /setprefs].include?(path)
        end

        return false unless host.match?(/\Agoogle\.[a-z.]+\z/)

        google_home_shell = /before you continue to google|go to google home/.match?(text) ||
                            title.to_s.casecmp?("Before you continue to Google")
        # The alternatives are grouped so the word boundaries apply to each
        # phrase; ungrouped, "products\b" alone would match "byproducts".
        (%w[/ /webhp].include?(path) && google_home_shell) ||
          (path.start_with?("/intl/") && /\b(?:google apps|about google|products)\b/.match?(text))
      end

      # Results page of Brave Search, Bing or Ecosia whose title is just the
      # engine's name.
      def search_shell_result?(host, path, title)
        title = title.to_s
        return true if result_host_on?(host, "search.brave.com") && path == "/search" && title.casecmp?("Brave Search")
        return true if result_host_on?(host, "bing.com") && path == "/search" && title.casecmp?("Bing")
        return true if result_host_on?(host, "ecosia.org") && path == "/search" && title.casecmp?("Ecosia")

        false
      end

      # Facebook group/event/watch/share/reel/photo paths, titles ending in
      # " - Facebook" or containing "(@handle)", or snippets that lead with a
      # follower/like/member count.
      def facebook_noise_result?(host, path, title, snippet)
        return false unless result_host_on?(host, "facebook.com")

        title = title.to_s
        path.match?(%r{\A/(groups|events|watch|share|reel|photo)\b}) ||
          title.end_with?(" - Facebook") ||
          title.match?(/\(@[^)]+\)/) ||
          snippet.to_s.match?(/\b\d+[,\dKMB+.]*\s*(followers?|likes?|members?)\b/i)
      end

      # Any Pinterest page outside /search/, or one titled "... - Pinterest".
      # The host test is a substring match so that country domains
      # (pinterest.<tld>) are covered.
      def pinterest_noise_result?(host, path, title)
        return false unless host.include?("pinterest.")

        !path.match?(%r{\A/search/}) || title.to_s.end_with?(" - Pinterest")
      end

      # TikTok shop hosts, individual video pages, or snippets mentioning
      # "All Categories".
      def tiktok_noise_result?(host, path, snippet)
        return false unless result_host_on?(host, "tiktok.com")

        host.start_with?("shop.") ||
          path.match?(%r{\A/@[^/]+/video/}) ||
          snippet.to_s.match?(/\bAll Categories\b/i)
      end

      # True when the URL looks like a PDF: a ".pdf" path, a "/pdf" path
      # segment, or a format=pdf / download=pdf query parameter.
      def non_html_document_url?(url)
        normalized = url.to_s.downcase
        path = path_for(normalized).downcase

        path.end_with?(".pdf") || path.match?(%r{/pdf(?:/|\z)}) || normalized.match?(/[?&](?:format|download)=pdf\b/)
      end
    end
  end
end