fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module TdmRep
|
|
6
|
+
# Fetches the TDMRep record for +requested_uri+ through +fetch_record+.
#
# The record is keyed "tdmrep:<origin_key>" and read from the URI returned by
# +tdmrep_uri+. The block converts the fetched body into a hash with
# "signals" (ordered by +sort_specificity_signals+) and "policies"; the
# fallback handed to +fetch_record+ carries empty lists for both keys.
def tdmrep_record(requested_uri)
  record_key = "tdmrep:#{origin_key(requested_uri)}"
  source_uri = tdmrep_uri(requested_uri)
  empty_record = { "signals" => [], "policies" => [] }

  fetch_record(record_key, source_uri, fallback: empty_record) do |body|
    found_signals, found_policies = extract_tdmrep_signals(body)
    { "signals" => sort_specificity_signals(found_signals), "policies" => found_policies }
  end
end
|
|
19
|
+
|
|
20
|
+
# Parses a TDMRep JSON document and returns +[signals, policies]+.
#
# Only a top-level JSON array is accepted; anything else (or unparsable JSON)
# yields +[[], []]+. Within the array, non-Hash entries, entries with an empty
# "location", and entries whose "tdm-reservation" is not "0" or "1" are
# skipped. Policy references are de-duplicated before being returned.
def extract_tdmrep_signals(body)
  rules = JSON.parse(body.to_s)
  return [[], []] unless rules.is_a?(Array)

  collected_signals = []
  collected_policies = []

  rules.grep(Hash).each do |entry|
    location = string_value(entry, "location")
    reservation = string_value(entry, "tdm-reservation")
    policy_url = string_value(entry, "tdm-policy")
    next if location.empty? || !%w[0 1].include?(reservation)

    entry_signals, entry_policies = extract_tdm_value_signals(
      reservation: reservation,
      policy_url: policy_url,
      path: normalize_output_path(location)
    )
    collected_signals.concat(entry_signals)
    collected_policies.concat(entry_policies)
  end

  [collected_signals, dedupe_policy_refs(collected_policies)]
rescue JSON::ParserError
  [[], []]
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module TdmSupport
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# True when +value+, reduced by +odrl_token+, is the token "mine".
def tdm_policy_action?(value)
  token = odrl_token(value)
  token == "mine"
end
|
|
11
|
+
|
|
12
|
+
# Converts a duty (a Hash with an "action" entry, or a bare value) into a
# dash-separated name.
#
# Returns nil when the action is empty. "obtainconsent" maps to
# "obtain-consent" and "compensate" is kept as is; any other token is
# slugified by collapsing runs of non [a-z0-9] characters into "-".
def duty_name(value)
  action = value.is_a?(Hash) ? value["action"].to_s : value.to_s
  return nil if action.empty?

  token = odrl_token(action).downcase
  case token
  when "obtainconsent"
    "obtain-consent"
  when "compensate"
    "compensate"
  else
    # gsub, not sub: the alternation has to strip dashes at BOTH ends. With
    # sub only the first match (the leading run) was removed, leaving a
    # trailing "-" on tokens such as "(obtain consent)".
    token.gsub(/[^a-z0-9]+/, "-").gsub(/\A-+|-+\z/, "")
  end
end
|
|
26
|
+
|
|
27
|
+
# Looks through the permission's "constraint" entries for the first one whose
# left operand is "purpose" and whose operator is "eq" (both compared after
# +odrl_token+) and whose right operand is "research" or "non-research".
#
# Returns that purpose string, or nil when no constraint qualifies.
def permission_purpose(permission)
  array_value(permission, "constraint").each do |entry|
    next unless entry.is_a?(Hash)

    purpose_constraint = odrl_token(entry["leftOperand"]) == "purpose" &&
                         odrl_token(entry["operator"]) == "eq"
    next unless purpose_constraint

    case odrl_token(entry["rightOperand"])
    when "research" then return "research"
    when "non-research" then return "non-research"
    end
  end

  nil
end
|
|
40
|
+
|
|
41
|
+
# Derives an output path from a policy target.
#
# HTTP(S) URLs are reduced with +request_target+; strings starting with "/"
# are used directly. Both go through +normalize_output_path+. Blank values,
# other kinds of targets and unparsable URIs give nil.
def policy_target_path(value)
  candidate = value.to_s.strip
  return nil if candidate.empty?

  parsed = URI.parse(candidate)
  if parsed.is_a?(URI::HTTP)
    normalize_output_path(request_target(parsed))
  elsif candidate.start_with?("/")
    normalize_output_path(candidate)
  end
rescue URI::InvalidURIError
  nil
end
|
|
53
|
+
|
|
54
|
+
# Reads +key+ from +hash+ and returns it as a whitespace-trimmed String
# (a missing key becomes "").
def string_value(hash, key)
  raw = hash[key]
  raw.to_s.strip
end
|
|
57
|
+
|
|
58
|
+
# Reads +key+ from +hash+ and always answers with an Array: an Array value is
# returned untouched, nil becomes [], and any other value (including a Hash)
# is wrapped in a single-element Array.
def array_value(hash, key)
  case (found = hash[key])
  when Array then found
  when nil then []
  else [found]
  end
end
|
|
65
|
+
|
|
66
|
+
# Builds a { "url" => ..., "path" => ... } reference for a policy URL.
#
# The trimmed URL is passed to +parse_http_uri+ purely as a validity check
# (its result is discarded); an ArgumentError raised there, or a blank URL,
# yields nil.
def policy_ref(url, path)
  candidate = url.to_s.strip
  unless candidate.empty?
    parse_http_uri(candidate)
    { "url" => candidate, "path" => path }
  end
rescue ArgumentError
  nil
end
|
|
75
|
+
|
|
76
|
+
# Drops nil entries and repeated references, keeping the first occurrence of
# every ("url", "path") pair in its original position.
def dedupe_policy_refs(policy_refs)
  Array(policy_refs).compact.uniq { |ref| [ref["url"], ref["path"]] }
end
|
|
88
|
+
|
|
89
|
+
# Lower-cases and trims +value+, then returns the part after the last "#" or
# ":" separator ("" when nothing is left).
def odrl_token(value)
  segments = value.to_s.strip.downcase.split(/[#:]/)
  segments.last.to_s
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module TrustTxt
|
|
6
|
+
# Fetches the trust.txt record for +requested_uri+ through +fetch_record+.
#
# The record is keyed "trusttxt:<origin_key>" and two candidate URIs are
# supplied (+trusttxt_uri+ first, then +trusttxt_well_known_uri+). The block
# turns the body into { "signals" => [...] }, ordered by
# +sort_usage_preference_signals+; the fallback carries an empty list.
def trusttxt_record(requested_uri)
  record_key = "trusttxt:#{origin_key(requested_uri)}"
  candidate_uris = [trusttxt_uri(requested_uri), trusttxt_well_known_uri(requested_uri)]

  fetch_record(record_key, candidate_uris, fallback: { "signals" => [] }) do |body|
    parsed_signals = extract_trusttxt_signals(body)
    { "signals" => sort_usage_preference_signals(parsed_signals) }
  end
end
|
|
16
|
+
|
|
17
|
+
# Extracts the AI-training preference from a trust.txt body.
#
# Scans "key=value" lines for the (case-insensitive) key
# "datatrainingallowed"; "yes" maps to "allow" and "no" to "disallow". The
# last matching line wins, and a matching line with any other value clears the
# preference. Returns a one-element Array built with +build_signal+ for path
# "/*", or [] when no preference remains.
def extract_trusttxt_signals(body)
  preference = nil

  body.to_s.gsub("\r\n", "\n").gsub("\r", "\n").each_line do |line|
    # each_line keeps the trailing "\n" and "." never matches a newline, so
    # the comment pattern must not be anchored with \z: anchored, it only
    # matched an unterminated final line and inline comments elsewhere leaked
    # into the value ("no # note" was then not recognised as "no").
    content = line.sub(/\s*#.*/, "").strip
    next if content.empty?

    key, raw_value = content.split("=", 2)
    next unless raw_value
    next unless key.to_s.strip.casecmp?("datatrainingallowed")

    preference = case raw_value.strip.downcase
                 when "yes" then "allow"
                 when "no" then "disallow"
                 end
  end

  return [] unless preference

  [
    build_signal(
      preference,
      "ai-training",
      path: "/*",
      conditions: { "label" => "datatrainingallowed" }
    )
  ]
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module UsagePreferences
|
|
6
|
+
# Builds one usage-preference signal per (label, verb) pair parsed from a
# yes/no style value. Every signal is scoped to the path "/*".
def extract_content_signal_signals(value, user_agent: nil)
  pairs = parse_yes_no_preferences(value)
  pairs.map { |(label, verb)| usage_preference_signal(label, verb, path: "/*", user_agent: user_agent) }
end
|
|
11
|
+
|
|
12
|
+
# Turns a content-usage rule into signals.
#
# The rule is split by +parse_content_usage_rule+ into an optional path and a
# statement. A nil or empty statement gives []; otherwise the statement is
# expanded for the normalized path, with "/*" used when the rule names none.
def extract_content_usage_robot_signals(value, user_agent: nil)
  rule_path, statement = parse_content_usage_rule(value)
  return [] if statement.nil? || statement.empty?

  target_path = normalize_output_path(rule_path || "/*")
  extract_content_usage_statement_signals(statement, path: target_path, user_agent: user_agent)
end
|
|
18
|
+
|
|
19
|
+
# Builds one usage-preference signal per (label, verb) pair parsed from a
# structured usage statement, all scoped to the given +path+.
def extract_content_usage_statement_signals(value, path:, user_agent: nil)
  pairs = parse_structured_usage_preferences(value)
  pairs.map { |(label, verb)| usage_preference_signal(label, verb, path: path, user_agent: user_agent) }
end
|
|
24
|
+
|
|
25
|
+
# Orders signals by a composite key, compared element by element:
# descending +path_specificity+, non-wildcard before wildcard, noun,
# non-allow before allow, and finally the "user-agent" condition as a string.
def sort_usage_preference_signals(signals)
  ordering = lambda do |signal|
    [
      -path_specificity(signal["path"]),
      wildcard_signal?(signal) ? 1 : 0,
      signal_noun(signal),
      allow_signal?(signal) ? 1 : 0,
      signal.dig("conditions", "user-agent").to_s
    ]
  end

  Array(signals).sort_by(&ordering)
end
|
|
36
|
+
|
|
37
|
+
private
|
|
38
|
+
|
|
39
|
+
# Builds a single signal for a usage-preference label.
#
# The conditions hash gets a "user-agent" entry unless the glob from
# +robot_user_agent_glob+ is "*", and a "label" entry when
# +usage_preference_noun+ changed the label into a different noun.
def usage_preference_signal(label, verb, path:, user_agent: nil)
  agent_glob = robot_user_agent_glob(user_agent)
  noun = usage_preference_noun(label)

  conditions = {}
  conditions["user-agent"] = agent_glob unless agent_glob == "*"
  conditions["label"] = label unless noun == label

  build_signal(verb, noun, path: path, conditions: conditions)
end
|
|
48
|
+
|
|
49
|
+
# Maps the labels "ai-train" and "train-ai" to the noun "ai-training"; every
# other label is returned unchanged.
def usage_preference_noun(label)
  %w[ai-train train-ai].include?(label) ? "ai-training" : label
end
|
|
57
|
+
|
|
58
|
+
# Splits a content-usage rule into +[path, statement]+.
#
# A rule that starts with "/" is split at the first run of spaces or tabs;
# the leading part is the path and the trimmed rest is the statement. Any
# other rule has no path (nil) and is taken as the statement in full.
def parse_content_usage_rule(value)
  stripped = value.to_s.strip
  if stripped.start_with?("/")
    rule_path, remainder = stripped.split(/[ \t]+/, 2)
    [rule_path, remainder.to_s.strip]
  else
    [nil, stripped]
  end
end
|
|
65
|
+
|
|
66
|
+
# Parses a comma-separated list of "label=yes|no" entries into an Array of
# [label, verb] pairs, where the verb is "allow" or "disallow".
#
# Labels are trimmed and lower-cased; a repeated label keeps its first
# position but takes the last verb. Malformed entries are skipped: those
# without "=", those with a value other than yes/no, and those whose label is
# empty (previously "=yes" produced a preference keyed by the empty string).
def parse_yes_no_preferences(value)
  verbs = { "yes" => "allow", "no" => "disallow" }
  preferences = {}

  value.to_s.split(",").each do |entry|
    label, raw_value = entry.split("=", 2)
    next unless raw_value

    name = label.to_s.strip.downcase
    verb = verbs[raw_value.strip.downcase]
    next if name.empty? || verb.nil?

    preferences[name] = verb
  end

  preferences.to_a
end
|
|
84
|
+
|
|
85
|
+
# Parses a comma-separated list of "label=y|n" entries, where label and value
# may each carry ";"-separated parameters and the value may be double-quoted.
# Returns an Array of [label, verb] pairs, the verb being "allow" or
# "disallow".
#
# Labels are trimmed and lower-cased; a repeated label keeps its first
# position but takes the last verb. Malformed entries are skipped: those
# without "=", those with a token other than y/n, and those whose label is
# empty (previously "=y" produced a preference keyed by the empty string).
def parse_structured_usage_preferences(value)
  verbs = { "y" => "allow", "n" => "disallow" }
  preferences = {}

  value.to_s.split(",").each do |entry|
    label_part, raw_value = entry.split("=", 2)
    next unless raw_value

    label = label_part.to_s.split(";", 2).first.to_s.strip.downcase
    token = raw_value.split(";", 2).first.to_s.strip.delete_prefix('"').delete_suffix('"').downcase
    verb = verbs[token]
    next if label.empty? || verb.nil?

    preferences[label] = verb
  end

  preferences.to_a
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "cgi"
|
|
4
|
+
require "digest"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "json"
|
|
7
|
+
require "openssl"
|
|
8
|
+
require "time"
|
|
9
|
+
require "timeout"
|
|
10
|
+
require "uri"
|
|
11
|
+
|
|
12
|
+
module FetchUtil
|
|
13
|
+
# Namespace class for the regulatory lookups. The class body only declares
# constants, registers autoloads for the files under fetch_util/regulatory/
# and mixes the resulting modules in; all behaviour comes from those modules.
class Regulatory
  # Cache lifetime. NOTE(review): 86_400 is presumably seconds (one day) —
  # confirm against the cache store.
  CACHE_TTL = 86_400
  # NOTE(review): presumably bumped to invalidate older cache entries — confirm.
  CACHE_VERSION = 2
  # "~" is expanded once, when this file is loaded.
  DEFAULT_CACHE_PATH = File.expand_path("~/.local/state/fetch_util/regulatory-cache")
  # Source identifiers grouped under the "machine" class in SOURCE_CLASSES.
  MACHINE_SOURCES = %w[
    robotstxt
    contentsignal
    contentusagerobots
    contentusageheader
    trusttxt
    xrobotstag
    metarobots
    tdmrep
    tdmheaders
    tdmmeta
    tdmpolicy
  ].freeze
  # Source identifiers grouped under the "human" class in SOURCE_CLASSES.
  HUMAN_SOURCES = %w[human].freeze
  # Source class name => list of source identifiers.
  SOURCE_CLASSES = {
    "machine" => MACHINE_SOURCES,
    "human" => HUMAN_SOURCES
  }.freeze

  # Keyword-initialised value holder for an HTTP response.
  Response = Struct.new(:url, :status, :headers, :body, :redirects, keyword_init: true)
  # Each constant below is loaded from its file on first reference.
  autoload :HttpClient, "fetch_util/regulatory/http_client"
  autoload :Orchestration, "fetch_util/regulatory/orchestration"
  autoload :SourceSelection, "fetch_util/regulatory/source_selection"
  autoload :Signals, "fetch_util/regulatory/signals"
  autoload :FetchRecords, "fetch_util/regulatory/fetch_records"
  autoload :CacheStore, "fetch_util/regulatory/cache_store"
  autoload :Robots, "fetch_util/regulatory/robots"
  autoload :RobotGlobs, "fetch_util/regulatory/robot_globs"
  autoload :Headers, "fetch_util/regulatory/headers"
  autoload :Directives, "fetch_util/regulatory/directives"
  autoload :TdmSupport, "fetch_util/regulatory/tdm_support"
  autoload :TdmPage, "fetch_util/regulatory/tdm_page"
  autoload :TrustTxt, "fetch_util/regulatory/trust_txt"
  autoload :UsagePreferences, "fetch_util/regulatory/usage_preferences"
  autoload :Page, "fetch_util/regulatory/page"
  autoload :TdmRep, "fetch_util/regulatory/tdm_rep"
  autoload :TdmPolicy, "fetch_util/regulatory/tdm_policy"
  autoload :Human, "fetch_util/regulatory/human"

  # Referencing a module here triggers its autoload. The order matters: a
  # module included later sits earlier in the ancestor chain, so its methods
  # take precedence over same-named methods of modules included before it.
  include Orchestration
  include SourceSelection
  include Signals
  include FetchRecords
  include CacheStore
  include Robots
  include RobotGlobs
  include Headers
  include Directives
  include TdmSupport
  include TdmPage
  include TrustTxt
  include UsagePreferences
  include Page
  include TdmRep
  include TdmPolicy
  include Human
end
|
|
74
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "time"
|
|
5
|
+
|
|
6
|
+
module FetchUtil
|
|
7
|
+
# Appends tab-separated request entries to a log file.
class RequestLog
  DEFAULT_PATH = File.expand_path("~/.local/state/fetch_util/requests.log")

  attr_reader :path

  # path: target log file; defaults to the FETCH_UTIL_REQUEST_LOG environment
  # variable when it is set, otherwise to DEFAULT_PATH.
  def initialize(path: ENV.fetch("FETCH_UTIL_REQUEST_LOG", DEFAULT_PATH))
    @path = path
  end

  # Writes one line "<UTC ISO 8601 timestamp>\t<entry>" to the log, followed
  # by "\t<duration with two decimals>s" when +duration+ is given. The parent
  # directory is created on demand. Returns the log path.
  def append(entry, duration: nil)
    FileUtils.mkdir_p(File.dirname(path))

    stamped = "#{Time.now.utc.iso8601}\t#{entry}"
    record = duration ? "#{stamped}\t#{format("%.2f", duration)}s" : stamped

    File.open(path, "a") { |io| io.puts(record) }
    path
  end
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
# Read-only holder for the outcome of a fetch. All fields are supplied as
# keywords at construction time and exposed through readers and #to_h.
class Result
  attr_reader :url,
              :final_url,
              :title,
              :byline,
              :excerpt,
              :site_name,
              :published_time,
              :canonical_url,
              :language,
              :html,
              :markdown,
              :metadata,
              :reader_mode,
              :content_type,
              :suspect,
              :warnings,
              :content_completeness_ratio,
              :content_format,
              :paywall_state

  # Stores every keyword as given. +metadata+ and +warnings+ are frozen in
  # place (the caller's objects become frozen); +content_format+ and
  # +paywall_state+ are frozen when they are not nil.
  def initialize(url:, final_url:, title:, byline:, excerpt:, site_name:, published_time:,
                 canonical_url:, language:, html:, markdown:, metadata:, reader_mode:, content_type:,
                 suspect:, warnings:, content_completeness_ratio: 1.0, content_format: nil,
                 paywall_state: nil)
    @url, @final_url, @canonical_url = url, final_url, canonical_url
    @title, @byline, @excerpt = title, byline, excerpt
    @site_name, @published_time, @language = site_name, published_time, language
    @html, @markdown = html, markdown
    @metadata = metadata.freeze
    @reader_mode, @content_type, @suspect = reader_mode, content_type, suspect
    @warnings = warnings.freeze
    @content_completeness_ratio = content_completeness_ratio
    @content_format = content_format&.freeze
    @paywall_state = paywall_state&.freeze
  end

  # Returns a new Hash with symbol keys mapping every field to its reader value.
  def to_h
    fields = %i[
      url final_url title byline excerpt site_name published_time canonical_url language
      html markdown metadata reader_mode content_type suspect warnings
      content_completeness_ratio content_format paywall_state
    ]
    fields.each_with_object({}) { |field, hash| hash[field] = public_send(field) }
  end
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Searcher
|
|
5
|
+
module ResultFiltering
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# True when a search result points back at a search engine's own pages
# (DuckDuckGo, Google, or one of the search shells checked by
# +search_shell_result?+). Results without a host are never self links.
def search_engine_self_link?(title, url, snippet)
  host, path = result_location(url)
  text = compact_text([title, snippet].compact.join(" ")).downcase
  return false if host.empty?

  self_link = duckduckgo_self_link?(host, path, title, text) ||
              google_self_link?(host, path, title, text) ||
              search_shell_result?(host, path, title)
  self_link ? true : false
end
|
|
19
|
+
|
|
20
|
+
# True when a search result is considered low value: non-HTML documents,
# DuckDuckGo's /y.js endpoint, translate.google.* hosts, Facebook / Pinterest
# / TikTok noise, Threads hosts, and Walmart listing paths. Results without a
# host are never flagged.
def low_value_result?(title, url, snippet)
  host, path = result_location(url)
  return false if host.empty?

  flagged = non_html_document_url?(url) ||
            (host == "duckduckgo.com" && path == "/y.js") ||
            host.start_with?("translate.google.") ||
            facebook_noise_result?(host, path, title, snippet) ||
            pinterest_noise_result?(host, path, title) ||
            host.end_with?("threads.net") || host.end_with?("threads.com") ||
            tiktok_noise_result?(host, path, snippet) ||
            (host.end_with?("walmart.com") && path.match?(%r{\A/(search|browse|c|cp|b)\b}))
  flagged ? true : false
end
|
|
34
|
+
|
|
35
|
+
# Returns +[host, path]+ for +url+; the host from +host_for+ is coerced to a
# String, the path is whatever +path_for+ returns.
def result_location(url)
  host = host_for(url).to_s
  path = path_for(url)
  [host, path]
end
|
|
38
|
+
|
|
39
|
+
# True when +text+ contains, as whole words, one of the search-engine action
# phrases listed in the pattern. The match is case-sensitive.
def search_action_text?(text)
  action_phrases = /\b(redo search without this site|block this site from all results|go to google home|duckduckgo)\b/
  action_phrases.match?(text)
end
|
|
42
|
+
|
|
43
|
+
# Truthy when the result is DuckDuckGo's own home page (path "/", titled
# "DuckDuckGo" or carrying search-action text) or one of its /html pages
# carrying search-action text.
#
# +title+ may be nil (the caller compacts title and snippet before joining
# them), so it is coerced with to_s before being compared; previously a nil
# title raised NoMethodError for the home page.
def duckduckgo_self_link?(host, path, title, text)
  return false unless host.end_with?("duckduckgo.com")

  home_page = path == "/" && (title.to_s.casecmp?("DuckDuckGo") || search_action_text?(text))
  home_page || (path.start_with?("/html") && search_action_text?(text))
end
|
|
49
|
+
|
|
50
|
+
# Truthy when the result is one of Google's own pages.
#
# For hosts ending in "google.com": the home page with search-action text, or
# one of the search/preferences paths. For hosts of the form google.<tld>: the
# home or /webhp page showing the consent/home shell, or an /intl/ page whose
# text mentions "google apps", "about google" or "products".
#
# +title+ may be nil (the caller compacts title and snippet before joining
# them), so it is coerced with to_s; previously a nil title raised
# NoMethodError.
def google_self_link?(host, path, title, text)
  if host.end_with?("google.com")
    return true if path == "/" && search_action_text?(text)
    return true if %w[/search /preferences /advanced_search /setprefs].include?(path)
  end

  return false unless host.match?(/\Agoogle\.[a-z.]+\z/)

  google_home_shell = /before you continue to google|go to google home/.match?(text) ||
                      title.to_s.casecmp?("Before you continue to Google")
  # The alternatives are grouped so that \b applies to each of them. Ungrouped,
  # the pattern read (\bgoogle apps)|(about google)|(products\b) and matched
  # inside longer words such as "byproducts".
  (%w[/ /webhp].include?(path) && google_home_shell) ||
    (path.start_with?("/intl/") && /\b(?:google apps|about google|products)\b/.match?(text))
end
|
|
62
|
+
|
|
63
|
+
# True when the result is the bare /search shell page of Brave Search, Bing
# or Ecosia, recognised by host suffix plus the engine's name as the title
# (compared case-insensitively).
#
# +title+ may be nil (the caller compacts title and snippet before joining
# them), so it is coerced with to_s; previously a nil title raised
# NoMethodError once host and path matched.
def search_shell_result?(host, path, title)
  return false unless path == "/search"

  shell_titles = {
    "search.brave.com" => "Brave Search",
    "bing.com" => "Bing",
    "ecosia.org" => "Ecosia"
  }
  heading = title.to_s
  shell_titles.any? { |suffix, engine| host.end_with?(suffix) && heading.casecmp?(engine) }
end
|
|
70
|
+
|
|
71
|
+
# True when a facebook.com result looks like noise: a groups / events /
# watch / share / reel / photo path, a title ending in " - Facebook" or
# containing an "(@handle)", or a snippet quoting follower / like / member
# counts.
#
# +title+ is coerced with to_s, just as +snippet+ already was; previously a
# nil title raised NoMethodError whenever the path check did not match.
def facebook_noise_result?(host, path, title, snippet)
  return false unless host.end_with?("facebook.com")

  heading = title.to_s
  path.match?(%r{\A/(groups|events|watch|share|reel|photo)\b}) ||
    heading.end_with?(" - Facebook") ||
    heading.match?(/\(@[^)]+\)/) ||
    snippet.to_s.match?(/\b\d+[,\dKMB+.]*\s*(followers?|likes?|members?)\b/i)
end
|
|
79
|
+
|
|
80
|
+
# True when a Pinterest result is treated as noise: any path that does not
# start with "/search/", or any title ending in " - Pinterest".
#
# +title+ may be nil (the caller compacts title and snippet before joining
# them), so it is coerced with to_s; previously a nil title raised
# NoMethodError for /search/ paths.
def pinterest_noise_result?(host, path, title)
  return false unless host.include?("pinterest.")

  !path.match?(%r{\A/search/}) || title.to_s.end_with?(" - Pinterest")
end
|
|
85
|
+
|
|
86
|
+
# True when a tiktok.com result is treated as noise: a shop.* host, a
# /@user/video/ path, or a snippet containing "All Categories"
# (case-insensitive).
def tiktok_noise_result?(host, path, snippet)
  return false unless host.end_with?("tiktok.com")
  return true if host.start_with?("shop.")
  return true if path.match?(%r{\A/@[^/]+/video/})

  snippet.to_s.match?(/\bAll Categories\b/i)
end
|
|
93
|
+
|
|
94
|
+
# True when +url+ looks like a PDF: its path ends in ".pdf", contains a
# "/pdf" segment, or its query string asks for format=pdf / download=pdf.
# The comparison is case-insensitive.
def non_html_document_url?(url)
  lowered_url = url.to_s.downcase
  lowered_path = path_for(lowered_url).downcase
  return true if lowered_path.end_with?(".pdf")
  return true if lowered_path.match?(%r{/pdf(?:/|\z)})

  lowered_url.match?(/[?&](?:format|download)=pdf\b/)
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|