fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Human
|
|
6
|
+
# Phrase patterns used to spot plain-language usage statements in page text.
#
# Every entry is a Hash with string keys:
#   "verb"  - "disallow" or "allow"
#   "noun"  - the usage category the sentence is about
#   "regex" - case-insensitive pattern: a lead phrase, up to 120 arbitrary
#             characters, then a subject phrase, each on word boundaries
#
# Entries are listed disallow-first, and within each verb in the order
# text-and-data-mining, ai-training, index, fetch. The lead and subject
# fragments are declared once and combined, so the eight patterns cannot
# drift apart.
HUMAN_PATTERNS = lambda do
  # Lead phrases keyed by the verb they signal.
  leads = {
    "disallow" => '(?:do\s+not|does\s+not|must\s+not|may\s+not|shall\s+not|not\s+permit(?:ted)?|' \
                  'not\s+allow(?:ed)?|forbid(?:den|s)?|prohibit(?:ed|s)?)',
    "allow" => '(?:allow(?:ed|s)?|permit(?:ted|s)?|may|can)'
  }

  # Subject phrases keyed by the noun they map to.
  subjects = {
    "text-and-data-mining" => '(?:text\s+and\s+data\s+mining|text-and-data-mining|tdm)',
    "ai-training" => '(?:ai\s+training|training\s+(?:of|for)\s+(?:ai|models?)|machine\s+learning|' \
                     'large\s+language\s+models?|llm|generative\s+ai)',
    "index" => '(?:index(?:ing)?|search\s+engine(?:s)?)',
    "fetch" => '(?:crawl(?:ing)?|scrap(?:e|ing)|fetch(?:ing)?|bot(?:s)?)'
  }

  leads.flat_map do |verb, lead|
    subjects.map do |noun, subject|
      { "verb" => verb, "noun" => noun, "regex" => /\b#{lead}\b.{0,120}\b#{subject}\b/i }
    end
  end
end.call.freeze
|
|
63
|
+
|
|
64
|
+
# Scans the readable text of +body+ for plain-language usage statements.
#
# For each HUMAN_PATTERNS entry, the first text chunk matching the entry's
# regex becomes the evidence for one signal. For "allow" entries, chunks
# that also contain a negation (per negative_human_chunk?) are skipped while
# searching, so an earlier "may not ..." sentence cannot hide a later,
# genuinely permissive one.
#
# @param body [#to_s] page body handed to human_text_chunks
# @param path [String, nil] path recorded on every produced signal
# @return [Array<Hash>] signals from build_signal, each carrying the matched
#   chunk under conditions["evidence"]
def extract_human_signals(body, path:)
  chunks = human_text_chunks(body)
  seen = {}

  HUMAN_PATTERNS.filter_map do |entry|
    permissive = entry["verb"] == "allow"
    evidence = chunks.find do |chunk|
      next false unless chunk.match?(entry["regex"])

      # A negated sentence is never evidence of permission.
      !(permissive && negative_human_chunk?(chunk))
    end
    next unless evidence

    # Guards against duplicate (verb, noun, evidence) triples.
    key = [entry["verb"], entry["noun"], evidence]
    next if seen[key]

    seen[key] = true
    build_signal(
      entry["verb"],
      entry["noun"],
      path: path,
      conditions: { "evidence" => evidence }
    )
  end
end
|
|
85
|
+
|
|
86
|
+
private
|
|
87
|
+
|
|
88
|
+
# Reduces an HTML body to a list of sentence-like text chunks.
#
# Script, style and noscript elements are blanked first, then every
# remaining tag. Entities are decoded, whitespace is normalised through
# FetchUtil.normalize_whitespace, and the text is split after ".", "!" or "?"
# followed by whitespace.
#
# @param body [#to_s] raw page body
# @return [Array<String>] non-empty, stripped chunks
def human_text_chunks(body)
  removable = [
    %r{<script\b.*?</script>}mi,
    %r{<style\b.*?</style>}mi,
    %r{<noscript\b.*?</noscript>}mi,
    /<[^>]+>/
  ]
  without_markup = removable.reduce(body.to_s) { |text, pattern| text.gsub(pattern, " ") }
  plain = FetchUtil.normalize_whitespace(CGI.unescapeHTML(without_markup))

  plain.split(/(?<=[.!?])\s+/).map(&:strip).reject(&:empty?)
end
|
|
98
|
+
|
|
99
|
+
# Reports whether +chunk+ contains a negating or prohibiting phrase
# ("do not", "may not", "forbidden", "prohibits", ...), case-insensitively.
#
# @param chunk [String] one text chunk
# @return [Boolean]
def negative_human_chunk?(chunk)
  negation = /\b(?:do\s+not|does\s+not|must\s+not|may\s+not|shall\s+not|not\s+permit(?:ted)?|not\s+allow(?:ed)?|forbid(?:den|s)?|prohibit(?:ed|s)?)\b/i
  chunk.match?(negation)
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Orchestration
|
|
6
|
+
# Stores the collaborators used by #call.
#
# @param client [Object, nil] HTTP client; when falsy an HttpClient is built
#   from +timeout+ and +user_agent+
# @param cache_path [String, nil] cache location; falls back to DEFAULT_CACHE_PATH
# @param sources [String, Array, nil] source selection kept for later resolution
# @param timeout [Integer] timeout handed to the default HttpClient
# @param user_agent [String, nil] agent for the default HttpClient; falls back
#   to default_user_agent
def initialize(client: nil, cache_path: DEFAULT_CACHE_PATH, sources: nil, timeout: 20, user_agent: nil)
  @client =
    if client
      client
    else
      agent = user_agent || default_user_agent
      HttpClient.new(timeout: timeout, user_agent: agent)
    end
  @cache_path = cache_path || DEFAULT_CACHE_PATH
  @source_tokens = sources
end
|
|
11
|
+
|
|
12
|
+
# Collects the selected regulatory signals for +url+.
#
# Each underlying record (tdmrep, trust.txt, robots.txt, the page itself)
# is fetched only when a selected source needs it. Signals are passed
# through scoped_signals before being stored. Page-derived sources and
# tdmpolicy are scoped against the target reported by page_query_target
# when the page was fetched; all others use the requested target.
#
# @param url [String] URL to evaluate
# @return [Hash{String=>Array}] signals keyed by source name
def call(url)
  requested_uri = parse_http_uri(url)
  origin_query = origin_query?(requested_uri)
  requested_target = request_target(requested_uri)
  page_target = requested_target
  wanted = resolve_sources(@source_tokens)
  payload = {}
  policies = []

  # Scopes +signals+ against +target+ and records them under +source+.
  publish = lambda do |source, signals, target|
    add_source_payload(
      payload,
      source,
      scoped_signals(signals, origin_query: origin_query, query_target: target)
    )
  end

  if needs_tdmrep_fetch?(wanted)
    tdmrep = tdmrep_record(requested_uri)
    publish.call("tdmrep", tdmrep["signals"], requested_target) if wanted.include?("tdmrep")
    # Policy references are kept even when tdmrep itself is not reported.
    policies.concat(tdmrep["policies"])
  end

  if wanted.include?("trusttxt")
    trust = trusttxt_record(requested_uri)
    publish.call("trusttxt", trust["signals"], requested_target)
  end

  if needs_robots_fetch?(wanted)
    robots = robots_record(requested_uri)
    %w[robotstxt contentsignal contentusagerobots].each do |source|
      next unless wanted.include?(source)

      publish.call(source, robots.dig("signals", source), requested_target)
    end
  end

  if needs_page_fetch?(wanted)
    page = page_record(requested_uri)
    page_target = page_query_target(page, fallback: requested_target)
    %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader human].each do |source|
      next unless wanted.include?(source)

      publish.call(source, page.dig("signals", source), page_target)
    end
    policies.concat(page["policies"])
  end

  if wanted.include?("tdmpolicy")
    publish.call("tdmpolicy", expanded_tdm_policy_signals(policies), page_target)
  end

  payload
end
|
|
72
|
+
|
|
73
|
+
private
|
|
74
|
+
|
|
75
|
+
attr_reader :cache_path, :client
|
|
76
|
+
|
|
77
|
+
# User-Agent string used when the caller supplies none.
#
# @return [String] "fetch_util/" followed by FetchUtil::VERSION
def default_user_agent
  ["fetch_util", FetchUtil::VERSION].join("/")
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Page
|
|
6
|
+
# Builds the signal record for the requested page.
#
# Header-based signals (X-Robots-Tag, Content-Usage, TDM headers) are
# gathered from every step returned by response_chain, each attributed to
# that step's own request target. Meta-tag and human-text signals are read
# from the final body, only when html_content? accepts it.
#
# @param requested_uri [URI] page to inspect
# @return [Hash] "final_url", "signals" (per source) and "policies", or
#   whatever fetch_record yields for the fallback empty_page_record
def page_record(requested_uri)
  record_key = "page:#{requested_uri}"

  fetch_record(record_key, requested_uri.to_s, fallback: empty_page_record, require_success: false) do |_body, response|
    landing_uri = parse_http_uri(response.url)
    landing_path = request_target(landing_uri)
    from_headers = { "xrobotstag" => [], "contentusageheader" => [], "tdmheaders" => [], "policies" => [] }

    response_chain(response).each do |hop|
      hop_path = request_target(parse_http_uri(hop.url))
      from_headers["xrobotstag"].concat(extract_x_robot_signals(hop.headers, path: hop_path))
      from_headers["contentusageheader"].concat(extract_content_usage_header_signals(hop.headers, path: hop_path))
      hop_signals, hop_policies = extract_tdm_value_signals(
        reservation: first_header_value(hop.headers, "tdm-reservation"),
        policy_url: first_header_value(hop.headers, "tdm-policy"),
        path: hop_path
      )
      from_headers["tdmheaders"].concat(hop_signals)
      from_headers["policies"].concat(hop_policies)
    end

    # Body-derived values stay empty for non-HTML responses.
    from_body = { "metarobots" => [], "tdmmeta" => [], "policies" => [], "human" => [] }
    if html_content?(response.headers, response.body)
      tags = parse_meta_tags(response.body)
      from_body["metarobots"] = sort_generic_signals(extract_meta_robot_signals(tags, path: landing_path))
      from_body["tdmmeta"], from_body["policies"] = extract_tdm_meta_signals(tags, path: landing_path)
      from_body["human"] = sort_generic_signals(extract_human_signals(response.body, path: landing_path))
    end

    {
      "final_url" => landing_uri.to_s,
      "signals" => {
        "xrobotstag" => sort_generic_signals(from_headers["xrobotstag"]),
        "metarobots" => from_body["metarobots"],
        "tdmheaders" => sort_generic_signals(from_headers["tdmheaders"]),
        "tdmmeta" => sort_generic_signals(from_body["tdmmeta"]),
        "contentusageheader" => sort_usage_preference_signals(from_headers["contentusageheader"]),
        "human" => from_body["human"]
      },
      "policies" => dedupe_policy_refs(from_headers["policies"] + from_body["policies"])
    }
  end
end
|
|
53
|
+
|
|
54
|
+
# Blank page record used as the fetch fallback: no final URL, an empty
# list for each page-level source, and no policy references.
#
# @return [Hash]
def empty_page_record
  source_names = %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader human]

  {
    "final_url" => nil,
    "signals" => source_names.to_h { |name| [name, []] },
    "policies" => []
  }
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module RobotGlobs
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Converts a robots.txt user-agent token into a glob.
#
# Blank tokens and "*" map to "*". A token already ending in "*" is kept
# (stripped). Anything else gets a trailing "*".
#
# @param user_agent [#to_s] raw token
# @return [String] glob form of the token
def robot_user_agent_glob(user_agent)
  name = user_agent.to_s.strip

  if name.empty? || name == "*"
    "*"
  elsif name.end_with?("*")
    name
  else
    "#{name}*"
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Robots
|
|
6
|
+
# Robots directive names listed by this module, in alphabetical order.
ROBOT_DIRECTIVES = %w[
  all follow index indexifembedded
  max-image-preview max-snippet max-video-preview
  noai noarchive nocache nofollow noimageai noimageindex noindex none nosnippet notranslate
  unavailable_after
].freeze
|
|
26
|
+
# Fetches the origin's robots.txt and returns its sorted signals.
#
# @param requested_uri [URI] any URI on the origin of interest
# @return [Hash] {"signals" => {...}} with "robotstxt", "contentsignal" and
#   "contentusagerobots" lists, or whatever fetch_record yields for the
#   fallback empty_robots_record
def robots_record(requested_uri)
  record_key = "robotstxt:#{origin_key(requested_uri)}"
  location = robots_uri(requested_uri)

  fetch_record(record_key, location, fallback: empty_robots_record) do |body|
    rules, content_signal, content_usage =
      extract_robots_source_signals(body).values_at("robotstxt", "contentsignal", "contentusagerobots")

    {
      "signals" => {
        "robotstxt" => sort_robot_signals(rules),
        "contentsignal" => sort_usage_preference_signals(content_signal),
        "contentusagerobots" => sort_usage_preference_signals(content_usage)
      }
    }
  end
end
|
|
38
|
+
|
|
39
|
+
# Parses a robots.txt body into raw signals for the three robots-backed
# sources.
#
# Lines are read as "field: value" pairs after removing "#" comments.
# Consecutive User-agent lines form a group. The first rule line
# (Allow, Disallow, Content-Signal, Content-Usage) closes the group, and a
# later User-agent line starts a new one. Rule lines seen before any
# User-agent line are ignored.
#
# @param body [#to_s] robots.txt content with any of \n, \r\n or \r endings
# @return [Hash{String=>Array}] lists under "robotstxt", "contentsignal" and
#   "contentusagerobots"
def extract_robots_source_signals(body)
  signals = {
    "robotstxt" => [],
    "contentsignal" => [],
    "contentusagerobots" => []
  }
  user_agents = []
  in_rules = false

  normalized = body.to_s.gsub("\r\n", "\n").gsub("\r", "\n")

  # chomp: true matters: with the newline still attached, ".*\z" cannot reach
  # the end of the string and inline comments would survive.
  normalized.each_line(chomp: true) do |line|
    content = line.sub(/\s*#.*\z/, "").strip
    next if content.empty?

    field, value = content.split(":", 2)
    next unless value

    field = field.strip.downcase
    value = value.strip

    case field
    when "user-agent"
      # A User-agent line after rules begins a fresh group.
      user_agents = [] if in_rules
      in_rules = false
      user_agents << value unless value.empty?
    when "allow", "disallow"
      next if user_agents.empty?

      in_rules = true
      user_agents.each do |user_agent|
        signals["robotstxt"] << robot_signal(field, user_agent, value)
      end
    when "content-signal"
      next if user_agents.empty?

      in_rules = true
      user_agents.each do |user_agent|
        signals["contentsignal"].concat(extract_content_signal_signals(value, user_agent: user_agent))
      end
    when "content-usage"
      next if user_agents.empty?

      in_rules = true
      user_agents.each do |user_agent|
        signals["contentusagerobots"].concat(extract_content_usage_robot_signals(value, user_agent: user_agent))
      end
    end
  end

  signals
end
|
|
89
|
+
|
|
90
|
+
private
|
|
91
|
+
|
|
92
|
+
# Blank robots record used as the fetch fallback: an empty list for each
# robots-backed source.
#
# @return [Hash]
def empty_robots_record
  source_names = %w[robotstxt contentsignal contentusagerobots]
  { "signals" => source_names.to_h { |name| [name, []] } }
end
|
|
101
|
+
|
|
102
|
+
# Builds one signal from an Allow/Disallow rule line.
#
# A blank rule value always produces an "allow" signal; otherwise the verb
# is the rule's own field. The user-agent glob is stored as a condition
# unless it is the catch-all "*".
#
# @param field [String] "allow" or "disallow"
# @param user_agent [String] user-agent token the rule belongs to
# @param value [String] rule path
# @return [Hash] signal from build_signal with noun "*"
def robot_signal(field, user_agent, value)
  verb =
    if value.to_s.strip.empty?
      "allow"
    else
      field
    end

  glob = robot_user_agent_glob(user_agent)
  conditions = glob == "*" ? {} : { "user-agent" => glob }

  build_signal(verb, "*", path: normalize_output_path(value), conditions: conditions)
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module Signals
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Returns copies of +signals+ narrowed to the query.
#
# For an origin-wide query every signal is returned unchanged (as a copy).
# Otherwise only signals whose "path" matches +query_target+ are kept, and
# the "path" key is removed from each.
#
# @param signals [Array<Hash>, nil] source signals
# @param origin_query [Boolean] whether the whole origin was queried
# @param query_target [String] request target to match paths against
# @return [Array<Hash>]
def scoped_signals(signals, origin_query:, query_target:)
  copies = Array(signals).map { |original| deep_copy(original) }
  return copies if origin_query

  applicable = copies.select { |copy| signal_matches_target?(copy["path"], query_target) }
  applicable.map { |copy| copy.reject { |name, _| name == "path" } }
end
|
|
18
|
+
|
|
19
|
+
# Tells whether a signal scoped to +path+ applies to +query_target+.
# A missing or empty path applies to every target.
#
# @param path [String, nil] signal path pattern
# @param query_target [String] request target being checked
# @return [Boolean]
def signal_matches_target?(path, query_target)
  if path.nil? || path.empty?
    true
  else
    anchored = Regexp.new("\\A#{signal_path_pattern(path)}")
    anchored.match?(query_target)
  end
end
|
|
24
|
+
|
|
25
|
+
# Translates a robots-style path pattern into regular-expression source.
#
# The path is escaped literally, "*" becomes ".*", and a trailing "$" pins
# the match to the end of the target. Without a trailing "$" the pattern
# behaves as a prefix (".*" is appended). The result carries no start
# anchor; signal_matches_target? prepends "\A".
#
# @param path [#to_s] path pattern, e.g. "/private/*.pdf$"
# @return [String] regexp source
def signal_path_pattern(path)
  escaped = Regexp.escape(path.to_s).gsub("\\*", ".*")

  if escaped.end_with?("\\$")
    # "\z" rather than "$": Ruby's "$" also matches before any newline, which
    # would let a target with an embedded line break satisfy an end-anchored rule.
    "#{escaped[0...-2]}\\z"
  else
    "#{escaped}.*"
  end
end
|
|
34
|
+
|
|
35
|
+
# Orders signals by the signal_sort_prefix key, then by the "policy"
# condition (as a string) to break ties.
#
# @param signals [Array<Hash>, nil]
# @return [Array<Hash>] sorted copy
def sort_specificity_signals(signals)
  Array(signals).sort_by do |entry|
    policy = entry.dig("conditions", "policy").to_s
    signal_sort_prefix(entry) + [policy]
  end
end
|
|
43
|
+
|
|
44
|
+
# Leading sort key shared by the signal sorters: negated path specificity
# (longer paths first), then 0 for allow signals and 1 for the rest.
#
# @param signal [Hash]
# @return [Array(Integer, Integer)]
def signal_sort_prefix(signal)
  specificity_rank = -path_specificity(signal["path"])
  verb_rank = allow_signal?(signal) ? 0 : 1

  [specificity_rank, verb_rank]
end
|
|
50
|
+
|
|
51
|
+
# Orders signals with non-allow signals first, user-agent-specific before
# wildcard, then by verb and noun.
#
# @param signals [Array<Hash>, nil]
# @return [Array<Hash>] sorted copy
def sort_generic_signals(signals)
  Array(signals).sort_by do |entry|
    allow_rank = allow_signal?(entry) ? 1 : 0
    wildcard_rank = wildcard_signal?(entry) ? 1 : 0

    [allow_rank, wildcard_rank, signal_verb(entry), signal_noun(entry)]
  end
end
|
|
61
|
+
|
|
62
|
+
# Name of the first "allow"/"disallow" key found in +signal+, in key order.
#
# @param signal [Hash]
# @return [String] the verb, or "" when neither key is present
def signal_verb(signal)
  signal.each_key do |name|
    return name.to_s if %w[allow disallow].include?(name)
  end

  ""
end
|
|
65
|
+
|
|
66
|
+
# Value stored under "allow", or under "disallow" when "allow" is nil,
# rendered as a string.
#
# @param signal [Hash]
# @return [String] the noun, or "" when both are nil
def signal_noun(signal)
  allowed = signal["allow"]
  noun = allowed.nil? ? signal["disallow"] : allowed

  noun.to_s
end
|
|
69
|
+
|
|
70
|
+
# True when +signal+ carries an "allow" key, whatever its value.
#
# @param signal [Hash]
# @return [Boolean]
def allow_signal?(signal)
  signal.include?("allow")
end
|
|
73
|
+
|
|
74
|
+
# True when the signal is not tied to a specific user agent: its
# "user-agent" condition is absent, blank or "*".
#
# @param signal [Hash]
# @return [Boolean]
def wildcard_signal?(signal)
  agent = signal.dig("conditions", "user-agent").to_s
  agent.empty? || agent == "*"
end
|
|
77
|
+
|
|
78
|
+
# Specificity of a path pattern: its character count, not counting the
# "*" and "$" pattern characters.
#
# @param path [#to_s]
# @return [Integer]
def path_specificity(path)
  # "^*$" selects every character other than "*" and "$".
  path.to_s.count("^*$")
end
|
|
81
|
+
|
|
82
|
+
# Converts +value+ with Kernel#Integer when it parses as an integer;
# otherwise returns +value+ untouched.
#
# @param value [Object]
# @return [Integer, Object]
def integer_or_value(value)
  parsed = Integer(value, exception: false)
  parsed.nil? ? value : parsed
end
|
|
85
|
+
|
|
86
|
+
# Assembles a signal hash of the form { verb => noun }, adding "path" when
# one is given and "conditions" when they are present and non-empty.
#
# @param verb [String] hash key, e.g. "allow" or "disallow"
# @param noun [String] value stored under the verb
# @param path [String, nil]
# @param conditions [Hash, nil]
# @return [Hash]
def build_signal(verb, noun, path: nil, conditions: nil)
  { verb => noun }.tap do |signal|
    signal["path"] = path if path
    signal["conditions"] = conditions unless !conditions || conditions.empty?
  end
end
|
|
92
|
+
|
|
93
|
+
# Orders robots.txt signals by path specificity (most specific first),
# user-agent-specific rules before wildcard ones, allow before disallow,
# and finally by user-agent glob.
#
# The input is coerced with Array(), as the other sorters in this module
# do, so a nil list yields [] instead of raising.
#
# @param signals [Array<Hash>, nil]
# @return [Array<Hash>] sorted copy
def sort_robot_signals(signals)
  Array(signals).sort_by do |signal|
    specificity_rank, verb_rank = signal_sort_prefix(signal)

    [
      specificity_rank,
      wildcard_signal?(signal) ? 1 : 0,
      verb_rank,
      signal.dig("conditions", "user-agent").to_s
    ]
  end
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module SourceSelection
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
# Every supported source name: MACHINE_SOURCES followed by HUMAN_SOURCES.
# The frozen list is built once per instance and then reused.
#
# @return [Array<String>]
def all_sources
  return @all_sources if @all_sources

  @all_sources = [*MACHINE_SOURCES, *HUMAN_SOURCES].freeze
end
|
|
11
|
+
|
|
12
|
+
# Expands a source selection into an ordered list of concrete source names.
#
# The selection may be a string or an array of strings; each is split on
# commas. A token may name a single source or a key of SOURCE_CLASSES. A
# leading "-" removes the named source(s) from what has been selected so
# far. A nil selection means "machine".
#
# @param selection [String, Array<String>, nil]
# @return [Array<String>] unique source names in selection order
# @raise [ArgumentError] via validate_source! for an unknown source
def resolve_sources(selection)
  requested = Array(selection || "machine")
              .flat_map { |value| value.to_s.split(",") }
              .map(&:strip)
              .reject(&:empty?)

  requested.each_with_object([]) do |token, chosen|
    removing = token.start_with?("-")
    label = removing ? token[1..] : token

    SOURCE_CLASSES.fetch(label, [label]).each do |source|
      validate_source!(source)

      if removing
        chosen.delete(source)
      elsif !chosen.include?(source)
        chosen << source
      end
    end
  end
end
|
|
34
|
+
|
|
35
|
+
# Ensures +source+ is one of all_sources.
#
# @param source [String]
# @return [nil]
# @raise [ArgumentError] when the source is not supported
def validate_source!(source)
  raise ArgumentError, "unsupported regulatory source: #{source}" unless all_sources.include?(source)
end
|
|
40
|
+
|
|
41
|
+
# Whether any selected source depends on fetching the page itself: the
# HUMAN_SOURCES, the header/meta sources, or tdmpolicy.
#
# @param selected_sources [Array<String>]
# @return [Boolean]
def needs_page_fetch?(selected_sources)
  page_backed = HUMAN_SOURCES + %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader tdmpolicy]
  selected_sources.any? { |source| page_backed.include?(source) }
end
|
|
44
|
+
|
|
45
|
+
# Whether the tdmrep record must be fetched: needed by both the tdmrep and
# tdmpolicy sources.
#
# @param selected_sources [Array<String>]
# @return [Boolean]
def needs_tdmrep_fetch?(selected_sources)
  tdmrep_backed = %w[tdmrep tdmpolicy]
  selected_sources.any? { |source| tdmrep_backed.include?(source) }
end
|
|
48
|
+
|
|
49
|
+
# Whether robots.txt must be fetched: needed by robotstxt, contentsignal
# and contentusagerobots.
#
# @param selected_sources [Array<String>]
# @return [Boolean]
def needs_robots_fetch?(selected_sources)
  robots_backed = %w[robotstxt contentsignal contentusagerobots]
  selected_sources.any? { |source| robots_backed.include?(source) }
end
|
|
52
|
+
|
|
53
|
+
# Stores +signals+ under +source+ in +result+, leaving +result+ untouched
# when the signals are nil or empty.
#
# @param result [Hash] payload being assembled (mutated in place)
# @param source [String] source name
# @param signals [Array, nil]
# @return [Array, nil] the stored signals, or nil when nothing was stored
def add_source_payload(result, source, signals)
  result[source] = signals unless signals.nil? || signals.empty?
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module TdmPage
|
|
6
|
+
# Reads the "tdm-reservation" and "tdm-policy" meta tags and converts them
# into signals.
#
# Tag names are compared stripped and case-insensitively. For each name
# the first tag with a truthy "content" attribute wins.
#
# @param meta_tags [Array<Hash>] attribute hashes of the page's meta tags
# @param path [String] path recorded on the produced signal
# @return [Array(Array, Array)] result of extract_tdm_value_signals
def extract_tdm_meta_signals(meta_tags, path:)
  first_content = lambda do |wanted|
    meta_tags.filter_map do |attributes|
      attributes["content"] if attributes["name"].to_s.strip.downcase == wanted
    end.first
  end

  extract_tdm_value_signals(
    reservation: first_content.call("tdm-reservation"),
    policy_url: first_content.call("tdm-policy"),
    path: path
  )
end
|
|
18
|
+
|
|
19
|
+
# Converts a TDM reservation value (and optional policy URL) into a signal
# list and a policy-reference list.
#
# A reservation of "1" yields a "disallow" signal plus a policy reference;
# "0" yields an "allow" signal and no references. Any other value yields
# two empty lists. The policy URL is attached as a condition only for a
# reservation of "1" with a non-blank URL.
#
# @param reservation [#to_s] raw reservation value
# @param policy_url [#to_s] raw policy URL
# @param path [String] path recorded on the signal and the policy reference
# @return [Array(Array<Hash>, Array)] signals and de-duplicated policy refs
def extract_tdm_value_signals(reservation:, policy_url:, path:)
  flag = reservation.to_s.strip
  return [[], []] unless %w[0 1].include?(flag)

  reserved = flag == "1"
  policy = policy_url.to_s.strip
  conditions = (reserved && !policy.empty?) ? { "policy" => policy } : {}

  signal = build_signal(
    reserved ? "disallow" : "allow",
    "text-and-data-mining",
    path: path,
    conditions: conditions
  )
  refs = reserved ? [policy_ref(policy, path)] : []

  [[signal], dedupe_policy_refs(refs)]
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Regulatory
|
|
5
|
+
module TdmPolicy
|
|
6
|
+
# Resolves each distinct policy reference into concrete signals.
#
# Every signal template from the referenced policy document is copied. It
# inherits the reference's path unless it already has one, and is tagged
# with the policy URL under conditions["policy"].
#
# @param policy_refs [Array<Hash>] references carrying "url" and "path"
# @return [Array<Hash>] expanded signals for all references
def expanded_tdm_policy_signals(policy_refs)
  dedupe_policy_refs(policy_refs).flat_map do |reference|
    policy_url = reference["url"]
    templates = Array(tdm_policy_record(policy_url)["signals"])

    templates.map do |template|
      deep_copy(template).tap do |signal|
        signal["path"] ||= reference["path"]
        signal["conditions"] = (signal["conditions"] || {}).tap do |conditions|
          conditions["policy"] = policy_url
        end
      end
    end
  end
end
|
|
19
|
+
|
|
20
|
+
# Loads (through cache_fetch) the signals described by the policy document
# at +url+.
#
# Signals are extracted only from a 2xx response that
# json_like_response? accepts; anything else yields an empty list.
#
# @param url [String] policy document URL
# @return [Hash] {"signals" => Array<Hash>} sorted by sort_specificity_signals
def tdm_policy_record(url)
  cache_fetch("tdmpolicy:#{url}") do
    response = safe_get(url)
    usable = response&.status&.between?(200, 299) && json_like_response?(response.headers, response.body)
    extracted = usable ? extract_tdm_policy_signals(response.body) : []

    { "signals" => sort_specificity_signals(extracted) }
  end
end
|
|
30
|
+
|
|
31
|
+
# Parses a TDM policy JSON document into "allow" signals.
#
# One signal is produced per Hash entry under "permission" whose "action"
# passes tdm_policy_action?. Duties and purpose, when present, are stored
# as conditions; a target path, when one can be derived, is stored as
# "path". A body that is not valid JSON yields an empty list.
#
# @param body [#to_s] policy document body
# @return [Array<Hash>]
def extract_tdm_policy_signals(body)
  document = JSON.parse(body.to_s)
  granted = array_value(document, "permission").select do |rule|
    rule.is_a?(Hash) && tdm_policy_action?(rule["action"])
  end

  granted.map do |rule|
    conditions = {}
    duties = array_value(rule, "duty").filter_map { |item| duty_name(item) }
    conditions["duty"] = duties unless duties.empty?
    purpose = permission_purpose(rule)
    conditions["purpose"] = purpose if purpose

    build_signal("allow", "text-and-data-mining", conditions: conditions).tap do |signal|
      # Path is set after construction so it follows "conditions" in key order.
      target_path = policy_target_path(rule["target"])
      signal["path"] = target_path if target_path
    end
  end
rescue JSON::ParserError
  []
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|