fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Heuristic detection of usage statements written for human readers
    # (for example "we do not allow scraping") in the visible text of a page.
    module Human
      # Regex source for phrases that open a prohibition.
      HUMAN_NEGATION_SOURCE =
        '\b(?:do\s+not|does\s+not|must\s+not|may\s+not|shall\s+not|not\s+permit(?:ted)?|' \
        'not\s+allow(?:ed)?|forbid(?:den|s)?|prohibit(?:ed|s)?)\b'

      # Regex source for phrases that open a permission.
      HUMAN_PERMISSION_SOURCE = '\b(?:allow(?:ed|s)?|permit(?:ted|s)?|may|can)\b'

      # Regex source for each usage category (signal noun), in reporting order.
      HUMAN_TOPIC_SOURCES = {
        "text-and-data-mining" => '\b(?:text\s+and\s+data\s+mining|text-and-data-mining|tdm)\b',
        "ai-training" =>
          '\b(?:ai\s+training|training\s+(?:of|for)\s+(?:ai|models?)|machine\s+learning|' \
          'large\s+language\s+model|large\s+language\s+models|llm|generative\s+ai)\b',
        "index" => '\b(?:index(?:ing)?|search\s+engine(?:s)?)\b',
        "fetch" => '\b(?:crawl(?:ing)?|scrap(?:e|ing)|fetch(?:ing)?|bot(?:s)?)\b'
      }.freeze

      # One entry per verb/noun pair: all "disallow" entries first, then all
      # "allow" entries. A pattern matches when the opening phrase is followed,
      # within 120 characters, by one of the category's keywords.
      HUMAN_PATTERNS = [
        ["disallow", HUMAN_NEGATION_SOURCE],
        ["allow", HUMAN_PERMISSION_SOURCE]
      ].flat_map do |verb, lead|
        HUMAN_TOPIC_SOURCES.map do |noun, topic|
          {
            "verb" => verb,
            "noun" => noun,
            "regex" => Regexp.new("#{lead}.{0,120}#{topic}", Regexp::IGNORECASE)
          }
        end
      end.freeze

      # Matches any sentence that contains a prohibition phrase.
      HUMAN_NEGATION_PATTERN = Regexp.new(HUMAN_NEGATION_SOURCE, Regexp::IGNORECASE)

      # Scans the sentences of +body+ and emits at most one signal per pattern.
      #
      # For "allow" patterns a sentence only counts as evidence when it holds no
      # prohibition phrase; the search continues past negated sentences so that a
      # later, unambiguous permission is still reported.
      #
      # @param body [#to_s] raw page markup
      # @param path [String] path recorded on every emitted signal
      # @return [Array<Hash>] signals built with +build_signal+, each carrying the
      #   matching sentence under conditions["evidence"]
      def extract_human_signals(body, path:)
        chunks = human_text_chunks(body)

        HUMAN_PATTERNS.filter_map do |entry|
          permissive = entry["verb"] == "allow"
          evidence = chunks.find do |chunk|
            chunk.match?(entry["regex"]) && !(permissive && negative_human_chunk?(chunk))
          end
          next unless evidence

          build_signal(
            entry["verb"],
            entry["noun"],
            path: path,
            conditions: { "evidence" => evidence }
          )
        end
      end

      private

      # Removes script/style/noscript elements and all remaining tags, decodes
      # HTML entities, normalises whitespace and splits the text after sentence
      # punctuation (., ! or ?).
      def human_text_chunks(body)
        text = body.to_s
        text = text.gsub(%r{<script\b.*?</script>}mi, " ")
        text = text.gsub(%r{<style\b.*?</style>}mi, " ")
        text = text.gsub(%r{<noscript\b.*?</noscript>}mi, " ")
        text = text.gsub(/<[^>]+>/, " ")
        text = CGI.unescapeHTML(text)
        text = FetchUtil.normalize_whitespace(text)
        text.split(/(?<=[.!?])\s+/).map(&:strip).reject(&:empty?)
      end

      # True when the sentence contains a prohibition phrase.
      def negative_human_chunk?(chunk)
        chunk.match?(HUMAN_NEGATION_PATTERN)
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Entry point: gathers the signals of every selected source for one URL.
    module Orchestration
      # @param client [Object, nil] HTTP client; when nil an HttpClient is built
      #   from +timeout+ and +user_agent+
      # @param cache_path [String, nil] cache location; nil falls back to DEFAULT_CACHE_PATH
      # @param sources [String, Array<String>, nil] source selection, handed to
      #   +resolve_sources+ on every call
      # @param timeout [Integer] forwarded to HttpClient when no client is given
      # @param user_agent [String, nil] forwarded to HttpClient; defaults to "fetch_util/<version>"
      def initialize(client: nil, cache_path: DEFAULT_CACHE_PATH, sources: nil, timeout: 20, user_agent: nil)
        @client = client || HttpClient.new(timeout: timeout, user_agent: user_agent || default_user_agent)
        @cache_path = cache_path || DEFAULT_CACHE_PATH
        @source_tokens = sources
      end

      # Collects signals for +url+ from the selected sources.
      #
      # Sources are visited in a fixed order: tdmrep, trusttxt, the robots.txt
      # family, the page-level family and finally tdmpolicy, which expands the
      # policy references gathered by the earlier steps.
      #
      # @param url [String] URL to evaluate
      # @return [Hash{String=>Array}] scoped signals keyed by source name; sources
      #   without signals are omitted
      def call(url)
        requested_uri = parse_http_uri(url)
        origin_query = origin_query?(requested_uri)
        query_target = request_target(requested_uri)
        selected_sources = resolve_sources(@source_tokens)
        result = {}
        policy_refs = []

        # Scopes and stores the signals of one source, but only when selected.
        publish = lambda do |source, signals, target|
          next unless selected_sources.include?(source)

          add_source_payload(
            result,
            source,
            scoped_signals(signals, origin_query: origin_query, query_target: target)
          )
        end

        # tdmrep is also fetched for tdmpolicy alone, because it supplies policy references.
        if needs_tdmrep_fetch?(selected_sources)
          record = tdmrep_record(requested_uri)
          publish.call("tdmrep", record["signals"], query_target)
          # Array() keeps a record without a "policies" entry from raising in concat.
          policy_refs.concat(Array(record["policies"]))
        end

        if selected_sources.include?("trusttxt")
          publish.call("trusttxt", trusttxt_record(requested_uri)["signals"], query_target)
        end

        if needs_robots_fetch?(selected_sources)
          record = robots_record(requested_uri)
          %w[robotstxt contentsignal contentusagerobots].each do |source|
            publish.call(source, record.dig("signals", source), query_target)
          end
        end

        # Page-level signals (and tdmpolicy below) are scoped against the target
        # that page_query_target derives from the page record.
        effective_query_target = query_target
        if needs_page_fetch?(selected_sources)
          record = page_record(requested_uri)
          effective_query_target = page_query_target(record, fallback: query_target)
          %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader human].each do |source|
            publish.call(source, record.dig("signals", source), effective_query_target)
          end
          policy_refs.concat(Array(record["policies"]))
        end

        # Guarded explicitly so policies are only expanded when the source is selected.
        if selected_sources.include?("tdmpolicy")
          publish.call("tdmpolicy", expanded_tdm_policy_signals(policy_refs), effective_query_target)
        end

        result
      end

      private

      attr_reader :cache_path, :client

      # User-Agent string used when the caller supplies none.
      def default_user_agent
        "fetch_util/#{FetchUtil::VERSION}"
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Signals carried by the requested page itself: response headers along the
    # response chain plus, for HTML documents, meta tags and human-readable text.
    module Page
      # Fetches the page through +fetch_record+ and assembles its signal record.
      #
      # Header-based signals are collected for every step returned by
      # +response_chain+ and attributed to that step's own path; meta-tag and
      # human-text signals come from the final response only and only when
      # +html_content?+ accepts it.
      #
      # @param requested_uri [#to_s] URI of the page to inspect
      # @return [Hash] record shaped like +empty_page_record+
      def page_record(requested_uri)
        fetch_record("page:#{requested_uri}", requested_uri.to_s, fallback: empty_page_record, require_success: false) do |_body, response|
          final_uri = parse_http_uri(response.url)
          final_path = request_target(final_uri)

          # Start from the empty template and fill the buckets in place.
          record = empty_page_record
          buckets = record["signals"]
          policies = []

          response_chain(response).each do |hop|
            hop_path = request_target(parse_http_uri(hop.url))
            buckets["xrobotstag"].concat(extract_x_robot_signals(hop.headers, path: hop_path))
            buckets["contentusageheader"].concat(extract_content_usage_header_signals(hop.headers, path: hop_path))
            hop_tdm, hop_policies = extract_tdm_value_signals(
              reservation: first_header_value(hop.headers, "tdm-reservation"),
              policy_url: first_header_value(hop.headers, "tdm-policy"),
              path: hop_path
            )
            buckets["tdmheaders"].concat(hop_tdm)
            policies.concat(hop_policies)
          end

          if html_content?(response.headers, response.body)
            meta_tags = parse_meta_tags(response.body)
            buckets["metarobots"] = sort_generic_signals(extract_meta_robot_signals(meta_tags, path: final_path))
            buckets["tdmmeta"], meta_policies = extract_tdm_meta_signals(meta_tags, path: final_path)
            # Header policies stay ahead of meta policies.
            policies.concat(meta_policies)
            buckets["human"] = sort_generic_signals(extract_human_signals(response.body, path: final_path))
          end

          %w[xrobotstag tdmheaders tdmmeta].each do |source|
            buckets[source] = sort_generic_signals(buckets[source])
          end
          buckets["contentusageheader"] = sort_usage_preference_signals(buckets["contentusageheader"])

          record.merge("final_url" => final_uri.to_s, "policies" => dedupe_policy_refs(policies))
        end
      end

      # Record used when the page yields nothing: no final URL, an empty list for
      # every page-level source and no policy references.
      def empty_page_record
        no_signals = %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader human].to_h do |source|
          [source, []]
        end

        { "final_url" => nil, "signals" => no_signals, "policies" => [] }
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Helper for turning robots.txt user-agent tokens into glob patterns.
    module RobotGlobs
      private

      # Converts a user-agent token into a prefix glob.
      #
      # Blank input and "*" both yield "*"; a token that already ends in "*" is
      # returned stripped but otherwise unchanged; anything else gets "*" appended.
      #
      # @param user_agent [#to_s] token as written in robots.txt
      # @return [String] glob pattern
      def robot_user_agent_glob(user_agent)
        name = user_agent.to_s.strip
        return "*" if ["", "*"].include?(name)

        name.end_with?("*") ? name : "#{name}*"
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Reads robots.txt and converts its rule groups into signals.
    module Robots
      ROBOT_DIRECTIVES = %w[
        all
        follow
        index
        indexifembedded
        max-image-preview
        max-snippet
        max-video-preview
        noai
        noarchive
        nocache
        nofollow
        noimageai
        noimageindex
        noindex
        none
        nosnippet
        notranslate
        unavailable_after
      ].freeze

      # Fetches robots.txt for the origin of +requested_uri+ through
      # +fetch_record+ and returns its sorted signals.
      #
      # @param requested_uri [Object] URI whose origin is inspected
      # @return [Hash] record shaped like +empty_robots_record+
      def robots_record(requested_uri)
        fetch_record("robotstxt:#{origin_key(requested_uri)}", robots_uri(requested_uri), fallback: empty_robots_record) do |body|
          payload = extract_robots_source_signals(body)
          {
            "signals" => {
              "robotstxt" => sort_robot_signals(payload["robotstxt"]),
              "contentsignal" => sort_usage_preference_signals(payload["contentsignal"]),
              "contentusagerobots" => sort_usage_preference_signals(payload["contentusagerobots"])
            }
          }
        end
      end

      # Parses a robots.txt body into unsorted signals per source.
      #
      # Consecutive User-agent lines form one group. The first rule line
      # (Allow, Disallow, Content-Signal or Content-Usage) closes the group
      # header, so the next User-agent line starts a new group. Rule lines that
      # appear before any User-agent line are ignored, as are unknown fields and
      # lines without a colon.
      #
      # @param body [#to_s] robots.txt content
      # @return [Hash{String=>Array}] signals under "robotstxt", "contentsignal"
      #   and "contentusagerobots"
      def extract_robots_source_signals(body)
        signals = {
          "robotstxt" => [],
          "contentsignal" => [],
          "contentusagerobots" => []
        }
        user_agents = []
        in_rules = false

        body.to_s.gsub(/\r\n?/, "\n").each_line do |line|
          # Drop everything from "#" onwards. The line still carries its trailing
          # newline here, so the pattern must not be anchored with \z before a
          # non-multiline "." — that combination never matched and left comments
          # inside user-agent names and paths.
          content = line.sub(/#.*/m, "").strip
          next if content.empty?

          field, value = content.split(":", 2)
          next unless value

          field = field.strip.downcase
          value = value.strip

          case field
          when "user-agent"
            user_agents = [] if in_rules
            in_rules = false
            user_agents << value unless value.empty?
          when "allow", "disallow"
            next if user_agents.empty?

            in_rules = true
            user_agents.each do |user_agent|
              signals["robotstxt"] << robot_signal(field, user_agent, value)
            end
          when "content-signal"
            next if user_agents.empty?

            in_rules = true
            user_agents.each do |user_agent|
              signals["contentsignal"].concat(extract_content_signal_signals(value, user_agent: user_agent))
            end
          when "content-usage"
            next if user_agents.empty?

            in_rules = true
            user_agents.each do |user_agent|
              signals["contentusagerobots"].concat(extract_content_usage_robot_signals(value, user_agent: user_agent))
            end
          end
        end

        signals
      end

      private

      # Record used when robots.txt yields nothing.
      def empty_robots_record
        {
          "signals" => {
            "robotstxt" => [],
            "contentsignal" => [],
            "contentusagerobots" => []
          }
        }
      end

      # Builds one Allow/Disallow signal for a single user agent.
      #
      # A rule with an empty value is reported as "allow" whatever its field.
      # The user-agent condition is omitted for the "*" wildcard.
      def robot_signal(field, user_agent, value)
        verb = value.to_s.strip.empty? ? "allow" : field
        conditions = {}
        user_agent_glob = robot_user_agent_glob(user_agent)
        conditions["user-agent"] = user_agent_glob unless user_agent_glob == "*"

        build_signal(
          verb,
          "*",
          path: normalize_output_path(value),
          conditions: conditions
        )
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Shared helpers for building, scoping and ordering signal hashes.
    #
    # A signal is a Hash with one "allow" or "disallow" key (whose value is the
    # noun), an optional "path" and an optional "conditions" Hash.
    module Signals
      private

      # Copies +signals+ and narrows them to the queried target.
      #
      # For an origin-level query every signal is returned with its path. For any
      # other query only signals whose path pattern matches +query_target+ are
      # kept, and the "path" key is removed from them.
      #
      # @param signals [Array<Hash>, nil]
      # @param origin_query [Boolean]
      # @param query_target [String]
      # @return [Array<Hash>]
      def scoped_signals(signals, origin_query:, query_target:)
        list = Array(signals).map { |signal| deep_copy(signal) }
        return list if origin_query

        list.filter_map do |signal|
          next unless signal_matches_target?(signal["path"], query_target)

          signal.reject { |key, _value| key == "path" }
        end
      end

      # True when +path+ is blank or its pattern matches +query_target+ from the start.
      def signal_matches_target?(path, query_target)
        return true if path.nil? || path.empty?

        Regexp.new("\\A#{signal_path_pattern(path)}").match?(query_target)
      end

      # Translates a robots-style path pattern into regex source: "*" matches
      # any run of characters and a trailing "$" anchors the end of the target.
      # The anchor is \z rather than $ because Ruby's $ also matches before a
      # newline inside the string.
      def signal_path_pattern(path)
        escaped = Regexp.escape(path.to_s)
        escaped = escaped.gsub("\\*", ".*")
        if escaped.end_with?("\\$")
          "#{escaped[0...-2]}\\z"
        else
          "#{escaped}.*"
        end
      end

      # Orders by longest path first, allow before disallow, then policy URL.
      def sort_specificity_signals(signals)
        Array(signals).sort_by do |signal|
          [
            *signal_sort_prefix(signal),
            signal.dig("conditions", "policy").to_s
          ]
        end
      end

      # Sort key shared by the path-aware sorters: negated path specificity,
      # then 0 for allow and 1 for disallow.
      def signal_sort_prefix(signal)
        [
          -path_specificity(signal["path"]),
          allow_signal?(signal) ? 0 : 1
        ]
      end

      # Orders disallow before allow, agent-specific before wildcard, then by
      # verb and noun.
      def sort_generic_signals(signals)
        Array(signals).sort_by do |signal|
          [
            allow_signal?(signal) ? 1 : 0,
            wildcard_signal?(signal) ? 1 : 0,
            signal_verb(signal),
            signal_noun(signal)
          ]
        end
      end

      # "allow" or "disallow", whichever key the signal carries ("" if neither).
      def signal_verb(signal)
        signal.keys.find { |key| %w[allow disallow].include?(key) }.to_s
      end

      # The value stored under the signal's verb key ("" if none).
      def signal_noun(signal)
        signal.values_at("allow", "disallow").compact.first.to_s
      end

      def allow_signal?(signal)
        signal.key?("allow")
      end

      # True when the signal has no user-agent condition or it is "*".
      def wildcard_signal?(signal)
        agent = signal.dig("conditions", "user-agent").to_s
        agent == "*" || agent.empty?
      end

      # Length of the path with "*" and "$" removed.
      def path_specificity(path)
        path.to_s.delete("*$").length
      end

      # Returns the Integer parsed from +value+, or +value+ itself when
      # Kernel#Integer cannot parse it.
      def integer_or_value(value)
        Integer(value, exception: false) || value
      end

      # Builds a signal hash; "path" is added only when given and "conditions"
      # only when non-empty.
      def build_signal(verb, noun, path: nil, conditions: nil)
        signal = { verb => noun }
        signal["path"] = path if path
        signal["conditions"] = conditions if conditions && !conditions.empty?
        signal
      end

      # Orders by longest path first, agent-specific before wildcard, allow
      # before disallow, then user-agent. Accepts nil like the other sorters.
      def sort_robot_signals(signals)
        Array(signals).sort_by do |signal|
          prefix = signal_sort_prefix(signal)
          [
            prefix.first,
            wildcard_signal?(signal) ? 1 : 0,
            prefix.last,
            signal.dig("conditions", "user-agent").to_s
          ]
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Resolves the caller's source selection and answers which fetches it needs.
    module SourceSelection
      private

      # Every supported source name: machine sources followed by human sources.
      def all_sources
        @all_sources ||= (MACHINE_SOURCES + HUMAN_SOURCES).freeze
      end

      # Expands a selection into an ordered list of unique source names.
      #
      # Tokens may be given as an array and/or comma-separated strings; nil means
      # "machine". A token naming an entry of SOURCE_CLASSES expands to that
      # entry's sources. A leading "-" removes the named sources from what has
      # been selected so far, so token order matters.
      #
      # @param selection [String, Array<String>, nil]
      # @return [Array<String>]
      # @raise [ArgumentError] when a token resolves to an unsupported source
      def resolve_sources(selection)
        pieces = Array(selection || "machine").flat_map { |value| value.to_s.split(",") }
        tokens = pieces.map(&:strip).reject(&:empty?)

        tokens.each_with_object([]) do |token, chosen|
          removing = token.start_with?("-")
          name = removing ? token[1..] : token

          SOURCE_CLASSES.fetch(name, [name]).each do |source|
            validate_source!(source)
            if removing
              chosen.delete(source)
            elsif !chosen.include?(source)
              chosen << source
            end
          end
        end
      end

      # Raises unless +source+ is one of +all_sources+.
      def validate_source!(source)
        raise ArgumentError, "unsupported regulatory source: #{source}" unless all_sources.include?(source)
      end

      # True when any human source, page-level source or tdmpolicy is selected.
      def needs_page_fetch?(selected_sources)
        page_backed = HUMAN_SOURCES + %w[xrobotstag metarobots tdmheaders tdmmeta contentusageheader tdmpolicy]
        (selected_sources & page_backed).any?
      end

      # True when tdmrep or tdmpolicy is selected.
      def needs_tdmrep_fetch?(selected_sources)
        tdmrep_backed = %w[tdmrep tdmpolicy]
        (selected_sources & tdmrep_backed).any?
      end

      # True when any source read from robots.txt is selected.
      def needs_robots_fetch?(selected_sources)
        robots_backed = %w[robotstxt contentsignal contentusagerobots]
        (selected_sources & robots_backed).any?
      end

      # Stores +signals+ under +source+ unless they are nil or empty.
      def add_source_payload(result, source, signals)
        result[source] = signals unless signals.nil? || signals.empty?
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Turns tdm-reservation / tdm-policy values into text-and-data-mining signals.
    module TdmPage
      # Reads the tdm-reservation and tdm-policy meta tags.
      #
      # Tag names are compared after trimming and lower-casing. For each name the
      # first tag that carries a content value wins.
      #
      # @param meta_tags [Array<Hash>] attribute hashes with "name" and "content"
      # @param path [String] path recorded on the resulting signal
      # @return [Array(Array<Hash>, Array)] signals and policy references
      def extract_tdm_meta_signals(meta_tags, path:)
        wanted = %w[tdm-reservation tdm-policy]
        found = meta_tags.each_with_object({}) do |attributes, values|
          tag_name = attributes["name"].to_s.strip.downcase
          next unless wanted.include?(tag_name)

          values[tag_name] ||= attributes["content"]
        end

        extract_tdm_value_signals(
          reservation: found["tdm-reservation"],
          policy_url: found["tdm-policy"],
          path: path
        )
      end

      # Builds the signal for a reservation value.
      #
      # "1" produces a disallow signal (with the policy URL as a condition when
      # one is present) plus a policy reference; "0" produces an allow signal and
      # no policy reference; any other value produces nothing.
      #
      # @param reservation [#to_s] reservation value
      # @param policy_url [#to_s] policy URL, may be blank
      # @param path [String] path recorded on the signal
      # @return [Array(Array<Hash>, Array)] signals and de-duplicated policy references
      def extract_tdm_value_signals(reservation:, policy_url:, path:)
        flag = reservation.to_s.strip
        return [[], []] unless %w[0 1].include?(flag)

        reserved = flag == "1"
        policy = policy_url.to_s.strip
        conditions = reserved && !policy.empty? ? { "policy" => policy } : {}
        verb = reserved ? "disallow" : "allow"

        signal = build_signal(verb, "text-and-data-mining", path: path, conditions: conditions)
        refs = reserved ? [policy_ref(policy, path)] : []
        [[signal], dedupe_policy_refs(refs)]
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Regulatory
    # Expands TDM policy references into signals by reading the policy documents.
    module TdmPolicy
      # Loads every distinct referenced policy and returns its signals.
      #
      # Each returned signal is a copy of the policy's template signal. It keeps
      # the template's own path when it has one and otherwise takes the path of
      # the reference; the policy URL is always recorded under conditions["policy"].
      #
      # @param policy_refs [Array<Hash>] references with "url" and "path"
      # @return [Array<Hash>]
      def expanded_tdm_policy_signals(policy_refs)
        dedupe_policy_refs(policy_refs).flat_map do |reference|
          url = reference["url"]

          Array(tdm_policy_record(url)["signals"]).map do |template|
            deep_copy(template).tap do |signal|
              signal["path"] ||= reference["path"]
              signal["conditions"] = (signal["conditions"] || {}).merge("policy" => url)
            end
          end
        end
      end

      # Returns the record for one policy URL through +cache_fetch+. Signals are
      # extracted only from a 2xx response that +json_like_response?+ accepts;
      # otherwise the record holds no signals.
      #
      # @param url [String] policy document URL
      # @return [Hash] { "signals" => Array<Hash> }
      def tdm_policy_record(url)
        cache_fetch("tdmpolicy:#{url}") do
          response = safe_get(url)
          usable = response&.status&.between?(200, 299) &&
                   json_like_response?(response.headers, response.body)
          signals = usable ? extract_tdm_policy_signals(response.body) : []

          { "signals" => sort_specificity_signals(signals) }
        end
      end

      # Parses a policy document and builds one allow signal for every entry of
      # its "permission" list that is a Hash and whose action
      # +tdm_policy_action?+ accepts. Duties and purpose become conditions; a
      # target path, when +policy_target_path+ yields one, becomes the path.
      #
      # @param body [#to_s] JSON text
      # @return [Array<Hash>] empty when the body is not valid JSON
      def extract_tdm_policy_signals(body)
        document = JSON.parse(body.to_s)

        array_value(document, "permission").filter_map do |permission|
          next unless permission.is_a?(Hash) && tdm_policy_action?(permission["action"])

          conditions = {}
          duties = array_value(permission, "duty").filter_map { |item| duty_name(item) }
          conditions["duty"] = duties unless duties.empty?
          purpose = permission_purpose(permission)
          conditions["purpose"] = purpose if purpose

          build_signal("allow", "text-and-data-mining", conditions: conditions).tap do |signal|
            target_path = policy_target_path(permission["target"])
            signal["path"] = target_path if target_path
          end
        end
      rescue JSON::ParserError
        []
      end
    end
  end
end