fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "cgi"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module FetchUtil
|
|
7
|
+
class Searcher
|
|
8
|
+
# Upper bound, in characters, passed to `truncate` when a snippet is normalized.
MAX_SNIPPET_LENGTH = 180

# Search-page URL templates keyed by source name. `search_urls` fills the
# `%<query>s` placeholder with the CGI-escaped query.
SOURCES = {
  "duckduckgo" => "https://duckduckgo.com/?q=%<query>s&ia=web&kl=us-en",
  "google"     => "https://www.google.com/search?hl=en&q=%<query>s",
  "bing"       => "https://www.bing.com/search?setlang=en-US&q=%<query>s",
  "ecosia"     => "https://www.ecosia.org/search?q=%<query>s",
  "brave"      => "https://search.brave.com/search?q=%<query>s"
}.freeze

# Sources used when the caller passes no `sources:` to the constructor.
DEFAULT_SOURCES = %w[duckduckgo google].freeze
|
|
19
|
+
|
|
20
|
+
# ResultFiltering is registered for autoload, mixed into this class (the
# `include` is what triggers the load), and then hidden from outside callers.
# NOTE(review): presumably it supplies `search_engine_self_link?` and
# `low_value_result?`, which this class calls but does not define here — confirm.
autoload :ResultFiltering, "fetch_util/searcher/result_filtering"
include ResultFiltering
private_constant :ResultFiltering
|
|
23
|
+
|
|
24
|
+
# Builds a searcher.
#
# @param fetcher [#fetch, nil] object used to fetch the search pages; when nil
#   a ParallelFetcher is built from `concurrency`, `request_log` and `fetch_options`
# @param request_log [#append] receives one entry per call to #search
# @param sources [Array, #to_s, nil] source names; nil selects DEFAULT_SOURCES
# @param limit [#to_i] maximum number of results returned by #search
# @param concurrency [Integer] forwarded to ParallelFetcher when one is built
# @param verbose [Boolean] when truthy, results also carry :sources and :ranks
# @param fetch_options [Hash] extra keywords forwarded to ParallelFetcher
def initialize(fetcher: nil, request_log: RequestLog.new, sources: nil, limit: 10, concurrency: 2, verbose: false, **fetch_options)
  @verbose = verbose
  @limit = limit.to_i
  @sources = Array(sources || DEFAULT_SOURCES).map(&:to_s)
  @request_log = request_log
  @fetcher = fetcher
  @fetcher ||= ParallelFetcher.new(concurrency: concurrency, request_log: request_log, **fetch_options)
end
|
|
31
|
+
|
|
32
|
+
# Runs the query against every configured source and merges the results.
#
# @param query [#to_s] search terms; surrounding whitespace is stripped
# @return [Hash] `{ query: String, results: Array<Hash> }`, capped at `limit` results
# @raise [ArgumentError] when the stripped query is empty
# @raise [ParallelFetcher::ParallelFetchError] re-raised when the failed fetch
#   carries no usable partial results
def search(query)
  cleaned = query.to_s.strip
  raise ArgumentError, "query must not be empty" if cleaned.empty?

  urls_by_source = search_urls(cleaned)
  @request_log.append(search_request_uri(cleaned))

  pages =
    begin
      @fetcher.fetch(urls_by_source.values)
    rescue ParallelFetcher::ParallelFetchError => e
      # Keep going with whatever was fetched, as long as something was.
      raise unless e.results&.compact&.any?

      e.results
    end

  top_results = aggregate(urls_by_source.keys, pages).first(limit)
  { query: cleaned, results: formatted_results(top_results) }
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
attr_reader :limit
|
|
55
|
+
|
|
56
|
+
# Maps each configured source name to its search URL for the given query.
#
# @param query [String] raw (unescaped) query text
# @return [Hash{String => String}] source name => URL, in `@sources` order
# @raise [ArgumentError] when a configured source has no template in SOURCES
def search_urls(query)
  @sources.each_with_object({}) do |name, urls|
    template = SOURCES.fetch(name) { raise ArgumentError, "unsupported search source: #{name}" }
    urls[name] = format(template, query: CGI.escape(query))
  end
end
|
|
68
|
+
|
|
69
|
+
# Builds the pseudo-URI recorded in the request log for one search.
#
# @param query [String] raw query text
# @return [String] `search://<comma-joined sources>?q=<escaped query>`
def search_request_uri(query)
  source_list = @sources.join(",")
  escaped_query = CGI.escape(query)
  "search://#{source_list}?q=#{escaped_query}"
end
|
|
72
|
+
|
|
73
|
+
# Merges per-source result lists into one de-duplicated list.
#
# Results are interleaved rank by rank (every source's first hit, then every
# source's second hit, ...). A URL seen again is merged into the entry that
# first introduced it instead of being appended a second time.
#
# @param sources [Array<String>] source names, parallel to `fetched`
# @param fetched [Array<#markdown, nil>] fetched pages; a nil slot (for
#   example from a partially failed fetch) is treated as a page with no results
# @return [Array<Hash>] merged results in first-seen order
def aggregate(sources, fetched)
  parsed = sources.zip(fetched).to_h do |source, page|
    # `page` can be nil when only part of the fetch succeeded; parse_markdown
    # accepts nil, so a missing page contributes nothing instead of raising.
    [source, parse_markdown(page&.markdown)]
  end
  deepest = parsed.each_value.map(&:length).max || 0

  # Hashes keep insertion order, so the values double as the ordered output.
  by_url = {}

  deepest.times do |index|
    sources.each do |source|
      item = parsed.fetch(source)[index]
      next unless item

      ranked = item.merge(rank: index + 1)
      existing = by_url[ranked[:url]]

      if existing
        merge_result!(existing, source, ranked)
      else
        by_url[ranked[:url]] = build_result(source, ranked)
      end
    end
  end

  by_url.values
end
|
|
107
|
+
|
|
108
|
+
# Creates the aggregate entry for a URL the first time it is seen.
#
# @param source [String] name of the source that produced the item
# @param item [Hash] parsed item with :title, :url, :rank and optional :snippet
# @return [Hash] entry with :title, :url, :sources, :ranks and, when present, :snippet
def build_result(source, item)
  entry = {
    title: item[:title],
    url: item[:url],
    sources: [source],
    ranks: { source => item[:rank] }
  }
  snippet = item[:snippet]
  entry[:snippet] = snippet if snippet
  entry
end
|
|
118
|
+
|
|
119
|
+
# Folds a repeated hit for an already-known URL into its existing entry.
#
# Adds the source if it is new, records the source's rank unless one is
# already stored, and keeps whichever snippet is longer (ties keep the
# existing one).
#
# @param result [Hash] existing aggregate entry; mutated in place
# @param source [String] source that produced `item`
# @param item [Hash] parsed item with :rank and optional :snippet
def merge_result!(result, source, item)
  known_sources = result[:sources]
  known_sources << source unless known_sources.include?(source)
  result[:ranks][source] ||= item[:rank]

  candidate = item[:snippet]
  return unless candidate

  current = result[:snippet]
  return if current && current.length >= candidate.length

  result[:snippet] = candidate
end
|
|
126
|
+
|
|
127
|
+
# Converts aggregate entries into the hashes returned to the caller.
#
# @param results [Array<Hash>] aggregate entries
# @return [Array<Hash>] hashes with :title and :url, :snippet when present,
#   and :sources / :ranks only in verbose mode
def formatted_results(results)
  results.map do |result|
    entry = { title: result[:title], url: result[:url] }
    entry[:snippet] = result[:snippet] if result[:snippet]
    entry.merge!(sources: result[:sources], ranks: result[:ranks]) if verbose?
    entry
  end
end
|
|
141
|
+
|
|
142
|
+
# Extracts normalized result items from a page's markdown, one line at a time.
#
# @param markdown [#to_s, nil] fetched page markdown
# @return [Array<Hash>] items for lines that parse as a result link and
#   survive normalization; every other line is dropped
def parse_markdown(markdown)
  markdown.to_s.each_line.filter_map do |raw_line|
    fields = parse_markdown_line(raw_line)
    fields && normalized_item(fields[:title], fields[:url], fields[:snippet])
  end
end
|
|
150
|
+
|
|
151
|
+
# Parses one markdown bullet of the form `- [title](url) - snippet`.
#
# @param line [#to_s] a single line of markdown
# @return [Hash, nil] `{ title:, url:, snippet: }` (snippet is nil when the
#   link is not followed by " - "), or nil when the line is not such a bullet
def parse_markdown_line(line)
  text = line.to_s.strip
  return nil unless text.start_with?("- [")

  title_close = text.index("](")
  return nil unless title_close

  link_open = title_close + 2
  link_close = markdown_url_end_index(text, link_open)
  return nil unless link_close

  tail = text[(link_close + 1)..].to_s

  {
    title: text[3...title_close], # skip the leading "- ["
    url: text[link_open...link_close],
    snippet: (tail[3..] if tail.start_with?(" - "))
  }
end
|
|
169
|
+
|
|
170
|
+
# Finds the `)` that closes a markdown link target, allowing balanced
# parentheses inside the URL itself.
#
# @param line [String] the full line being parsed
# @param url_start [Integer] index of the first character after `](`
# @return [Integer, nil] index of the closing parenthesis, or nil if none is found
def markdown_url_end_index(line, url_start)
  open_parens = 0

  (url_start...line.length).each do |pos|
    case line[pos]
    when "("
      open_parens += 1
    when ")"
      return pos if open_parens.zero?

      open_parens -= 1
    end
  end

  nil
end
|
|
186
|
+
|
|
187
|
+
# Normalizes one parsed result and rejects it if it is not worth returning.
#
# @param title [String, nil] raw link text
# @param url [String, nil] raw link target
# @param snippet [String, nil] raw trailing text
# @return [Hash, nil] `{ title:, url: }` plus :snippet when one survives, or
#   nil when the URL is unusable, the title is empty or generic, or one of
#   the filtering predicates rejects the result
def normalized_item(title, url, snippet)
  clean_url = normalize_url(url)
  return nil unless clean_url

  clean_title = normalize_title(title, clean_url)
  return nil if clean_title.empty?
  return nil if generic_title?(clean_title, clean_url)

  clean_snippet = normalize_snippet(snippet, clean_title, clean_url)
  rejected = search_engine_self_link?(clean_title, clean_url, clean_snippet) ||
             low_value_result?(clean_title, clean_url, clean_snippet)
  return nil if rejected

  item = { title: clean_title, url: clean_url }
  item[:snippet] = clean_snippet if clean_snippet
  item
end
|
|
206
|
+
|
|
207
|
+
# @return [Object] the `verbose:` value given to the constructor
def verbose? = @verbose
|
|
210
|
+
|
|
211
|
+
# Shorthand for FetchUtil.normalize_whitespace.
#
# @param value [Object] anything; passed through unchanged
# @return whatever FetchUtil.normalize_whitespace returns for `value`
def compact_text(value) = FetchUtil.normalize_whitespace(value)
|
|
214
|
+
|
|
215
|
+
# Canonicalizes a result URL so the same page from different sources compares equal.
#
# Lower-cases the host, gives an empty path a "/", drops one trailing slash
# from any other path, and removes the fragment unless `keep_fragment?` says
# it is meaningful.
#
# @param url [#to_s] raw URL
# @return [String, nil] the canonical URL, or nil when the input is not a
#   parseable http(s) URL with a host
def normalize_url(url)
  uri = URI.parse(url.to_s.strip)
  return nil unless uri.is_a?(URI::HTTP) && uri.host

  uri.host = uri.host.downcase

  current_path = uri.path.to_s
  if current_path.empty?
    uri.path = "/"
  elsif current_path != "/"
    uri.path = current_path.delete_suffix("/")
  end

  uri.fragment = nil unless keep_fragment?(uri)
  uri.to_s
rescue URI::InvalidURIError
  nil
end
|
|
227
|
+
|
|
228
|
+
# Returns the result of FetchUtil.strip_www_host for a URL, tolerating bad input.
#
# @param url [#to_s] URL to inspect
# @return [String, nil] the value from FetchUtil.strip_www_host, or nil when
#   the URL cannot be parsed
def host_for(url)
  host = FetchUtil.strip_www_host(url)
  host
rescue URI::InvalidURIError
  nil
end
|
|
233
|
+
|
|
234
|
+
# Returns the path component of a URL.
#
# @param url [String] URL to inspect
# @return [String] the path, or "" when there is none or the URL cannot be parsed
def path_for(url)
  parsed = URI.parse(url)
  parsed.path.to_s
rescue URI::InvalidURIError
  ""
end
|
|
239
|
+
|
|
240
|
+
# Tells whether a title carries no information beyond the site itself.
#
# @param title [String] normalized title
# @param url [String] normalized URL of the result
# @return [Boolean] true for "More on ..." titles and for titles equal
#   (ignoring case) to the URL's host
def generic_title?(title, url)
  return true if title.start_with?("More on ")

  site_host = host_for(url).to_s
  !site_host.empty? && title.casecmp?(site_host)
end
|
|
248
|
+
|
|
249
|
+
# Cleans a raw result title.
#
# Collapses whitespace, removes a leading copy of the site's host (only when
# at least 8 characters remain), removes a leading breadcrumb such as
# "site › section › ", then strips a slug-like prefix.
#
# @param title [String, nil] raw link text
# @param url [String] normalized URL of the result
# @return [String] cleaned title, possibly empty
def normalize_title(title, url)
  cleaned = compact_text(title)
  site_host = host_for(url)

  unless site_host.nil? || site_host.empty?
    without_host = cleaned.sub(/\A#{Regexp.escape(site_host)}\s+/i, "")
    # A very short remainder is more likely a fragment than a real title.
    cleaned = without_host if without_host.length >= 8
  end

  without_breadcrumb = cleaned.sub(/\A(?:[[:alnum:].-]+\s+[>›]\s+)+/, "")
  compact_text(strip_slug_prefix(without_breadcrumb))
end
|
|
262
|
+
|
|
263
|
+
# Decides whether a URL's fragment should survive normalization.
#
# @param uri [URI::Generic] parsed URL
# @return [Boolean] false for a missing, blank or navigational fragment;
#   otherwise whatever FetchUtil.docs_like_url? returns for the URL
def keep_fragment?(uri)
  anchor = compact_text(uri.fragment)
  return false if anchor.empty? || noise_fragment?(anchor)

  FetchUtil.docs_like_url?(uri)
end
|
|
270
|
+
|
|
271
|
+
# Tells whether a fragment is a page-navigation anchor ("top", "main-content",
# "skip-to-content", "toc", ...) rather than a pointer to specific content.
#
# @param fragment [String] fragment text without the leading "#"
# @return [Boolean]
def noise_fragment?(fragment)
  navigation_anchor = /\A(?:top|contents?|content|main|main-content|skip(?:-to)?-(?:content|main)|toc)\z/i
  fragment.match?(navigation_anchor)
end
|
|
274
|
+
|
|
275
|
+
# Removes a leading lower-case slug or section word that precedes the real,
# capitalized title (for example "getting-started Install Guide" or
# "docs Getting Started").
#
# A prefix is removed when it is one of the known section keywords, or when
# it is at least four characters long and contains a dash or digit. The
# length rule used to apply to keywords as well, which made the short ones
# ("api", "doc", "kb") impossible to match; keywords are now checked on their own.
#
# @param text [String] title text
# @return [String] the text without the prefix, or the text unchanged
def strip_slug_prefix(text)
  match = text.match(/\A([a-z0-9-]+)\s+(?=[A-Z])/)
  return text unless match

  prefix = match[1]
  section_keyword = %w[api blog doc docs guide guides help kb learn manual reference tutorial wiki].include?(prefix)
  slug_like = prefix.length >= 4 && prefix.match?(/[-\d]/)
  return text unless section_keyword || slug_like

  # Everything after the prefix and the whitespace that follows it.
  match.post_match
end
|
|
284
|
+
|
|
285
|
+
# Cleans a raw snippet and discards it when nothing informative is left.
#
# Removes a leading copy of the title, inline URLs and a leading breadcrumb,
# then rejects text that is empty, equal to the title, a bare domain, a
# breadcrumb, site metadata, or jammed-together navigation labels.
#
# @param snippet [String, nil] raw trailing text from the result line
# @param title [String] normalized title of the result
# @param url [String] normalized URL of the result
# @return [String, nil] snippet truncated to MAX_SNIPPET_LENGTH, or nil when rejected
def normalize_snippet(snippet, title, url)
  body = compact_text(snippet)
         .sub(/\A#{Regexp.escape(title)}\s*/i, "")
         .gsub(%r{https?://\S+}, " ")
         .sub(/\A[[:word:].-]+\s*(?:[>›]\s*[[:word:]_.()%-]+\s*)+/, "")
  body = compact_text(body)
  return nil if body.empty? || body.casecmp?(title)

  rejected = domain_only?(body, host_for(url)) ||
             breadcrumb_text?(body) ||
             metadata_only_snippet?(body) ||
             jammed_navigation_text?(body)
  return nil if rejected

  truncate(body, MAX_SNIPPET_LENGTH)
end
|
|
301
|
+
|
|
302
|
+
# Shortens text to at most `max_length` characters, marking the cut with "...".
#
# @param text [String] text to shorten
# @param max_length [Integer] maximum length of the result, ellipsis included
# @return [String] `text` itself when it already fits; otherwise the leading
#   part (trailing whitespace removed) followed by "...". When `max_length`
#   is below 3 there is no room for the ellipsis, so the text is simply cut
#   (a non-positive limit yields "").
def truncate(text, max_length)
  return text if text.length <= max_length
  # text[0, negative] is nil, which used to crash on the rstrip call below.
  return text[0, [max_length, 0].max] if max_length < 3

  "#{text[0, max_length - 3].rstrip}..."
end
|
|
307
|
+
|
|
308
|
+
# Tells whether a snippet is nothing but a domain name.
#
# @param text [String] normalized snippet text
# @param host [String, nil] host of the result URL
# @return [Boolean] true when the text looks like a bare domain or equals
#   the given host (ignoring case)
def domain_only?(text, host)
  looks_like_domain = text.match?(/\A[a-z0-9.-]+\.[a-z]{2,}\z/i)
  return true if looks_like_domain
  return false if host.nil? || host.empty?

  text.casecmp?(host)
end
|
|
314
|
+
|
|
315
|
+
# Detects menu labels that were concatenated without spaces
# (for example "HomeAboutContactProducts").
#
# @param text [String] normalized snippet text
# @return [Boolean] true for text longer than 20 characters that contains no
#   space and at least three ASCII capital letters
def jammed_navigation_text?(text)
  return false if text.length <= 20
  return false if text.include?(" ")

  text.count("A-Z") >= 3
end
|
|
318
|
+
|
|
319
|
+
# Tells whether text still contains a breadcrumb separator.
#
# @param text [String] normalized snippet text
# @return [Boolean] true when the text has a "›", or a ">" that starts the
#   text, follows whitespace, or is followed by whitespace
def breadcrumb_text?(text)
  return true if text.include?("›")

  text.match?(/(?:\A|\s)>|>\s/)
end
|
|
322
|
+
|
|
323
|
+
# Detects snippets that are only site metadata, such as
# "Reddit20+ comments3 years ago".
#
# @param text [String] normalized snippet text
# @return [Boolean] true when the text starts with a known site name
#   (Reddit, Stack Overflow, Medium) and also contains a comment/answer/like
#   counter or an "N units ago" age
def metadata_only_snippet?(text)
  # Re-insert the spaces lost where letters and digits were glued together.
  spaced = text.gsub(/([[:alpha:]])(\d)/, '\1 \2').gsub(/(\d)([[:alpha:]])/, '\1 \2')
  return false unless spaced.match?(/\A(?:Reddit|Stack Overflow|Medium)\b/i)

  spaced.match?(/\b\d+\+?\s*(?:comments?|answers?|likes?)\b/i) ||
    spaced.match?(/\b\d+\s*(?:years?|months?|days?|hours?|minutes?)\s+ago\b/i)
end
|
|
331
|
+
end
|
|
332
|
+
end
|
data/lib/fetch_util.rb
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
require_relative "fetch_util/version"
|
|
6
|
+
|
|
7
|
+
module FetchUtil
|
|
8
|
+
# Root of the library's error hierarchy; the two classes below inherit from it.
class Error < StandardError
end

# NOTE(review): presumably raised for browser failures — confirm against the raisers.
class BrowserError < Error
end

# NOTE(review): presumably raised when content extraction fails — confirm against the raisers.
class ExtractionError < Error
end
|
|
11
|
+
|
|
12
|
+
# Hosts that `docs_like_url?` accepts outright. It compares them against the
# host after a leading "www." has been removed.
DOCS_LIKE_EXACT_HOSTS = %w[
  developer.mozilla.org
  doc.rust-lang.org
  docs.rs
  fastapi.tiangolo.com
  learn.microsoft.com
  ncbi.nlm.nih.gov
  nextjs.org
  pkg.go.dev
  platform.claude.com
  react.dev
  rubydoc.info
  rubyapi.org
].freeze
|
|
26
|
+
# Path terms that make `docs_like_url?` return true. It splits the lower-cased
# path on every non-alphanumeric character and looks for any of these words.
DOCS_LIKE_PATH_KEYWORDS = %w[
  api
  book
  books
  concept
  concepts
  definition
  definitions
  dictionary
  doc
  docs
  guide
  guides
  howto
  library
  libraries
  manual
  reference
  sdk
  tutorial
].freeze
|
|
47
|
+
# Matches a path containing a documentation-style segment: a "/" followed by
# one of the listed words, which must itself end at a "/" or a word boundary.
# Written in extended mode, so the whitespace inside the pattern is ignored.
DOCS_LIKE_PATH_PATTERN = %r{/
  (?:
    docs?|reference|api(?:/reference)?|tutorial|guide|guides|library|libraries|
    book|books|dictionary|definition|definitions|concept|concepts|
    get(?:ting)?-started|quick-start|how-to|howto|manual|sdk|learn
  )
  (?:/|\b)
}x
|
|
55
|
+
|
|
56
|
+
# Components are loaded on first reference instead of at require time.
{
  Browser: "fetch_util/browser",
  CLI: "fetch_util/cli",
  Extractor: "fetch_util/extractor",
  Fetcher: "fetch_util/fetcher",
  ParallelFetcher: "fetch_util/parallel_fetcher",
  Regulatory: "fetch_util/regulatory",
  RawDocsFallback: "fetch_util/raw_docs_fallback",
  RequestLog: "fetch_util/request_log",
  Result: "fetch_util/result",
  Searcher: "fetch_util/searcher"
}.each { |const_name, path| autoload const_name, path }
|
|
66
|
+
|
|
67
|
+
module_function
|
|
68
|
+
|
|
69
|
+
# Fetches a single URL with a one-off Fetcher.
#
# @param url [Object] passed to Fetcher#fetch
# @param options [Hash] keywords forwarded to Fetcher.new
# @return whatever Fetcher#fetch returns
def fetch(url, **options)
  fetcher = Fetcher.new(**options)
  fetcher.fetch(url)
end
|
|
72
|
+
|
|
73
|
+
# Fetches several URLs with a one-off ParallelFetcher.
#
# @param urls [Object] passed to ParallelFetcher#fetch
# @param options [Hash] keywords forwarded to ParallelFetcher.new
# @return whatever ParallelFetcher#fetch returns
def fetch_many(urls, **options)
  fetcher = ParallelFetcher.new(**options)
  fetcher.fetch(urls)
end
|
|
76
|
+
|
|
77
|
+
# Runs a search with a one-off Searcher.
#
# @param query [Object] passed to Searcher#search
# @param options [Hash] keywords forwarded to Searcher.new
# @return whatever Searcher#search returns
def search(query, **options)
  searcher = Searcher.new(**options)
  searcher.search(query)
end
|
|
80
|
+
|
|
81
|
+
# Invokes a one-off Regulatory instance on a URL.
#
# @param url [Object] passed to Regulatory#call
# @param options [Hash] keywords forwarded to Regulatory.new
# @return whatever Regulatory#call returns
def regulatory(url, **options)
  checker = Regulatory.new(**options)
  checker.call(url)
end
|
|
84
|
+
|
|
85
|
+
# Converts a value to a single-spaced, trimmed UTF-8 string.
#
# Text that is not already valid UTF-8 is re-encoded with undecodable bytes
# replaced by spaces. Non-breaking spaces become ordinary spaces, every run
# of whitespace collapses to one space, and the ends are stripped.
#
# @param value [#to_s] any value; nil becomes ""
# @return [String]
def normalize_whitespace(value)
  text = value.to_s
  clean_utf8 = text.encoding == Encoding::UTF_8 && text.valid_encoding?
  text = text.encode("UTF-8", invalid: :replace, undef: :replace, replace: " ") unless clean_utf8
  text.tr("\u00A0", " ").gsub(/\s+/, " ").strip
end
|
|
90
|
+
|
|
91
|
+
# Returns a URL's host in lower case without a leading "www.".
#
# @param url [#to_s] URL string or URI object
# @return [String] the host, or "" when the URL has none
# @raise [URI::InvalidURIError] when the URL cannot be parsed
def strip_www_host(url)
  host = URI.parse(url.to_s).host.to_s.downcase
  host.delete_prefix("www.")
end
|
|
94
|
+
|
|
95
|
+
# Heuristically decides whether a URL points at documentation or reference material.
#
# Only http(s) URLs with a host qualify. A URL is accepted when its host is
# a known docs host, a readthedocs.io subdomain, starts with a docs-style
# label ("docs.", "developer.", "developers.", "api."), or names a dictionary
# site; when it is a go.dev "/ref" page; or when its path contains a
# documentation-style segment or keyword.
#
# @param value [URI::Generic, #to_s] URL to classify
# @return [Boolean] false for non-matching, non-http(s) or unparseable input
def docs_like_url?(value)
  uri = value.is_a?(URI::Generic) ? value : URI.parse(value.to_s.strip)
  return false unless uri.is_a?(URI::HTTP) && uri.host

  host = strip_www_host(uri)
  path = uri.path.to_s.downcase

  return true if DOCS_LIKE_EXACT_HOSTS.include?(host) || host.end_with?(".readthedocs.io")
  return true if host.start_with?("docs.", "developer.", "developers.", "api.")
  return true if host.match?(/\b(?:dictionary|merriam-webster|thefreedictionary|wiktionary|collinsdictionary|reverso)\b/)
  return true if host == "go.dev" && path.match?(%r{\A/ref(?:/|\b)})
  return true if path.match?(DOCS_LIKE_PATH_PATTERN)

  path_terms = path.split(/[^a-z0-9]+/)
  (path_terms & DOCS_LIKE_PATH_KEYWORDS).any?
rescue URI::InvalidURIError
  false
end
|
|
115
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: fetch_util
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.3.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- hmdne
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: ferrum
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0.17'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0.17'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: nokogiri
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '1.19'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '1.19'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: thor
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '1.3'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '1.3'
|
|
54
|
+
description: An intelligent web-fetch engine for Ruby that renders live pages, recognizes
|
|
55
|
+
what they are, and turns them into clean, usable markdown.
|
|
56
|
+
email:
|
|
57
|
+
- 54514036+hmdne@users.noreply.github.com
|
|
58
|
+
executables:
|
|
59
|
+
- fetch_util
|
|
60
|
+
extensions: []
|
|
61
|
+
extra_rdoc_files: []
|
|
62
|
+
files:
|
|
63
|
+
- ".rspec"
|
|
64
|
+
- ".rubocop.yml"
|
|
65
|
+
- CHANGELOG.md
|
|
66
|
+
- LICENSE.txt
|
|
67
|
+
- README.md
|
|
68
|
+
- Rakefile
|
|
69
|
+
- SKILL.md
|
|
70
|
+
- exe/fetch_util
|
|
71
|
+
- lib/fetch_util.rb
|
|
72
|
+
- lib/fetch_util/assets/extract.js
|
|
73
|
+
- lib/fetch_util/assets/vendor/readability.js
|
|
74
|
+
- lib/fetch_util/assets/vendor/turndown.js
|
|
75
|
+
- lib/fetch_util/browser.rb
|
|
76
|
+
- lib/fetch_util/browser/interaction_helpers.rb
|
|
77
|
+
- lib/fetch_util/browser/interaction_helpers/consent_helpers.rb
|
|
78
|
+
- lib/fetch_util/browser/interaction_helpers/dom_interaction.rb
|
|
79
|
+
- lib/fetch_util/browser/interaction_helpers/timing_helpers.rb
|
|
80
|
+
- lib/fetch_util/browser/navigation.rb
|
|
81
|
+
- lib/fetch_util/browser/navigation/headers_and_readiness.rb
|
|
82
|
+
- lib/fetch_util/browser/navigation/navigator_patch.rb
|
|
83
|
+
- lib/fetch_util/browser/site_stabilization.rb
|
|
84
|
+
- lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb
|
|
85
|
+
- lib/fetch_util/browser/site_stabilization/social_platforms.rb
|
|
86
|
+
- lib/fetch_util/browser/stabilization.rb
|
|
87
|
+
- lib/fetch_util/browser/stabilization/page_flow.rb
|
|
88
|
+
- lib/fetch_util/browser/stabilization/spa_hydration.rb
|
|
89
|
+
- lib/fetch_util/cli.rb
|
|
90
|
+
- lib/fetch_util/extractor.rb
|
|
91
|
+
- lib/fetch_util/fetcher.rb
|
|
92
|
+
- lib/fetch_util/parallel_fetcher.rb
|
|
93
|
+
- lib/fetch_util/raw_docs_fallback.rb
|
|
94
|
+
- lib/fetch_util/regulatory.rb
|
|
95
|
+
- lib/fetch_util/regulatory/cache_store.rb
|
|
96
|
+
- lib/fetch_util/regulatory/directives.rb
|
|
97
|
+
- lib/fetch_util/regulatory/fetch_records.rb
|
|
98
|
+
- lib/fetch_util/regulatory/headers.rb
|
|
99
|
+
- lib/fetch_util/regulatory/http_client.rb
|
|
100
|
+
- lib/fetch_util/regulatory/human.rb
|
|
101
|
+
- lib/fetch_util/regulatory/orchestration.rb
|
|
102
|
+
- lib/fetch_util/regulatory/page.rb
|
|
103
|
+
- lib/fetch_util/regulatory/robot_globs.rb
|
|
104
|
+
- lib/fetch_util/regulatory/robots.rb
|
|
105
|
+
- lib/fetch_util/regulatory/signals.rb
|
|
106
|
+
- lib/fetch_util/regulatory/source_selection.rb
|
|
107
|
+
- lib/fetch_util/regulatory/tdm_page.rb
|
|
108
|
+
- lib/fetch_util/regulatory/tdm_policy.rb
|
|
109
|
+
- lib/fetch_util/regulatory/tdm_rep.rb
|
|
110
|
+
- lib/fetch_util/regulatory/tdm_support.rb
|
|
111
|
+
- lib/fetch_util/regulatory/trust_txt.rb
|
|
112
|
+
- lib/fetch_util/regulatory/usage_preferences.rb
|
|
113
|
+
- lib/fetch_util/request_log.rb
|
|
114
|
+
- lib/fetch_util/result.rb
|
|
115
|
+
- lib/fetch_util/searcher.rb
|
|
116
|
+
- lib/fetch_util/searcher/result_filtering.rb
|
|
117
|
+
- lib/fetch_util/version.rb
|
|
118
|
+
homepage: https://github.com/rbutils/fetch_util
|
|
119
|
+
licenses:
|
|
120
|
+
- MIT
|
|
121
|
+
metadata:
|
|
122
|
+
allowed_push_host: https://rubygems.org
|
|
123
|
+
source_code_uri: https://github.com/rbutils/fetch_util
|
|
124
|
+
changelog_uri: https://github.com/rbutils/fetch_util/blob/master/CHANGELOG.md
|
|
125
|
+
documentation_uri: https://github.com/rbutils/fetch_util#readme
|
|
126
|
+
bug_tracker_uri: https://github.com/rbutils/fetch_util/issues
|
|
127
|
+
rubygems_mfa_required: 'true'
|
|
128
|
+
rdoc_options: []
|
|
129
|
+
require_paths:
|
|
130
|
+
- lib
|
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
132
|
+
requirements:
|
|
133
|
+
- - ">="
|
|
134
|
+
- !ruby/object:Gem::Version
|
|
135
|
+
version: 3.2.0
|
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
137
|
+
requirements:
|
|
138
|
+
- - ">="
|
|
139
|
+
- !ruby/object:Gem::Version
|
|
140
|
+
version: '0'
|
|
141
|
+
requirements: []
|
|
142
|
+
rubygems_version: 4.0.10
|
|
143
|
+
specification_version: 4
|
|
144
|
+
summary: AI for fetching in Ruby
|
|
145
|
+
test_files: []
|