fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module SiteStabilization
|
|
6
|
+
module CommunityAndMarketplace
|
|
7
|
+
COOKIE_BUTTON_SELECTORS = 'button, [role="button"], a, input[type="button"], input[type="submit"]'
|
|
8
|
+
EBAY_COOKIE_ACCEPT_LABELS = [
|
|
9
|
+
"accept all",
|
|
10
|
+
"accept all cookies",
|
|
11
|
+
"accept cookies",
|
|
12
|
+
"allow all",
|
|
13
|
+
"allow cookies",
|
|
14
|
+
"agree to cookies",
|
|
15
|
+
"continue with cookies"
|
|
16
|
+
].freeze
|
|
17
|
+
private_constant :COOKIE_BUTTON_SELECTORS, :EBAY_COOKIE_ACCEPT_LABELS
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# True when +url+ belongs to reddit.com or one of its subdomains.
# The host comparison itself is delegated to +host_matches?+.
def reddit_url?(url)
  host_matches?(url, "reddit.com")
end
|
|
24
|
+
|
|
25
|
+
# Reddit-specific stabilization: keep dismissing the cookie dialog while
# polling for post/comment markup, then pause briefly and sweep the dialog
# once more in case it was re-rendered after the content appeared.
def stabilize_reddit(page)
  poll_budget = capped_timeout(3.0)

  retry_until_timeout(poll_budget, interval: 0.1) do
    dismiss_reddit_cookie_dialog(page)
    # The block's value decides whether polling stops.
    reddit_content_ready?(page)
  end

  settle_after_stabilization(0.25)
  dismiss_reddit_cookie_dialog(page)
end
|
|
34
|
+
|
|
35
|
+
# True when +url+ points at an eBay search results page: the host must be
# ebay.com (or a subdomain) and either the path contains "/sch/" or the
# query string carries the "_nkw=" keyword parameter.
# Unparseable URLs are treated as "not an eBay search".
def ebay_search_url?(url)
  parsed = URI.parse(url)
  hostname = parsed.host.to_s.downcase
  on_ebay = hostname == "ebay.com" || hostname.end_with?(".ebay.com")
  return false unless on_ebay

  return true if parsed.path.include?("/sch/")

  parsed.query.to_s.include?("_nkw=")
rescue URI::InvalidURIError
  false
end
|
|
44
|
+
|
|
45
|
+
# eBay search stabilization: poll until at least four result links are
# present, clicking a cookie-accept button at most once along the way.
#
# The polling block yields one of three values to +retry_until_timeout+:
# +true+ once enough items are listed, +0.35+ while an anti-bot interstitial
# is showing, and +false+ otherwise.
# NOTE(review): how the numeric value is interpreted is defined by
# +retry_until_timeout+, which is not visible here — presumably a custom
# delay before the next attempt; confirm.
def stabilize_ebay_search(page)
  consent_clicked = false

  retry_until_timeout(capped_timeout(6.0), interval: 0.15) do
    # Stop trying to click once a consent button has been hit.
    consent_clicked ||= click_visible_button_by_text(
      page,
      EBAY_COOKIE_ACCEPT_LABELS,
      selectors: COOKIE_BUTTON_SELECTORS
    )

    snapshot = safe_evaluate(page, <<~JS, default: { "itemCount" => 0, "challengeVisible" => false })
      (() => {
        const bodyText = document.body ? document.body.innerText : '';
        return {
          itemCount: document.querySelectorAll('li.s-item a[href*="/itm/"], ul.srp-results a[href*="/itm/"]').length,
          challengeVisible: /checking your browser before you access ebay|your browser will redirect to your requested content shortly|pardon our interruption/i.test(bodyText)
        };
      })()
    JS

    if snapshot["itemCount"].to_i >= 4
      true
    elsif snapshot["challengeVisible"]
      0.35
    else
      false
    end
  end

  settle_after_stabilization(0.25) if consent_clicked
end
|
|
70
|
+
|
|
71
|
+
# Reports whether the page already contains Reddit post or comment markup.
#
# Any evaluation failure — a JavaScript error or a protocol timeout — is
# treated as "not ready yet" so the surrounding poll keeps going instead of
# aborting the fetch. This matches the other content probes in this class,
# which rescue both error types.
def reddit_content_ready?(page)
  page.evaluate(<<~JS)
    !!document.querySelector('shreddit-post, faceplate-screen-reader-content, shreddit-comment, [data-testid="comment"]')
  JS
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
|
|
78
|
+
|
|
79
|
+
# Removes Reddit's cookie / "before you continue" prompt.
#
# First asks the generic +dismiss_overlay_dialog+ helper to handle known
# dialog containers. If that reports nothing removed, falls back to an
# in-page script that deletes any short (< 2000 chars) container whose text
# matches the prompt wording and then calls +restoreScroll()+.
# NOTE(review): +restoreScroll+ is presumably supplied by +js_dom_helpers+,
# which is not visible here — confirm.
#
# Returns +true+ when the helper removed a dialog, otherwise whatever
# +safe_evaluate+ returns for the fallback script.
def dismiss_reddit_cookie_dialog(page)
  # Single source of truth for the prompt wording, shared by the Ruby-side
  # helper call and the JS fallback so the two cannot drift apart.
  prompt_pattern = "before you continue to reddit|let us know your cookie preferences"

  removed = dismiss_overlay_dialog(
    page,
    close_selectors: [],
    dialog_selectors: [
      '[data-testid="onboarding-modal"]',
      '[data-testid="gdpr-modal"]',
      '[aria-modal="true"]',
      '[role="dialog"]',
      "shreddit-experience-tree"
    ],
    dialog_pattern: prompt_pattern
  )
  return true if removed

  # The backslash in `\\s` is doubled on purpose: in an interpolating heredoc
  # Ruby turns a single `\s` into a literal space, which would leave the JS
  # regex collapsing spaces only and not newlines or tabs.
  safe_evaluate(page, <<~JS)
    (() => {
      #{js_dom_helpers}
      let removed = false;
      document.querySelectorAll('section, div, aside, form, footer, shreddit-experience-tree').forEach((node) => {
        const text = (node.innerText || node.textContent || '').replace(/\\s+/g, ' ').trim();
        if (/#{prompt_pattern}/i.test(text) && text.length < 2000) {
          node.remove();
          removed = true;
        }
      });

      if (removed) {
        restoreScroll();
      }

      return removed;
    })()
  JS
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module SiteStabilization
|
|
6
|
+
module SocialPlatforms
|
|
7
|
+
COOKIE_BUTTON_SELECTORS = 'button, [role="button"], a, input[type="button"], input[type="submit"]'
|
|
8
|
+
INSTAGRAM_COOKIE_ACCEPT_LABELS = ["accept", "accept all", "accept all cookies"].freeze
|
|
9
|
+
INSTAGRAM_COOKIE_FALLBACK_LABELS = ["allow all cookies", "allow all", "allow cookies"].freeze
|
|
10
|
+
FACEBOOK_COOKIE_DECLINE_LABELS = [
|
|
11
|
+
"decline optional cookies",
|
|
12
|
+
"optionale cookies ablehnen",
|
|
13
|
+
"refuser les cookies optionnels",
|
|
14
|
+
"rechazar cookies opcionales",
|
|
15
|
+
"rifiuta i cookie opzionali"
|
|
16
|
+
].freeze
|
|
17
|
+
FACEBOOK_COOKIE_ACCEPT_LABELS = [
|
|
18
|
+
"allow all cookies",
|
|
19
|
+
"alle cookies erlauben",
|
|
20
|
+
"autoriser tous les cookies",
|
|
21
|
+
"permitir todas las cookies",
|
|
22
|
+
"consenti tutti i cookie"
|
|
23
|
+
].freeze
|
|
24
|
+
private_constant :COOKIE_BUTTON_SELECTORS, :INSTAGRAM_COOKIE_ACCEPT_LABELS, :INSTAGRAM_COOKIE_FALLBACK_LABELS,
|
|
25
|
+
:FACEBOOK_COOKIE_DECLINE_LABELS, :FACEBOOK_COOKIE_ACCEPT_LABELS
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
# True when +url+ belongs to instagram.com or one of its subdomains.
# The host comparison itself is delegated to +host_matches?+.
def instagram_url?(url)
  host_matches?(url, "instagram.com")
end
|
|
32
|
+
|
|
33
|
+
# Instagram-specific stabilization. Sequence: optional idle wait, two cookie
# consent attempts separated by a pause, a timed retry loop that dismisses
# the login modal, another pause, and one final dismissal attempt.
def stabilize_instagram(page)
  # Try the Instagram-specific labels first, then the generic consent logic.
  accept_consent = -> { accept_instagram_cookie_dialog(page) || accept_cookie_consent(page) }

  wait_for_idle_or_content(page) if @wait_for_idle

  accept_consent.call
  social_login_phase_pause
  accept_consent.call

  retry_until_timeout(capped_timeout(5.0)) { dismiss_instagram_login_modal(page) }
  social_login_phase_pause
  dismiss_instagram_login_modal(page)
end
|
|
42
|
+
|
|
43
|
+
# Clicks a visible Instagram cookie button, passing the primary "accept"
# labels followed by the "allow" fallback labels, restricted to the
# button-like elements in COOKIE_BUTTON_SELECTORS.
# Returns whatever +click_visible_button_by_text+ returns.
def accept_instagram_cookie_dialog(page)
  label_groups = [INSTAGRAM_COOKIE_ACCEPT_LABELS, INSTAGRAM_COOKIE_FALLBACK_LABELS]
  click_visible_button_by_text(page, *label_groups, selectors: COOKIE_BUTTON_SELECTORS)
end
|
|
51
|
+
|
|
52
|
+
# Dismisses Instagram's login / sign-up modal through +dismiss_overlay_dialog+.
#
# The close-button selectors are generated rather than listed by hand:
# every modal root is combined with every button-like target, followed by
# the fixed-position overlay variants. The resulting list and its order are
# the same as the hand-written list this replaces.
def dismiss_instagram_login_modal(page)
  modal_roots = ['[role="dialog"]', '[role="presentation"]', '[aria-modal="true"]']
  fixed_overlays = ['div[style*="position: fixed"]', 'div[style*="position:fixed"]']
  modal_targets = ["button", '[role="button"]', "button[aria-label]", "button[title]", "button svg"]
  overlay_targets = ["button", '[role="button"]', "svg"]

  # Modal roots vary slowest; for overlays the target varies slowest and the
  # two inline-style spellings alternate.
  modal_close = modal_roots.product(modal_targets).map { |root, target| "#{root} #{target}" }
  overlay_close = overlay_targets.product(fixed_overlays).map { |target, overlay| "#{overlay} #{target}" }

  dismiss_overlay_dialog(
    page,
    close_selectors: modal_close + overlay_close,
    dialog_selectors: modal_roots,
    overlay_selectors: fixed_overlays,
    dialog_pattern: "log in|sign up|create (?:new )?account|don.?t have an account",
    close_label_pattern: "^(?:close|dismiss|x|×)?$",
    allow_empty_close_label: true
  )
end
|
|
85
|
+
|
|
86
|
+
# True when +url+ belongs to facebook.com or one of its subdomains.
# The host comparison itself is delegated to +host_matches?+.
def facebook_url?(url)
  host_matches?(url, "facebook.com")
end
|
|
89
|
+
|
|
90
|
+
# Facebook-specific stabilization. Sequence: optional idle wait, pause,
# cookie dialog, pause, timed retry loop dismissing the login dialog, pause.
def stabilize_facebook(page)
  wait_for_idle_or_content(page) if @wait_for_idle

  social_login_phase_pause
  dismiss_facebook_cookie_dialog(page)

  social_login_phase_pause
  login_budget = capped_timeout(5.0)
  retry_until_timeout(login_budget) { dismiss_facebook_login_dialog(page) }

  social_login_phase_pause
end
|
|
98
|
+
|
|
99
|
+
# Clicks a visible Facebook cookie button, passing the localized "decline
# optional cookies" labels ahead of the "allow all cookies" labels.
# NOTE(review): unlike the Instagram helper, no +selectors:+ override is
# passed, so +click_visible_button_by_text+ presumably falls back to its own
# default selector set — confirm this is intentional.
def dismiss_facebook_cookie_dialog(page)
  label_groups = [FACEBOOK_COOKIE_DECLINE_LABELS, FACEBOOK_COOKIE_ACCEPT_LABELS]
  click_visible_button_by_text(page, *label_groups)
end
|
|
106
|
+
|
|
107
|
+
# Dismisses Facebook's login / sign-up dialog through
# +dismiss_overlay_dialog+, using the aria "Close" buttons and matching the
# dialog by its log-in / sign-up / "see more from" wording.
def dismiss_facebook_login_dialog(page)
  close_buttons = ['[aria-label="Close"]', '[aria-label="close"]']
  dialog_roots = ['[role="dialog"]', '[aria-modal="true"]']
  login_wording = "log in|sign up|create (?:new )?account|see more from"

  dismiss_overlay_dialog(
    page,
    close_selectors: close_buttons,
    dialog_selectors: dialog_roots,
    dialog_pattern: login_wording
  )
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module SiteStabilization
|
|
6
|
+
autoload :CommunityAndMarketplace, "fetch_util/browser/site_stabilization/community_and_marketplace"
|
|
7
|
+
autoload :SocialPlatforms, "fetch_util/browser/site_stabilization/social_platforms"
|
|
8
|
+
|
|
9
|
+
include CommunityAndMarketplace
|
|
10
|
+
include SocialPlatforms
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module Stabilization
|
|
6
|
+
module PageFlow
|
|
7
|
+
private
|
|
8
|
+
|
|
9
|
+
# Brings a freshly navigated page into a state suitable for extraction.
#
# Reddit, Instagram, Facebook and eBay search URLs are handed to their
# dedicated routines. Every other page goes through the generic flow:
# optional idle/content wait, then three consent sweeps (immediately, after
# the fixed +@wait+ pause, and after SPA hydration). If any sweep handled a
# consent prompt and the page had settled, one more bounded network-idle
# wait follows.
#
# Consent sweeps are skipped entirely when +preserve_consent_wall?+ says the
# wall should be left untouched.
def stabilize_page(page, url)
  return stabilize_reddit(page) if reddit_url?(url)
  return stabilize_instagram(page) if instagram_url?(url)
  return stabilize_facebook(page) if facebook_url?(url)
  return stabilize_ebay_search(page) if ebay_search_url?(url)

  reached_idle = !@wait_for_idle || wait_for_idle_or_content(page)
  preserve_consent = preserve_consent_wall?(page, url)

  consent_handled = false
  # One sweep = cookie banner first, then the privacy-preference overlay.
  # Both helpers always run; either one succeeding marks consent as handled.
  sweep_consent = lambda do
    next if preserve_consent

    consent_handled = true if accept_cookie_consent(page)
    consent_handled = true if dismiss_privacy_preference_overlay(page)
  end

  sweep_consent.call
  sleep @wait if @wait.positive?
  sweep_consent.call

  wait_for_spa_hydration(page) if @wait_for_idle && reached_idle
  sweep_consent.call

  return unless consent_handled && @wait_for_idle && reached_idle

  # Accepting consent typically triggers follow-up requests; give them a
  # bounded chance to finish.
  page.network.wait_for_idle(duration: @idle_duration, timeout: POST_CONSENT_IDLE_TIMEOUT)
end
|
|
31
|
+
|
|
32
|
+
# Polls until either the network is idle or meaningful content has been
# visible for at least +@idle_duration+ seconds, whichever comes first.
# Polling is bounded by +@timeout+; any Ferrum error yields +false+.
def wait_for_idle_or_content(page)
  first_content_at = nil

  retry_until_timeout(@timeout, interval: @idle_duration) do
    # Timestamp is taken before the probes so their own latency is not
    # counted against the "content has been stable" window.
    tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)

    if page.network.idle?
      true
    else
      first_content_at = tick if first_content_at.nil? && page_has_content?(page)
      first_content_at && (tick - first_content_at) >= @idle_duration
    end
  end
rescue Ferrum::Error
  false
end
|
|
47
|
+
|
|
48
|
+
# True when the document body's visible text is longer than
# CONTENT_READY_MIN_LENGTH characters. A missing body, a JavaScript error or
# a timeout all count as "no content yet".
def page_has_content?(page)
  length_probe = <<~JS
    (() => {
      const body = document.body;
      if (!body) return false;
      const text = body.innerText || '';
      return text.length > #{CONTENT_READY_MIN_LENGTH};
    })()
  JS

  page.evaluate(length_probe)
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
|
|
60
|
+
|
|
61
|
+
# Decides whether a Google / YouTube consent wall should be left in place
# instead of being clicked through.
#
# Only youtube.com (and subdomains) and google.<tld> hosts are considered.
# For those, the page title plus the first 1200 characters of
# whitespace-normalized body text are checked for consent-wall wording.
# Invalid URLs, JavaScript errors and timeouts all yield +false+.
def preserve_consent_wall?(page, url)
  host = FetchUtil.strip_www_host(url)
  return false unless host == "youtube.com" || host.end_with?(".youtube.com") || host.match?(/\Agoogle\.[a-z.]+\z/)

  # `\\s` is doubled on purpose: inside an interpolating heredoc Ruby turns a
  # single `\s` into a literal space, which would leave newlines in the
  # sampled text and let multi-line prompts slip past the phrase match below.
  state = page.evaluate(<<~JS)
    (() => ({
      title: document.title || '',
      text: document.body ? document.body.innerText.replace(/\\s+/g, ' ').trim().slice(0, 1200) : ''
    }))()
  JS

  combined = [state["title"], state["text"]].join(" ").downcase
  /before you continue to (google|youtube)|we use cookies and data|accept all|reject all|more options/.match?(combined)
rescue URI::InvalidURIError, Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module Stabilization
|
|
6
|
+
# rubocop:disable Metrics/ModuleLength
|
|
7
|
+
module SpaHydration
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
# When a client-side framework is detected, polls (bounded by
# SPA_HYDRATION_TIMEOUT) until its hydration check passes, then sleeps one
# extra poll interval. Does nothing when no framework is detected.
# JavaScript errors and timeouts are swallowed on purpose: hydration
# waiting is best-effort.
def wait_for_spa_hydration(page)
  framework = detect_spa_framework(page)

  if framework
    retry_until_timeout(SPA_HYDRATION_TIMEOUT, interval: SPA_HYDRATION_POLL) do
      spa_hydration_complete?(page, framework)
    end

    sleep SPA_HYDRATION_POLL
  end
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  nil
end
|
|
21
|
+
|
|
22
|
+
# Runs the framework detection script in the page.
# Returns the detected framework name as a Symbol, or +nil+ when the script
# returns anything other than a String or when evaluation fails.
def detect_spa_framework(page)
  detected = page.evaluate(spa_framework_detection_script)
  return nil unless detected.is_a?(String)

  detected.to_sym
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  nil
end
|
|
29
|
+
|
|
30
|
+
# Evaluates the framework-specific hydration check in the page.
# Evaluation failures are reported as +true+ so a broken probe ends the
# hydration wait instead of running it out to the timeout.
def spa_hydration_complete?(page, framework)
  completion_probe = spa_hydration_completion_script(framework)
  page.evaluate(completion_probe)
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  true
end
|
|
35
|
+
|
|
36
|
+
# JavaScript (an IIFE) that identifies the client-side framework or docs
# platform rendering the page. It evaluates to one of the strings 'next',
# 'nuxt', 'remix', 'docusaurus', 'gatsby', 'vue', 'angular', 'svelte',
# 'ember', 'qwik', 'mintlify', 'gitbook', 'scalar', 'redoc', 'readme',
# 'react', 'generic_spa', or to null when nothing matches. Checks run in
# that order and the first hit wins.
#
# The heredoc is deliberately NON-interpolating (<<~'JS'): the script has no
# Ruby interpolation, and a plain <<~JS would process backslash escapes.
# That turned '[q\\:container]' into the invalid selector '[q:container]'
# (querySelector throws, so every check from Qwik onward was unreachable)
# and stripped the dot escapes from the gitbook.io hostname regex.
def spa_framework_detection_script
  <<~'JS'
    (() => {
      if (window.__NEXT_DATA__ || (window.next && window.next.version)) return 'next';
      if (window.__NUXT__ || window.$nuxt) return 'nuxt';
      if (window.__remixContext) return 'remix';
      if (document.getElementById('__docusaurus')) return 'docusaurus';
      if (document.getElementById('___gatsby')) return 'gatsby';

      const vueMount = document.querySelector('#app, #root, #__nuxt');
      if ((vueMount && vueMount.__vue_app__) || window.__VUE__) return 'vue';
      if (vueMount && vueMount.__vue__) return 'vue';
      if (document.querySelector('[ng-version]')) return 'angular';
      if (window.__svelte || document.getElementById('svelte-announcer') ||
          document.querySelector('[data-sveltekit-preload-data]') ||
          document.querySelector('[class*="svelte-"]')) return 'svelte';
      if (window.Ember) return 'ember';
      if (document.querySelector('[q\\:container]')) return 'qwik';
      if (document.querySelector('meta[name="generator"][content*="Mintlify" i]') || document.querySelector('#content-area')) {
        const gen = (document.querySelector('meta[name="generator"]') || {}).content || '';
        if (/mintlify/i.test(gen) || document.querySelector('[class*="mintlify"]')) return 'mintlify';
      }
      {
        const gen = (document.querySelector('meta[name="generator"]') || {}).content || '';
        if (/gitbook/i.test(gen) || /\.gitbook\.io$/.test(location.hostname)) return 'gitbook';
      }
      if (document.querySelector('.scalar-api-reference, .scalar-app, [data-scalar]')) return 'scalar';
      if (document.querySelector('redoc, rapi-doc, .redoc-wrap')) return 'redoc';
      if (document.querySelector('meta[name="readme-deploy"]') || document.querySelector('.rm-Article, .rm-LandingPage, .rm-ReferenceMain')) return 'readme';

      const reactCandidates = document.querySelectorAll('#root, #app, #__next, [data-reactroot], body > div');
      for (const el of reactCandidates) {
        if (el._reactRootContainer || Object.keys(el).some(k => k.startsWith('__reactContainer$'))) return 'react';
      }

      const mounts = document.querySelectorAll('#root, #app, [data-reactroot]');
      if (mounts.length > 0) {
        const bodyText = (document.body.innerText || '').trim();
        if (bodyText.length < 500) return 'generic_spa';
      }

      return null;
    })()
  JS
end
|
|
81
|
+
|
|
82
|
+
# JavaScript (an IIFE) that evaluates to true once the page looks hydrated
# for the given +framework+ (a Symbol or String as produced by
# +detect_spa_framework+; it is interpolated into the script's switch).
#
# A main-content text length above 500 characters short-circuits to true
# for every framework; otherwise a framework-specific marker is checked.
# Unknown frameworks evaluate to true.
#
# Fixes relative to the earlier version:
# * the Qwik selector is written with four backslashes so that, after
#   Ruby's heredoc escape processing, the JS source contains
#   '[q\\:container]' (a valid escaped attribute selector) rather than the
#   invalid '[q:container]';
# * the `|| ''` fallback now guards both branches of the main-text lookup
#   instead of only the `document.body` branch.
def spa_hydration_completion_script(framework)
  <<~JS
    (() => {
      const mainContent = document.querySelector('main, [role="main"], article, .content, .docs-content, #content');
      const mainText = ((mainContent ? mainContent.innerText : document.body.innerText) || '').trim();

      if (mainText.length > 500) return true;

      switch ('#{framework}') {
        case 'next': {
          const mount = document.getElementById('__next');
          if (mount && Object.keys(mount).some(k => k.startsWith('__reactFiber$'))) return true;
          if (window.next && window.next.version) {
            const bodyKids = document.body.children;
            for (let i = 0; i < bodyKids.length; i++) {
              if (Object.keys(bodyKids[i]).some(k => k.startsWith('__reactFiber$'))) return true;
            }
          }
          return false;
        }
        case 'nuxt': {
          const mount = document.getElementById('__nuxt');
          return !!(mount && mount.__vue_app__ && mount.__vue_app__._instance);
        }
        case 'vue': {
          const mount = document.querySelector('#app, #root');
          return !!(mount && (mount.__vue_app__ ? mount.__vue_app__._instance : mount.__vue__));
        }
        case 'react': {
          const candidates = document.querySelectorAll('#root, #app, [data-reactroot], body > div');
          for (const el of candidates) {
            if (Object.keys(el).some(k => k.startsWith('__reactFiber$'))) return true;
          }
          return false;
        }
        case 'angular': {
          const hasNgh = document.querySelectorAll('[ngh]').length > 0;
          if (hasNgh) return false;
          return !!document.querySelector('[ng-version]');
        }
        case 'svelte': {
          return !!document.getElementById('svelte-announcer') ||
                 !!document.querySelector('[class*="svelte-"]');
        }
        case 'remix': {
          const candidates = document.querySelectorAll('#root, #app, body > div');
          for (const el of candidates) {
            if (Object.keys(el).some(k => k.startsWith('__reactFiber$'))) return true;
          }
          return !!window.__remixContext;
        }
        case 'ember': {
          return !!window.Ember && !!document.querySelector('.ember-view');
        }
        case 'qwik': {
          return !!document.querySelector('[q\\\\:container]');
        }
        case 'mintlify': {
          const contentArea = document.querySelector('#content-area, [id="content-area"]');
          if (contentArea && (contentArea.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'gitbook': {
          const gbMain = document.querySelector('main');
          if (gbMain && (gbMain.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'scalar': {
          const scalarRef = document.querySelector('.scalar-api-reference, .scalar-app');
          if (scalarRef && (scalarRef.innerText || '').trim().length > 500) return true;
          return false;
        }
        case 'redoc': {
          const apiContent = document.querySelector('.api-content, .redoc-wrap [role="main"], .redoc-wrap');
          if (apiContent && (apiContent.innerText || '').trim().length > 500) return true;
          return false;
        }
        case 'readme': {
          const rmArticle = document.querySelector('.rm-Article, .rm-LandingPage, article .markdown-body');
          if (rmArticle && (rmArticle.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'gatsby':
        case 'docusaurus': {
          const mountId = '#{framework}' === 'gatsby' ? '___gatsby' : '__docusaurus';
          const mount = document.getElementById(mountId);
          return !!(mount && Object.keys(mount).some(k => k.startsWith('__reactFiber$')));
        }
        case 'generic_spa': {
          return mainText.length > 200;
        }
        default:
          return true;
      }
    })()
  JS
end
|
|
179
|
+
end
|
|
180
|
+
# rubocop:enable Metrics/ModuleLength
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module Stabilization
|
|
6
|
+
autoload :PageFlow, "fetch_util/browser/stabilization/page_flow"
|
|
7
|
+
autoload :SpaHydration, "fetch_util/browser/stabilization/spa_hydration"
|
|
8
|
+
|
|
9
|
+
include PageFlow
|
|
10
|
+
include SpaHydration
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ferrum"
|
|
4
|
+
require "json"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
module FetchUtil
|
|
8
|
+
class Browser
|
|
9
|
+
autoload :InteractionHelpers, "fetch_util/browser/interaction_helpers"
|
|
10
|
+
autoload :Navigation, "fetch_util/browser/navigation"
|
|
11
|
+
autoload :SiteStabilization, "fetch_util/browser/site_stabilization"
|
|
12
|
+
autoload :Stabilization, "fetch_util/browser/stabilization"
|
|
13
|
+
|
|
14
|
+
include InteractionHelpers
|
|
15
|
+
include Navigation
|
|
16
|
+
include SiteStabilization
|
|
17
|
+
include Stabilization
|
|
18
|
+
|
|
19
|
+
# Prefer full Chromium over `headless_shell`: the full browser stays closer
|
|
20
|
+
# to standard Chromium behavior and exposes APIs that some sites expect,
|
|
21
|
+
# which improves extraction consistency. `headless_shell` diverges more
|
|
22
|
+
# often and can change page behavior in ways that degrade extraction.
|
|
23
|
+
BROWSER_CANDIDATES = [
|
|
24
|
+
"/usr/bin/chromium-browser",
|
|
25
|
+
"/usr/bin/chromium",
|
|
26
|
+
"/usr/bin/google-chrome",
|
|
27
|
+
"/usr/lib64/chromium-browser/headless_shell"
|
|
28
|
+
].freeze
|
|
29
|
+
|
|
30
|
+
DEFAULT_VIEWPORT = { width: 1366, height: 900 }.freeze
|
|
31
|
+
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " \
|
|
32
|
+
"(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
|
|
33
|
+
DEFAULT_ACCEPT_LANGUAGE = "en-US,en;q=0.9"
|
|
34
|
+
SOCIAL_LOGIN_PHASE_WAIT = 0.3
|
|
35
|
+
POST_CONSENT_IDLE_TIMEOUT = 3.0
|
|
36
|
+
CONTENT_READY_MIN_LENGTH = 200
|
|
37
|
+
SPA_HYDRATION_TIMEOUT = 2.0
|
|
38
|
+
SPA_HYDRATION_POLL = 0.15
|
|
39
|
+
NAVIGATION_MAX_RETRIES = 2
|
|
40
|
+
|
|
41
|
+
# Stores configuration; no browser process is started here.
#
# timeout::         seconds, coerced to Float
# wait::            fixed pause (seconds) used during stabilization
# wait_for_idle::   whether stabilization waits for network idle / content
# idle_duration::   seconds, coerced to Float
# viewport::        Hash merged over DEFAULT_VIEWPORT (nil is accepted)
# user_agent::      stored as given
# accept_language:: stored as given
# browser_path::    explicit binary; falls back to ENV["BROWSER_PATH"], then
#                   to the first executable entry in BROWSER_CANDIDATES
# browser_options:: extra Chromium flags merged over the defaults
def initialize(timeout: 20, wait: 0.75, wait_for_idle: true, idle_duration: 0.35,
               viewport: DEFAULT_VIEWPORT, user_agent: DEFAULT_USER_AGENT,
               accept_language: DEFAULT_ACCEPT_LANGUAGE, browser_path: nil,
               browser_options: nil)
  @timeout = timeout.to_f
  @wait = wait.to_f
  @wait_for_idle = wait_for_idle
  @idle_duration = idle_duration.to_f
  @viewport = DEFAULT_VIEWPORT.merge(symbolize_hash(viewport || {}))
  @user_agent = user_agent
  @accept_language = accept_language

  @browser_path = browser_path || ENV["BROWSER_PATH"] || BROWSER_CANDIDATES.find { |path| File.executable?(path) }
  @full_browser = @browser_path && !@browser_path.include?("headless_shell")

  launch_flags = { "no-sandbox": nil }
  # With a full browser binary, opt into the newer headless mode and switch
  # off the `enable-automation` flag that Ferrum would otherwise set.
  launch_flags.update("headless" => "new", "enable-automation" => false) if @full_browser
  @browser_options = launch_flags.merge(browser_options || {})

  @ferrum = nil
  @mutex = Mutex.new
end
|
|
67
|
+
|
|
68
|
+
# Opens +url+ in a new tab of the shared browser, stabilizes the page and
# yields the +Ferrum::Page+ to the caller's block. The tab is always closed
# afterwards; the Chromium process itself stays alive for later calls.
#
# Navigation that ends in a pending-connections or timeout error is accepted
# when +page_loaded_enough?+ approves the page; otherwise it is retried up
# to NAVIGATION_MAX_RETRIES times before the error is re-raised.
#
# Raises BrowserError when no browser binary is configured and for any
# Ferrum error raised along the way.
def with_page(url)
  raise BrowserError, "No Chromium browser found. Set BROWSER_PATH or install Chromium." unless @browser_path

  page = ensure_browser.create_page
  page.headers.set(default_headers)
  page.bypass_csp

  attempts = 0
  begin
    page.go_to(url)
  rescue Ferrum::PendingConnectionsError, Ferrum::TimeoutError
    # A partially loaded page can still be good enough to extract from.
    unless page_loaded_enough?(page)
      raise if attempts >= NAVIGATION_MAX_RETRIES

      attempts += 1
      retry
    end
  end

  stabilize_page(page, url)
  yield page
rescue Ferrum::Error => e
  raise BrowserError, e.message
ensure
  page&.close
end
|
|
97
|
+
|
|
98
|
+
# Shuts down the underlying Chromium process. Safe to call repeatedly or
# before any browser has been started. After +quit+, the next +with_page+
# call launches a fresh process.
#
# The cached handle is cleared even when shutting the process down raises,
# so a failed quit cannot leave a dead browser behind for reuse. The error
# itself still propagates. Returns +nil+.
def quit
  @mutex.synchronize do
    @ferrum&.quit
    nil
  ensure
    @ferrum = nil
  end
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
# Lazily starts the shared Chromium process on first use and returns it.
# The +evaluate_on_new_document+ call registers the navigator patch once on
# the browser so that it applies to pages created afterwards.
#
# The browser is cached in +@ferrum+ only after the patch has been
# registered. If registration fails, the freshly launched process is shut
# down and the error re-raised, so a later call starts from scratch instead
# of silently reusing an unpatched browser.
def ensure_browser
  @mutex.synchronize do
    return @ferrum if @ferrum

    browser = Ferrum::Browser.new(
      headless: true,
      browser_path: @browser_path,
      timeout: @timeout,
      window_size: [@viewport.fetch(:width), @viewport.fetch(:height)],
      browser_options: @browser_options
    )

    begin
      browser.evaluate_on_new_document(navigator_patch)
    rescue StandardError
      # Do not leak the half-configured process.
      browser.quit
      raise
    end

    @ferrum = browser
  end
end
|
|
129
|
+
|
|
130
|
+
# True when the host of +url+ (as normalized by +FetchUtil.strip_www_host+)
# is exactly +host+ or a subdomain of it. The leading dot in the suffix
# check keeps look-alike domains such as "notreddit.com" from matching.
def host_matches?(url, host)
  candidate = FetchUtil.strip_www_host(url)
  return true if candidate == host

  candidate.end_with?(".#{host}")
end
|
|
134
|
+
end
|
|
135
|
+
end
|