fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module SiteStabilization
6
+ module CommunityAndMarketplace
7
# Comma-separated CSS selector handed to click_visible_button_by_text as
# its `selectors:` argument when hunting for a consent button.
COOKIE_BUTTON_SELECTORS = [
  "button",
  '[role="button"]',
  "a",
  'input[type="button"]',
  'input[type="submit"]'
].join(", ").freeze

# Button captions tried when looking for eBay's cookie-accept control.
EBAY_COOKIE_ACCEPT_LABELS = [
  "accept all", "accept all cookies", "accept cookies",
  "allow all", "allow cookies",
  "agree to cookies", "continue with cookies"
].freeze

private_constant :COOKIE_BUTTON_SELECTORS, :EBAY_COOKIE_ACCEPT_LABELS
18
+
19
+ private
20
+
21
# Whether +url+ belongs to reddit.com; the host comparison itself is
# delegated to host_matches?.
def reddit_url?(url)
  reddit_host = "reddit.com"
  host_matches?(url, reddit_host)
end
24
+
25
# Reddit-specific stabilization: repeatedly dismisses the cookie dialog and
# probes for post/comment markup (capped 3 s budget, 0.1 s interval), then
# settles briefly and dismisses the dialog one final time.
def stabilize_reddit(page)
  poll_budget = capped_timeout(3.0)

  # The block's last expression (content readiness) is what the poller sees;
  # presumably a truthy value ends the polling early - confirm against
  # retry_until_timeout.
  retry_until_timeout(poll_budget, interval: 0.1) do
    dismiss_reddit_cookie_dialog(page)
    reddit_content_ready?(page)
  end

  settle_after_stabilization(0.25)
  # The dialog can be present after the settle pause, so sweep once more.
  dismiss_reddit_cookie_dialog(page)
end
34
+
35
# Whether +url+ is an eBay search-results page: the host must be ebay.com
# (or a subdomain) and either the path contains "/sch/" or the query string
# carries an "_nkw=" parameter. Unparseable URLs yield false.
def ebay_search_url?(url)
  parsed = URI.parse(url)
  hostname = parsed.host.to_s.downcase
  on_ebay = hostname == "ebay.com" || hostname.end_with?(".ebay.com")

  on_ebay && (parsed.path.include?("/sch/") || parsed.query.to_s.include?("_nkw="))
rescue URI::InvalidURIError
  false
end
44
+
45
# eBay search stabilization: within a capped 6 s budget (0.15 s interval)
# tries to click a cookie-accept button once, then probes the page for
# result links and for an anti-bot interstitial. Settles for 0.25 s
# afterwards only when a consent button was clicked.
def stabilize_ebay_search(page)
  probe_script = <<~JS
    (() => {
      const bodyText = document.body ? document.body.innerText : '';
      return {
        itemCount: document.querySelectorAll('li.s-item a[href*="/itm/"], ul.srp-results a[href*="/itm/"]').length,
        challengeVisible: /checking your browser before you access ebay|your browser will redirect to your requested content shortly|pardon our interruption/i.test(bodyText)
      };
    })()
  JS
  consent_clicked = false

  retry_until_timeout(capped_timeout(6.0), interval: 0.15) do
    # `||=` stops further click attempts after the first successful one.
    consent_clicked ||= click_visible_button_by_text(
      page, EBAY_COOKIE_ACCEPT_LABELS, selectors: COOKIE_BUTTON_SELECTORS
    )

    probe = safe_evaluate(page, probe_script, default: { "itemCount" => 0, "challengeVisible" => false })

    if probe["itemCount"].to_i >= 4
      true
    elsif probe["challengeVisible"]
      # NOTE(review): a numeric block result is presumably interpreted by
      # retry_until_timeout (e.g. as a custom delay) - confirm in the helper.
      0.35
    else
      false
    end
  end

  settle_after_stabilization(0.25) if consent_clicked
end
70
+
71
# Probes the page for Reddit post/comment elements.
#
# Returns the boolean produced by the in-page check, or false when the
# probe itself fails. Timeouts are treated like script errors, matching
# the other readiness probes in this class: this method is called from a
# polling loop, so one slow evaluate should count as "not ready yet"
# instead of aborting the whole fetch.
def reddit_content_ready?(page)
  page.evaluate(<<~JS)
    !!document.querySelector('shreddit-post, faceplate-screen-reader-content, shreddit-comment, [data-testid="comment"]')
  JS
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
78
+
79
# Removes Reddit's "before you continue" / cookie-preference dialog.
#
# First asks dismiss_overlay_dialog to handle well-known dialog containers;
# if that reports success the method returns true. Otherwise it falls back
# to an in-page sweep that deletes any section-like node whose (short) text
# matches the consent wording and then calls restoreScroll().
#
# Returns true, or whatever safe_evaluate returns for the fallback sweep.
def dismiss_reddit_cookie_dialog(page)
  removed = dismiss_overlay_dialog(
    page,
    close_selectors: [],
    dialog_selectors: [
      '[data-testid="onboarding-modal"]',
      '[data-testid="gdpr-modal"]',
      '[aria-modal="true"]',
      '[role="dialog"]',
      "shreddit-experience-tree"
    ],
    dialog_pattern: "before you continue to reddit|let us know your cookie preferences"
  )
  return true if removed

  # This heredoc interpolates (js_dom_helpers), so Ruby also processes
  # backslash escapes in it. `\s` would turn into a plain space and the
  # regex would stop collapsing newlines/tabs; the backslash is doubled so
  # the browser receives a real `\s+`.
  # restoreScroll is expected to come from js_dom_helpers - TODO confirm.
  safe_evaluate(page, <<~JS)
    (() => {
      #{js_dom_helpers}
      let removed = false;
      document.querySelectorAll('section, div, aside, form, footer, shreddit-experience-tree').forEach((node) => {
        const text = (node.innerText || node.textContent || '').replace(/\\s+/g, ' ').trim();
        if (/before you continue to reddit|let us know your cookie preferences/i.test(text) && text.length < 2000) {
          node.remove();
          removed = true;
        }
      });

      if (removed) {
        restoreScroll();
      }

      return removed;
    })()
  JS
end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module SiteStabilization
6
+ module SocialPlatforms
7
# Comma-separated CSS selector passed as `selectors:` when searching for
# Instagram's consent button.
COOKIE_BUTTON_SELECTORS = [
  "button",
  '[role="button"]',
  "a",
  'input[type="button"]',
  'input[type="submit"]'
].join(", ").freeze

# First-choice and fallback captions for Instagram's cookie banner.
INSTAGRAM_COOKIE_ACCEPT_LABELS = [
  "accept",
  "accept all",
  "accept all cookies"
].freeze
INSTAGRAM_COOKIE_FALLBACK_LABELS = [
  "allow all cookies",
  "allow all",
  "allow cookies"
].freeze

# Facebook cookie-banner captions (English, German, French, Spanish,
# Italian). The decline set is passed ahead of the accept set by
# dismiss_facebook_cookie_dialog.
FACEBOOK_COOKIE_DECLINE_LABELS = [
  "decline optional cookies", "optionale cookies ablehnen",
  "refuser les cookies optionnels", "rechazar cookies opcionales",
  "rifiuta i cookie opzionali"
].freeze
FACEBOOK_COOKIE_ACCEPT_LABELS = [
  "allow all cookies", "alle cookies erlauben",
  "autoriser tous les cookies", "permitir todas las cookies",
  "consenti tutti i cookie"
].freeze

private_constant :COOKIE_BUTTON_SELECTORS,
                 :INSTAGRAM_COOKIE_ACCEPT_LABELS, :INSTAGRAM_COOKIE_FALLBACK_LABELS,
                 :FACEBOOK_COOKIE_DECLINE_LABELS, :FACEBOOK_COOKIE_ACCEPT_LABELS
26
+
27
+ private
28
+
29
# Whether +url+ belongs to instagram.com; the host comparison itself is
# delegated to host_matches?.
def instagram_url?(url)
  instagram_host = "instagram.com"
  host_matches?(url, instagram_host)
end
32
+
33
# Instagram-specific stabilization. Optionally waits for idle/content, makes
# two cookie-consent passes separated by a pause, then polls (capped 5 s)
# for the login modal, pauses and dismisses it once more.
# Returns the result of the final dismiss_instagram_login_modal call.
def stabilize_instagram(page)
  # Instagram-specific banner first; the generic consent handler only runs
  # when that reports nothing clicked.
  consent_pass = -> { accept_instagram_cookie_dialog(page) || accept_cookie_consent(page) }

  wait_for_idle_or_content(page) if @wait_for_idle

  consent_pass.call
  social_login_phase_pause
  consent_pass.call

  retry_until_timeout(capped_timeout(5.0)) do
    dismiss_instagram_login_modal(page)
  end
  social_login_phase_pause
  dismiss_instagram_login_modal(page)
end
42
+
43
# Attempts to click Instagram's cookie-accept button, offering the primary
# captions first and the fallback captions second. Returns whatever
# click_visible_button_by_text returns.
def accept_instagram_cookie_dialog(page)
  label_tiers = [INSTAGRAM_COOKIE_ACCEPT_LABELS, INSTAGRAM_COOKIE_FALLBACK_LABELS]
  click_visible_button_by_text(page, *label_tiers, selectors: COOKIE_BUTTON_SELECTORS)
end
51
+
52
# Asks dismiss_overlay_dialog to close Instagram's login/sign-up modal.
#
# The close-button selectors follow a regular shape, so they are generated:
# every modal root is paired with each button-like descendant, followed by
# the fixed-position overlay variants (grouped by control, spaced form of
# the inline style before the unspaced one).
def dismiss_instagram_login_modal(page)
  modal_roots = ['[role="dialog"]', '[role="presentation"]', '[aria-modal="true"]']
  fixed_layers = ['div[style*="position: fixed"]', 'div[style*="position:fixed"]']

  modal_controls = modal_roots.flat_map do |root|
    ["button", '[role="button"]', "button[aria-label]", "button[title]", "button svg"]
      .map { |control| "#{root} #{control}" }
  end
  layer_controls = ["button", '[role="button"]', "svg"].flat_map do |control|
    fixed_layers.map { |layer| "#{layer} #{control}" }
  end

  dismiss_overlay_dialog(
    page,
    close_selectors: modal_controls + layer_controls,
    dialog_selectors: modal_roots,
    overlay_selectors: fixed_layers,
    dialog_pattern: "log in|sign up|create (?:new )?account|don.?t have an account",
    close_label_pattern: "^(?:close|dismiss|x|×)?$",
    allow_empty_close_label: true
  )
end
85
+
86
# Whether +url+ belongs to facebook.com; the host comparison itself is
# delegated to host_matches?.
def facebook_url?(url)
  facebook_host = "facebook.com"
  host_matches?(url, facebook_host)
end
89
+
90
# Facebook-specific stabilization. Optionally waits for idle/content, then
# alternates pauses with the two dismissal phases: the cookie dialog first,
# the login dialog (polled with a capped 5 s budget) second. Ends on a
# pause, whose result is the return value.
def stabilize_facebook(page)
  wait_for_idle_or_content(page) if @wait_for_idle

  # Phase 1: cookie banner.
  social_login_phase_pause
  dismiss_facebook_cookie_dialog(page)

  # Phase 2: login wall.
  social_login_phase_pause
  retry_until_timeout(capped_timeout(5.0)) do
    dismiss_facebook_login_dialog(page)
  end

  social_login_phase_pause
end
98
+
99
# Attempts to click a button on Facebook's cookie banner, offering the
# "decline optional cookies" captions ahead of the "allow all" captions.
# Returns whatever click_visible_button_by_text returns.
def dismiss_facebook_cookie_dialog(page)
  decline_then_accept = [FACEBOOK_COOKIE_DECLINE_LABELS, FACEBOOK_COOKIE_ACCEPT_LABELS]
  click_visible_button_by_text(page, *decline_then_accept)
end
106
+
107
# Asks dismiss_overlay_dialog to close Facebook's login/sign-up dialog,
# identified by its container selectors plus the wording pattern below.
def dismiss_facebook_login_dialog(page)
  login_wall = {
    close_selectors: ['[aria-label="Close"]', '[aria-label="close"]'],
    dialog_selectors: ['[role="dialog"]', '[aria-modal="true"]'],
    dialog_pattern: "log in|sign up|create (?:new )?account|see more from"
  }

  dismiss_overlay_dialog(page, **login_wall)
end
115
+ end
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module SiteStabilization
6
+ autoload :CommunityAndMarketplace, "fetch_util/browser/site_stabilization/community_and_marketplace"
7
+ autoload :SocialPlatforms, "fetch_util/browser/site_stabilization/social_platforms"
8
+
9
+ include CommunityAndMarketplace
10
+ include SocialPlatforms
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module Stabilization
6
+ module PageFlow
7
+ private
8
+
9
# Entry point for post-navigation stabilization.
#
# Site-specific flows (Reddit, Instagram, Facebook, eBay search) take over
# completely when the URL matches. Every other page goes through the
# generic flow: optional idle wait, three consent sweeps (before the fixed
# @wait pause, after it, and after SPA hydration), and - only when some
# sweep reported success - a final network-idle wait.
def stabilize_page(page, url)
  if reddit_url?(url)
    return stabilize_reddit(page)
  elsif instagram_url?(url)
    return stabilize_instagram(page)
  elsif facebook_url?(url)
    return stabilize_facebook(page)
  elsif ebay_search_url?(url)
    return stabilize_ebay_search(page)
  end

  reached_idle = !@wait_for_idle || wait_for_idle_or_content(page)
  keep_wall = preserve_consent_wall?(page, url)

  # One sweep = generic cookie accept + privacy-preference overlay. Both
  # helpers always run (no short-circuit) unless the consent wall must be
  # preserved, in which case nothing is touched.
  consent_sweep = lambda do
    next false if keep_wall

    accepted = accept_cookie_consent(page)
    dismissed = dismiss_privacy_preference_overlay(page)
    accepted || dismissed
  end

  consent_handled = consent_sweep.call
  sleep @wait if @wait.positive?
  consent_handled = consent_sweep.call || consent_handled

  wait_for_spa_hydration(page) if @wait_for_idle && reached_idle
  consent_handled = consent_sweep.call || consent_handled

  return unless consent_handled && @wait_for_idle && reached_idle

  # Wait for the network to go idle again after a consent interaction.
  page.network.wait_for_idle(duration: @idle_duration, timeout: POST_CONSENT_IDLE_TIMEOUT)
end
31
+
32
# Polls (budget @timeout, interval @idle_duration) until either the network
# reports idle, or page content has been visible for at least
# @idle_duration seconds. Any Ferrum error turns into false.
def wait_for_idle_or_content(page)
  first_content_at = nil

  retry_until_timeout(@timeout, interval: @idle_duration) do
    tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)

    if page.network.idle?
      true
    else
      # Content is only probed until it has been seen once; afterwards the
      # elapsed time since that first sighting decides.
      first_content_at ||= (tick if page_has_content?(page))
      first_content_at && (tick - first_content_at) >= @idle_duration
    end
  end
rescue Ferrum::Error
  false
end
47
+
48
# Whether the document body's innerText is longer than
# CONTENT_READY_MIN_LENGTH characters. Script errors and timeouts count as
# "no content".
def page_has_content?(page)
  length_probe = <<~JS
    (() => {
      const body = document.body;
      if (!body) return false;
      const text = body.innerText || '';
      return text.length > #{CONTENT_READY_MIN_LENGTH};
    })()
  JS

  page.evaluate(length_probe)
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
60
+
61
# Whether the page is a Google/YouTube consent wall that the generic
# consent handling must leave alone.
#
# Only hosts equal to youtube.com, ending in ".youtube.com", or matching
# google.<tld> (after FetchUtil.strip_www_host) are considered. For those,
# the title plus the first 1200 characters of whitespace-collapsed body
# text are checked for consent-wall wording.
#
# Returns true/false; URI and Ferrum script/timeout errors yield false.
def preserve_consent_wall?(page, url)
  host = FetchUtil.strip_www_host(url)
  return false unless host == "youtube.com" || host.end_with?(".youtube.com") || host.match?(/\Agoogle\.[a-z.]+\z/)

  # Quoted heredoc identifier: Ruby must not process backslash escapes
  # here. In an unquoted heredoc `\s` becomes a literal space, so the
  # in-page regex would no longer collapse newlines and tabs and phrases
  # split across lines would not match below.
  state = page.evaluate(<<~'JS')
    (() => ({
      title: document.title || '',
      text: document.body ? document.body.innerText.replace(/\s+/g, ' ').trim().slice(0, 1200) : ''
    }))()
  JS

  combined = [state["title"], state["text"]].join(" ").downcase
  /before you continue to (google|youtube)|we use cookies and data|accept all|reject all|more options/.match?(combined)
rescue URI::InvalidURIError, Ferrum::JavaScriptError, Ferrum::TimeoutError
  false
end
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module Stabilization
6
+ # rubocop:disable Metrics/ModuleLength
7
+ module SpaHydration
8
+ private
9
+
10
# When a SPA framework is detected, polls (SPA_HYDRATION_TIMEOUT budget,
# SPA_HYDRATION_POLL interval) for that framework's hydration check, then
# sleeps one extra poll interval. Does nothing when no framework is
# detected; script errors and timeouts are swallowed (best effort).
def wait_for_spa_hydration(page)
  detected = detect_spa_framework(page)

  if detected
    retry_until_timeout(SPA_HYDRATION_TIMEOUT, interval: SPA_HYDRATION_POLL) do
      spa_hydration_complete?(page, detected)
    end

    sleep SPA_HYDRATION_POLL
  end
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  nil
end
21
+
22
# Runs the framework-detection script in the page and converts a String
# result into a Symbol. Non-string results, script errors and timeouts all
# yield nil.
def detect_spa_framework(page)
  detected = page.evaluate(spa_framework_detection_script)
  return unless detected.is_a?(String)

  detected.to_sym
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  nil
end
29
+
30
# Evaluates the hydration check for +framework+ in the page and returns its
# result. A script error or timeout is reported as true, i.e. treated as
# "complete" rather than retried.
def spa_hydration_complete?(page, framework)
  probe = spa_hydration_completion_script(framework)
  page.evaluate(probe)
rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
  true
end
35
+
36
# JavaScript (an IIFE) that names the front-end framework or docs platform
# rendering the current page, or returns null when none is recognised.
#
# Checks run in order and the first hit wins: Next, Nuxt, Remix,
# Docusaurus, Gatsby, Vue, Angular, Svelte, Ember, Qwik, Mintlify, GitBook,
# Scalar, Redoc, ReadMe, React, and finally 'generic_spa' for a known mount
# node on a page with under 500 characters of body text.
#
# The heredoc identifier is quoted on purpose. With an unquoted identifier
# Ruby processes backslash escapes before the browser sees the script:
# `'[q\\:container]'` arrives as the invalid selector `[q:container]`, whose
# querySelector call throws and aborts detection for every page that
# reaches that line, and `\.gitbook\.io` degrades to `.gitbook.io`.
def spa_framework_detection_script
  <<~'JS'
    (() => {
      if (window.__NEXT_DATA__ || (window.next && window.next.version)) return 'next';
      if (window.__NUXT__ || window.$nuxt) return 'nuxt';
      if (window.__remixContext) return 'remix';
      if (document.getElementById('__docusaurus')) return 'docusaurus';
      if (document.getElementById('___gatsby')) return 'gatsby';

      const vueMount = document.querySelector('#app, #root, #__nuxt');
      if ((vueMount && vueMount.__vue_app__) || window.__VUE__) return 'vue';
      if (vueMount && vueMount.__vue__) return 'vue';
      if (document.querySelector('[ng-version]')) return 'angular';
      if (window.__svelte || document.getElementById('svelte-announcer') ||
          document.querySelector('[data-sveltekit-preload-data]') ||
          document.querySelector('[class*="svelte-"]')) return 'svelte';
      if (window.Ember) return 'ember';
      if (document.querySelector('[q\\:container]')) return 'qwik';
      if (document.querySelector('meta[name="generator"][content*="Mintlify" i]') || document.querySelector('#content-area')) {
        const gen = (document.querySelector('meta[name="generator"]') || {}).content || '';
        if (/mintlify/i.test(gen) || document.querySelector('[class*="mintlify"]')) return 'mintlify';
      }
      {
        const gen = (document.querySelector('meta[name="generator"]') || {}).content || '';
        if (/gitbook/i.test(gen) || /\.gitbook\.io$/.test(location.hostname)) return 'gitbook';
      }
      if (document.querySelector('.scalar-api-reference, .scalar-app, [data-scalar]')) return 'scalar';
      if (document.querySelector('redoc, rapi-doc, .redoc-wrap')) return 'redoc';
      if (document.querySelector('meta[name="readme-deploy"]') || document.querySelector('.rm-Article, .rm-LandingPage, .rm-ReferenceMain')) return 'readme';

      const reactCandidates = document.querySelectorAll('#root, #app, #__next, [data-reactroot], body > div');
      for (const el of reactCandidates) {
        if (el._reactRootContainer || Object.keys(el).some(k => k.startsWith('__reactContainer$'))) return 'react';
      }

      const mounts = document.querySelectorAll('#root, #app, [data-reactroot]');
      if (mounts.length > 0) {
        const bodyText = (document.body.innerText || '').trim();
        if (bodyText.length < 500) return 'generic_spa';
      }

      return null;
    })()
  JS
end
81
+
82
# JavaScript (an IIFE) that reports whether the page looks hydrated for the
# given +framework+ (interpolated into the script as a string literal).
#
# More than 500 characters of main-content text always counts as done.
# Otherwise a framework-specific signal is checked; unknown frameworks fall
# through to `true`.
#
# Changes relative to the naive form:
# * This heredoc interpolates, so Ruby also processes backslash escapes in
#   it. The Qwik selector needs four backslashes here to reach the browser
#   as `'[q\\:container]'` (a CSS-escaped colon); with two it arrives as the
#   invalid selector `[q:container]` and querySelector throws.
# * `document.body` is guarded, and the `|| ''` fallback now applies to
#   whichever text source was chosen, so a missing body or empty innerText
#   yields an empty string instead of a TypeError.
def spa_hydration_completion_script(framework)
  <<~JS
    (() => {
      const mainContent = document.querySelector('main, [role="main"], article, .content, .docs-content, #content');
      const bodyText = document.body ? document.body.innerText : '';
      const mainText = ((mainContent ? mainContent.innerText : bodyText) || '').trim();

      if (mainText.length > 500) return true;

      switch ('#{framework}') {
        case 'next': {
          const mount = document.getElementById('__next');
          if (mount && Object.keys(mount).some(k => k.startsWith('__reactFiber$'))) return true;
          if (window.next && window.next.version) {
            const bodyKids = document.body ? document.body.children : [];
            for (let i = 0; i < bodyKids.length; i++) {
              if (Object.keys(bodyKids[i]).some(k => k.startsWith('__reactFiber$'))) return true;
            }
          }
          return false;
        }
        case 'nuxt': {
          const mount = document.getElementById('__nuxt');
          return !!(mount && mount.__vue_app__ && mount.__vue_app__._instance);
        }
        case 'vue': {
          const mount = document.querySelector('#app, #root');
          return !!(mount && (mount.__vue_app__ ? mount.__vue_app__._instance : mount.__vue__));
        }
        case 'react': {
          const candidates = document.querySelectorAll('#root, #app, [data-reactroot], body > div');
          for (const el of candidates) {
            if (Object.keys(el).some(k => k.startsWith('__reactFiber$'))) return true;
          }
          return false;
        }
        case 'angular': {
          const hasNgh = document.querySelectorAll('[ngh]').length > 0;
          if (hasNgh) return false;
          return !!document.querySelector('[ng-version]');
        }
        case 'svelte': {
          return !!document.getElementById('svelte-announcer') ||
                 !!document.querySelector('[class*="svelte-"]');
        }
        case 'remix': {
          const candidates = document.querySelectorAll('#root, #app, body > div');
          for (const el of candidates) {
            if (Object.keys(el).some(k => k.startsWith('__reactFiber$'))) return true;
          }
          return !!window.__remixContext;
        }
        case 'ember': {
          return !!window.Ember && !!document.querySelector('.ember-view');
        }
        case 'qwik': {
          return !!document.querySelector('[q\\\\:container]');
        }
        case 'mintlify': {
          const contentArea = document.querySelector('#content-area, [id="content-area"]');
          if (contentArea && (contentArea.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'gitbook': {
          const gbMain = document.querySelector('main');
          if (gbMain && (gbMain.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'scalar': {
          const scalarRef = document.querySelector('.scalar-api-reference, .scalar-app');
          if (scalarRef && (scalarRef.innerText || '').trim().length > 500) return true;
          return false;
        }
        case 'redoc': {
          const apiContent = document.querySelector('.api-content, .redoc-wrap [role="main"], .redoc-wrap');
          if (apiContent && (apiContent.innerText || '').trim().length > 500) return true;
          return false;
        }
        case 'readme': {
          const rmArticle = document.querySelector('.rm-Article, .rm-LandingPage, article .markdown-body');
          if (rmArticle && (rmArticle.innerText || '').trim().length > 100) return true;
          return false;
        }
        case 'gatsby':
        case 'docusaurus': {
          const mountId = '#{framework}' === 'gatsby' ? '___gatsby' : '__docusaurus';
          const mount = document.getElementById(mountId);
          return !!(mount && Object.keys(mount).some(k => k.startsWith('__reactFiber$')));
        }
        case 'generic_spa': {
          return mainText.length > 200;
        }
        default:
          return true;
      }
    })()
  JS
end
179
+ end
180
+ # rubocop:enable Metrics/ModuleLength
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module Stabilization
6
+ autoload :PageFlow, "fetch_util/browser/stabilization/page_flow"
7
+ autoload :SpaHydration, "fetch_util/browser/stabilization/spa_hydration"
8
+
9
+ include PageFlow
10
+ include SpaHydration
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,135 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ferrum"
4
+ require "json"
5
+ require "uri"
6
+
7
+ module FetchUtil
8
+ class Browser
9
+ autoload :InteractionHelpers, "fetch_util/browser/interaction_helpers"
10
+ autoload :Navigation, "fetch_util/browser/navigation"
11
+ autoload :SiteStabilization, "fetch_util/browser/site_stabilization"
12
+ autoload :Stabilization, "fetch_util/browser/stabilization"
13
+
14
+ include InteractionHelpers
15
+ include Navigation
16
+ include SiteStabilization
17
+ include Stabilization
18
+
19
+ # Prefer full Chromium over `headless_shell`: the full browser stays closer
20
+ # to standard Chromium behavior and exposes APIs that some sites expect,
21
+ # which improves extraction consistency. `headless_shell` diverges more
22
+ # often and can change page behavior in ways that degrade extraction.
23
BROWSER_CANDIDATES = [
  "/usr/bin/chromium-browser",
  "/usr/bin/chromium",
  "/usr/bin/google-chrome",
  "/usr/lib64/chromium-browser/headless_shell"
].freeze

# Defaults for the constructor's viewport, user_agent and accept_language
# keyword arguments.
DEFAULT_VIEWPORT = { width: 1366, height: 900 }.freeze
DEFAULT_USER_AGENT = [
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
  "(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
].join(" ").freeze
DEFAULT_ACCEPT_LANGUAGE = "en-US,en;q=0.9"

# Timing and retry tunables. POST_CONSENT_IDLE_TIMEOUT is the timeout of
# the network-idle wait in stabilize_page; SPA_HYDRATION_TIMEOUT/POLL are
# the polling budget and interval of wait_for_spa_hydration.
SOCIAL_LOGIN_PHASE_WAIT = 0.3
POST_CONSENT_IDLE_TIMEOUT = 3.0
SPA_HYDRATION_TIMEOUT = 2.0
SPA_HYDRATION_POLL = 0.15

# Minimum body-text length (characters) for page_has_content?.
CONTENT_READY_MIN_LENGTH = 200

# Number of navigation retries with_page allows after the first attempt.
NAVIGATION_MAX_RETRIES = 2
40
+
41
def initialize(timeout: 20, wait: 0.75, wait_for_idle: true, idle_duration: 0.35,
               viewport: DEFAULT_VIEWPORT, user_agent: DEFAULT_USER_AGENT,
               accept_language: DEFAULT_ACCEPT_LANGUAGE, browser_path: nil,
               browser_options: nil)
  # Timing settings are normalised to floats.
  @timeout = timeout.to_f
  @wait = wait.to_f
  @idle_duration = idle_duration.to_f
  @wait_for_idle = wait_for_idle

  # Request identity; a partial viewport is completed from the defaults.
  @user_agent = user_agent
  @accept_language = accept_language
  @viewport = DEFAULT_VIEWPORT.merge(symbolize_hash(viewport || {}))

  # Binary resolution order: explicit argument, BROWSER_PATH environment
  # variable, then the first executable among BROWSER_CANDIDATES.
  @browser_path = browser_path || ENV["BROWSER_PATH"] ||
                  BROWSER_CANDIDATES.find { |candidate| File.executable?(candidate) }
  @full_browser = @browser_path && !@browser_path.include?("headless_shell")

  # With a full browser binary, use the newer headless mode for closer
  # runtime parity with standard Chromium and override Ferrum's
  # `enable-automation` default to reduce tool-specific browser-state
  # differences during extraction. Caller-supplied options are merged last
  # and therefore win.
  launch_flags = { "no-sandbox": nil }
  launch_flags.update("headless" => "new", "enable-automation" => false) if @full_browser
  @browser_options = launch_flags.merge(browser_options || {})

  # The Chromium process is started lazily; the mutex guards @ferrum.
  @ferrum = nil
  @mutex = Mutex.new
end
67
+
68
+ # Navigate to +url+ in a fresh browser tab, stabilize the page, then yield
69
+ # the +Ferrum::Page+ to the caller. The page is closed after the block
70
+ # returns (or on error), but the underlying Chromium process is kept alive
71
+ # for reuse by subsequent calls.
72
def with_page(url)
  raise BrowserError, "No Chromium browser found. Set BROWSER_PATH or install Chromium." unless @browser_path

  page = ensure_browser.create_page
  page.headers.set(default_headers)
  page.bypass_csp

  attempts = 0
  begin
    attempts += 1
    page.go_to(url)
  rescue Ferrum::PendingConnectionsError, Ferrum::TimeoutError
    # A navigation error is tolerated when the page loaded far enough;
    # otherwise navigation is retried, for at most
    # NAVIGATION_MAX_RETRIES + 1 attempts in total, before re-raising.
    unless page_loaded_enough?(page)
      retry if attempts <= NAVIGATION_MAX_RETRIES
      raise
    end
  end

  stabilize_page(page, url)
  yield page
rescue Ferrum::Error => e
  # Callers only ever see BrowserError, never raw Ferrum errors.
  raise BrowserError, e.message
ensure
  page&.close
end
97
+
98
+ # Shut down the underlying Chromium process. Safe to call multiple times or
99
+ # when no browser has been started yet. After +quit+, a subsequent
100
+ # +with_page+ call will transparently launch a new process.
101
def quit
  @mutex.synchronize do
    @ferrum&.quit
    nil
  ensure
    # Drop the handle even when shutdown raises (for example because the
    # process is already gone); otherwise the stale browser would be
    # handed out again by ensure_browser instead of a fresh one.
    @ferrum = nil
  end
end
107
+
108
+ private
109
+
110
+ # Lazily start the shared Chromium process on first use. The
111
+ # +evaluate_on_new_document+ call registers the navigator patch once;
112
+ # Ferrum automatically applies it to every new page/context created
113
+ # afterwards.
114
def ensure_browser
  @mutex.synchronize do
    return @ferrum if @ferrum

    browser = Ferrum::Browser.new(
      headless: true,
      browser_path: @browser_path,
      timeout: @timeout,
      window_size: [@viewport.fetch(:width), @viewport.fetch(:height)],
      browser_options: @browser_options
    )

    begin
      browser.evaluate_on_new_document(navigator_patch)
    rescue StandardError
      # Never cache (or leak) a Chromium process that did not receive the
      # navigator patch: shut it down and let the next call start afresh.
      browser.quit
      raise
    end

    # Publish the instance only once it is fully configured.
    @ferrum = browser
  end
end
129
+
130
# Whether the host of +url+ (as normalised by FetchUtil.strip_www_host) is
# exactly +host+ or one of its subdomains.
def host_matches?(url, host)
  candidate = FetchUtil.strip_www_host(url)
  return true if candidate == host

  candidate.end_with?(".#{host}")
end
134
+ end
135
+ end