fetch_util 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +97 -0
  4. data/CHANGELOG.md +48 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +199 -0
  7. data/Rakefile +18 -0
  8. data/SKILL.md +92 -0
  9. data/exe/fetch_util +6 -0
  10. data/lib/fetch_util/assets/extract.js +1 -0
  11. data/lib/fetch_util/assets/vendor/readability.js +2314 -0
  12. data/lib/fetch_util/assets/vendor/turndown.js +974 -0
  13. data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
  14. data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
  15. data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
  16. data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
  17. data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
  18. data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
  19. data/lib/fetch_util/browser/navigation.rb +13 -0
  20. data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
  21. data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
  22. data/lib/fetch_util/browser/site_stabilization.rb +13 -0
  23. data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
  24. data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
  25. data/lib/fetch_util/browser/stabilization.rb +13 -0
  26. data/lib/fetch_util/browser.rb +135 -0
  27. data/lib/fetch_util/cli.rb +124 -0
  28. data/lib/fetch_util/extractor.rb +56 -0
  29. data/lib/fetch_util/fetcher.rb +242 -0
  30. data/lib/fetch_util/parallel_fetcher.rb +97 -0
  31. data/lib/fetch_util/raw_docs_fallback.rb +260 -0
  32. data/lib/fetch_util/regulatory/cache_store.rb +92 -0
  33. data/lib/fetch_util/regulatory/directives.rb +106 -0
  34. data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
  35. data/lib/fetch_util/regulatory/headers.rb +39 -0
  36. data/lib/fetch_util/regulatory/http_client.rb +70 -0
  37. data/lib/fetch_util/regulatory/human.rb +104 -0
  38. data/lib/fetch_util/regulatory/orchestration.rb +82 -0
  39. data/lib/fetch_util/regulatory/page.rb +70 -0
  40. data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
  41. data/lib/fetch_util/regulatory/robots.rb +117 -0
  42. data/lib/fetch_util/regulatory/signals.rb +106 -0
  43. data/lib/fetch_util/regulatory/source_selection.rb +60 -0
  44. data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
  45. data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
  46. data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
  47. data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
  48. data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
  49. data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
  50. data/lib/fetch_util/regulatory.rb +74 -0
  51. data/lib/fetch_util/request_log.rb +24 -0
  52. data/lib/fetch_util/result.rb +58 -0
  53. data/lib/fetch_util/searcher/result_filtering.rb +102 -0
  54. data/lib/fetch_util/searcher.rb +332 -0
  55. data/lib/fetch_util/version.rb +5 -0
  56. data/lib/fetch_util.rb +115 -0
  57. metadata +145 -0
@@ -0,0 +1,224 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module InteractionHelpers
6
+ # rubocop:disable Metrics/ModuleLength
7
+ module ConsentHelpers
8
# Exact button captions tried first when dismissing a privacy-preference
# overlay (passed as the primary label group to click_visible_button_by_text).
CONSENT_ACTION_LABELS = [
  # English
  "Accept All",
  "Allow all",
  "Confirm My Choices",
  "Save preferences",
  "Customize Choices",
  # German
  "Alle akzeptieren",
  "Alles akzeptieren",
  "Alle zulassen",
  # French
  "Tout accepter",
  "Accepter tout",
  "Autoriser tout",
  # Spanish
  "Aceptar todo",
  "Aceptar todas",
  # Italian
  "Accetta tutto",
  "Accetta tutti",
  # Portuguese
  "Aceitar tudo",
  "Aceitar todos",
  "Aceitar todos os cookies",
  # Dutch
  "Alles accepteren",
  "Accepteer alles",
  # Norwegian
  "Godta alle",
  "Aksepter alle",
  "Godkjenn alle",
  # Swedish
  "Acceptera alla",
  "Godkänn alla",
  "Tillåt alla",
  # Danish
  "Accepter alle",
  "Tillad alle",
  # Finnish
  "Hyväksy kaikki",
  "Salli kaikki",
  # Lithuanian
  "Priimti visus",
  "Leisti visus",
  # Latvian
  "Pieņemt visus",
  "Atļaut visus",
  # Czech
  "Přijmout vše",
  "Povolit vše",
  # Polish
  "Zaakceptuj wszystkie",
  "Akceptuję",
  # Hungarian
  "Mindent elfogadok",
  "Összes elfogadása",
  "Elfogadom",
  # Romanian
  "Acceptă tot",
  "Acceptă toate",
  # Macedonian / Serbian
  "Прифати сите",
  "Прихвати све",
  # Japanese
  "すべて受け入れる",
  "すべて許可",
  # Korean / Chinese
  "모두 허용",
  "全部接受"
].freeze

# Captions used as the fallback label group (passed second to
# click_visible_button_by_text) when no primary caption is found.
CONSENT_FALLBACK_LABELS = [
  # English
  "Essential only",
  "Reject All",
  # German / French / Spanish
  "Alle ablehnen",
  "Tout refuser",
  "Rechazar todo",
  # Italian / Portuguese / Dutch
  "Rifiuta tutto",
  "Rejeitar tudo",
  "Alles weigeren",
  # Norwegian / Swedish / Danish
  "Avvis alle",
  "Avvisa alla",
  "Afvis alle",
  # Finnish / Lithuanian / Latvian
  "Hylkää kaikki",
  "Atmesti visus",
  "Noraidīt visus",
  # Czech / Polish
  "Odmítnout vše",
  "Odrzuć wszystkie",
  # Hungarian / Romanian
  "Összes elutasítása",
  "Respinge tot",
  # Macedonian / Serbian
  "Одбиј ги сите",
  "Одбиј све"
].freeze

# Both lists are implementation details of this module.
private_constant(*%i[CONSENT_ACTION_LABELS CONSENT_FALLBACK_LABELS])
41
+
42
+ private
43
+
44
# Detects a cookie/consent prompt on +page+ and tries to get rid of it.
#
# The injected script, in order:
#   1. returns false unless a known consent-vendor element, a dialog element,
#      or consent-like body text is present;
#   2. clicks every visible button-like element whose caption matches the
#      multilingual accept pattern and that sits in a consent context;
#   3. if nothing was clicked, clicks matching <div>/<span> captions inside
#      consent containers, stopping after the first container that yields one;
#   4. removes visible consent dialogs and fixed/sticky consent overlays;
#   5. re-enables page scrolling when anything was clicked or removed.
#
# NOTE: the script lives in an interpolating heredoc, so every regex backslash
# must be doubled. A single "\s" is turned into a plain space by Ruby, and
# "\(" into a bare "(", before the browser ever sees the script.
#
# @param page the browser page handed to +safe_evaluate+
# @return whatever +safe_evaluate+ returns: the script's boolean result
#   (true when something was clicked or removed)
def accept_cookie_consent(page)
  safe_evaluate(page, <<~JS)
    (() => {
      const quickIndicator = document.querySelector(#{consent_quick_indicator_selector_js});
      const bodyPreview = ((document.body && (document.body.textContent || document.body.innerText)) || '')
        .replace(/\\s+/g, ' ')
        .trim()
        .slice(0, 4000)
        .toLowerCase();
      const bodyLooksLikeConsent = /(cookie|privacy|consent|personal data|personalized ads|personalized content|vendors want your permission|configurações avançadas de cookies|declaração de cookies|gerenciar cookies|utilizamos cookies|dados pessoais|informasjonskapsler|personvern|kakor|sekretess|samtycke|evästeet|evästeasetukset|tietosuoja|slapukai|slapukų|privatumas|sīkdatnes|sīkfailus|privātums|sütiket|adatvédelem|cookie-uri|confidențialitate|колачиња|приватност)/.test(bodyPreview) &&
        /(accept|reject|manage|allow|agree|consent|cookies|privacy|more options|aceitar|rejeitar|gerenciar|godta|aksepter|godkjenn|godkänn|acceptera|tillåt|hyväksy|salli|priimti|sutinku|leisti|pieņemt|piekrītu|atļaut|elfogad|összes|acceptă|sunt de acord|прифати|се согласувам)/.test(bodyPreview);
      const hasConsentDialog = quickIndicator ||
        document.querySelector('[role="dialog"][aria-modal="true"]') ||
        document.querySelector('dialog') ||
        bodyLooksLikeConsent;
      if (!hasConsentDialog) return false;

      // Whole-caption match for "accept"-style buttons in many languages.
      const pattern = /^(accept(?: all(?: cookies?)?)?|allow all(?: cookies)?|allow cookies|agree(?: to cookies| and continue| & continue)?|i agree|ok(?:ay)?|accept & continue|continue with cookies|consent|got it|i understand|continue|accept and close|close|accept recommended settings|przejdź do serwisu|zaakceptuj(?:\\s+(?:wszystkie|wszystko))?|akceptuję|zgadzam się|zgoda|akzeptieren|alle akzeptieren|zustimmen|accepter (?:tout|les cookies|et continuer)|tout accepter|j'accepte|accepter|aceptar (?:todo|todas|cookies)|acepto|accetta (?:tutto|tutti)|accetto|aceitar(?:\\s+(?:tudo|todos|cookies|todos os cookies))?|aceitar cookies|aceitar todos os cookies|aceitar e continuar|aceitar e fechar|aceito|accetta e continua|akkoord|ga akkoord|alles accepteren|accepteer alles|すべて受け入れる|すべて許可|同意する|同意して閉じる|쿠키 허용|모두 허용|동의하고 계속|동의|接受全部|全部接受|同意并继续|接受并继续|принять все|согласен|согласиться|souhlasím|přijmout vše|přijmout(?:\\s+(?:všechny|vše))?|povolit vše|souhlasit|godkänn alla|acceptera alla|tillåt alla|jag godkänner|godkänn|accepter alle|tillad alle|jeg accepterer|acceptér|godta alle|tanggap semua|setuju|terima semua|godkjenn alle|aksepter alle|aksepter|jeg godtar|godta|accepta-ho tot|accepta|accepto|d'acord|razumem|prihvatam|прихватам|прихвати све|qəbul edirəm|ყველას მიღება|සියල්ල පිළිගන්න|පිළිගන්න|همه را بپذیرید|ተቀበል|ሁሉንም ተቀበል|allow all|confirm my choices|pokračovat|hyväksy(?:\\s+kaikki)?|salli kaikki|salli evästeet|hyväksyn|priimti visus|sutinku|leisti visus|прифати(?:\\s+(?:ги\\s+)?сите)?|се согласувам|acceptă(?:\\s+tot(?:ul)?)?|accept toate|acceptă toate|sunt de acord|pieņemt(?:\\s+visus)?|piekrītu|atļaut(?:\\s+visus)?|apstiprināt|elfogad(?:om)?|mindent elfogad(?:ok)?|összes elfogadása|elfogadom az összeset|hozzájárulok)(?:\\s+and\\s+.*)?$/i;
      // Text or attribute fragments that mark an element as consent-related.
      const consentPattern = /(cookie|cookies|privacy|consent|gdpr|ccpa|onetrust|before you continue|we use cookies and data|device identifiers|personalized ads|personalized content|trusted third party partners?|privacy preference center|your privacy settings|your privacy choices|manage privacy preferences|manage consent preferences|cookie information|cookie list|cookies details|list of partners(?: \\(vendors\\))?|configurações avançadas de cookies|declaração de cookies|gerenciar cookies|utilizamos cookies|dados pessoais|pliki cookie|datenschutz|données personnelles|datos personales|dati personali|wish to store|access information on your devices|preferenze cookie|クッキー|Cookieプリファレンス|Cookie設定|同意設定|쿠키|동의|개인정보|接受|隐私设置|cookie 偏好设置|файлы cookie|настройки cookie|souhlas|personalizac|soukromí|nastavení souhlasu|kakor|sekretess|samtycke|cookies og data|privatlivs|samtykke|privatliv|personvern|informasjonskapsler|informasjonskapslar|aller media|dine data|galetes|protecció de dades|política de privadesa|ግላዊነት|ኩኪ|ኩኪዎች|pro pokračování vyberte|technické cookies|jakou formou vám máme zobrazovat obsah|evästeet|evästeasetukset|tietosuoja|yksityisyys|hyväksy evästeet|slapukai|privatumas|slapukų nustatymai|kolačinji|приватност|поставки за колачиња|cookie-uri|confidențialitate|setări cookie|protecția datelor|sīkdatnes|sīkfailus|privātums|privātuma iestatījumi|sīkdatņu iestatījumi|sütiket|adatvédelem|adatvédelmi beállítások|süti beállítások)/i;
      const containerPattern = /(cookie|consent|privacy|onetrust|cookiebot|usercentrics|trustarc|didomi|quantcast|gdpr|ccpa)/i;
      #{js_dom_helpers}
      // Walks up the tree, crossing shadow-root boundaries via the host element.
      const parentOrHost = (node) => {
        if (!node) return null;
        if (node.parentElement) return node.parentElement;
        const root = node.getRootNode && node.getRootNode();
        return root && root.host ? root.host : null;
      };
      const candidates = queryAllRoots('button, [role="button"], a, input[type="button"], input[type="submit"]');

      // True when the element or one of its nearest ancestors (8 levels max)
      // looks consent-related; otherwise falls back to the page-level signal.
      const consentContext = (el) => {
        let node = el;
        for (let depth = 0; node && depth < 8; depth += 1, node = parentOrHost(node)) {
          const text = textFor(node);
          const attrs = [node.id, node.className, node.getAttribute('aria-label'), node.getAttribute('data-testid')]
            .filter(Boolean)
            .join(' ');
          if (consentPattern.test(text + ' ' + attrs)) return true;
        }
        return bodyLooksLikeConsent;
      };

      const consentAttrs = (el) => [el.id, el.className, el.getAttribute('aria-label'), el.getAttribute('data-testid')]
        .filter(Boolean)
        .join(' ');

      let clicked = false;
      for (const el of candidates) {
        const text = textFor(el);
        if (!text || !visible(el) || !pattern.test(text) || !consentContext(el)) continue;
        el.click();
        clicked = true;
      }

      // Fallback: some banners use plain <div>/<span> elements as buttons.
      if (!clicked) {
        const consentContainerSel = #{consent_container_selector_js};
        for (const container of queryAllRoots(consentContainerSel)) {
          if (!visible(container)) continue;
          for (const el of container.querySelectorAll('div, span')) {
            const text = textFor(el);
            if (!text || !visible(el) || !pattern.test(text)) continue;
            if (text.length > 120) continue;
            el.click();
            clicked = true;
          }
          if (clicked) break;
        }
      }

      for (const el of queryAllRoots('[role="dialog"], [aria-modal="true"], dialog')) {
        const attrs = consentAttrs(el);
        if (!consentContext(el) && !containerPattern.test(attrs)) continue;
        if (!visible(el)) continue;
        el.remove();
        clicked = true;
      }

      const consentOverlaySelector = #{consent_overlay_selector_js};
      for (const el of queryAllRoots(consentOverlaySelector)) {
        if (el.matches('[role="dialog"], [aria-modal="true"], dialog')) continue;
        const style = window.getComputedStyle(el);
        if (!/fixed|sticky/.test(style.position)) continue;
        if (!visible(el)) continue;
        el.remove();
        clicked = true;
      }

      if (clicked) {
        restoreScroll();
      }

      return clicked;
    })()
  JS
end
138
+
139
# Clicks a consent button when a privacy-preference overlay is on screen.
#
# A detection script first checks for a consent-vendor element and for
# preference-centre wording in the page text. Only when both are present is
# a visible button whose caption is in CONSENT_ACTION_LABELS (or, failing
# that, CONSENT_FALLBACK_LABELS) clicked.
#
# @param page the browser page to inspect
# @return false when no overlay is detected, otherwise the result of
#   click_visible_button_by_text
def dismiss_privacy_preference_overlay(page)
  detection_script = <<~'JS'
    (() => {
      if (!document.querySelector(
        "[id*='onetrust' i], [class*='onetrust' i], " +
        "[id*='cookiebot' i], [class*='cookiebot' i], " +
        "[id*='cookie-consent' i], [class*='cookie-consent' i], " +
        "[id*='cookie_consent' i], [class*='cookie_consent' i], " +
        "[id*='cookieconsent' i], [class*='cookieconsent' i], " +
        "[id*='cookie-banner' i], [class*='cookie-banner' i], " +
        "[id*='cookie-notice' i], [class*='cookie-notice' i], " +
        "[id*='privacy-banner' i], [class*='privacy-banner' i], " +
        "[id*='privacy_banner' i], [class*='privacy_banner' i], " +
        "[id*='privacy-preference' i], [class*='privacy-preference' i], " +
        "[id*='gdpr' i], [class*='gdpr' i]"
      )) return false;
      const text = ((document.body && document.body.innerText) || '').toLowerCase()
      if (!text) return false
      if (!/(privacy preference center|your privacy settings|your privacy choices|manage privacy preferences|manage consent preferences|cookie information|cookie list|cookies details|list of partners(?: \(vendors\))?|personverninnstillinger|informasjonskapsler|sekretessinställningar|kakor|evästeasetukset|tietosuoja|slapukų nustatymai|privatumo nustatymai|sīkdatņu iestatījumi|privātuma iestatījumi|adatvédelmi beállítások|süti beállítások|setări cookie|nastavení souhlasu)/i.test(text)) return false
      return true
    })()
  JS
  return false unless safe_evaluate(page, detection_script, default: false)

  clickable = 'button, [role="button"], a, input[type="button"], input[type="submit"]'
  click_visible_button_by_text(page, CONSENT_ACTION_LABELS, CONSENT_FALLBACK_LABELS, selectors: clickable)
end
170
+
171
# Builds the CSS selector used as a cheap "is a consent tool present?" probe.
#
# For every marker below, elements whose id or class contains it
# (case-insensitively) are matched.
#
# @return [String] the selector list wrapped in double quotes (via #inspect),
#   ready to be interpolated into a script as a string literal
def consent_quick_indicator_selector_js
  markers = %w[
    onetrust cookiebot usercentrics trustarc didomi quantcast
    cookie-consent cookie_consent cookieconsent
    cookie-banner cookie_banner cookie-notice
    gdpr ccpa
    privacy-banner privacy_banner
  ]

  markers
    .flat_map { |marker| %w[id class].map { |attribute| %([#{attribute}*="#{marker}" i]) } }
    .join(", ")
    .inspect
end
192
+
193
# Builds the CSS selector for generic consent containers: any element whose
# id or class contains one of the keywords below (case-insensitively).
#
# @return [String] the selector list wrapped in double quotes (via #inspect),
#   ready to be interpolated into a script as a string literal
def consent_container_selector_js
  keywords = %w[cookie consent privacy gdpr ccpa]

  keywords
    .flat_map { |keyword| %w[id class].map { |attribute| %([#{attribute}*="#{keyword}" i]) } }
    .join(", ")
    .inspect
end
203
+
204
# Builds the CSS selector for consent overlays: generic consent keywords plus
# consent-vendor names, each matched against id and class
# (case-insensitively).
#
# @return [String] the selector list wrapped in double quotes (via #inspect),
#   ready to be interpolated into a script as a string literal
def consent_overlay_selector_js
  id_and_class = %w[
    cookie consent privacy gdpr ccpa
    onetrust cookiebot usercentrics trustarc didomi
  ]

  parts = id_and_class.flat_map do |keyword|
    %w[id class].map { |attribute| %([#{attribute}*="#{keyword}" i]) }
  end
  # quantcast is matched by id only, with no class selector.
  # NOTE(review): confirm this asymmetry is intentional.
  parts << '[id*="quantcast" i]'

  parts.join(", ").inspect
end
220
+ end
221
+ # rubocop:enable Metrics/ModuleLength
222
+ end
223
+ end
224
+ end
@@ -0,0 +1,162 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module InteractionHelpers
6
+ # rubocop:disable Metrics/ModuleLength
7
+ module DomInteraction
8
+ private
9
+
10
# Clicks the first visible element whose text exactly equals (ignoring case)
# one of the given labels.
#
# Every primary label is tried before any fallback label; empty label groups
# are dropped before the script is built.
#
# @param page the browser page to act on
# @param primary_labels [Array<String>, String] preferred captions
# @param fallback_labels [Array<String>, String] captions tried afterwards
# @param selectors [String] CSS selector for clickable candidates
# @return the result of safe_evaluate (the script yields true after a click,
#   false when nothing matched)
def click_visible_button_by_text(page, primary_labels, fallback_labels = [], selectors: 'button, [role="button"]')
  label_groups = [primary_labels, fallback_labels]
                 .map { |labels| Array(labels) }
                 .reject(&:empty?)

  script = <<~JS
    (() => {
      const labelGroups = #{JSON.generate(label_groups)};
      #{js_dom_helpers}
      const buttons = queryAllRoots(#{selectors.to_json});

      for (const labels of labelGroups) {
        const allowed = new Set(labels.map((label) => String(label).toLowerCase()));
        for (const button of buttons) {
          if (!visible(button) || !allowed.has(textFor(button).toLowerCase())) continue;
          button.click();
          return true;
        }
      }

      return false;
    })()
  JS

  safe_evaluate(page, script)
end
32
+
33
# Closes or removes a dialog/overlay on +page+.
#
# The script collects nodes matching +dialog_selectors+ whose text matches
# +dialog_pattern+, plus nodes matching +overlay_selectors+ whose text also
# contains "log in" or "sign up". It then clicks the first visible close
# button that lies inside one of those nodes (any close button qualifies
# when no node was collected) and whose label passes +close_label_pattern+.
# When no button is clicked, the collected nodes are removed from the DOM.
#
# @param page the browser page to act on
# @param close_selectors [Array<String>, String] selectors for close buttons
# @param dialog_selectors [Array<String>, String] selectors for dialogs
# @param dialog_pattern pattern source fed to the script's RegExp constructor
# @param overlay_selectors [Array<String>, String] selectors for overlays
# @param close_label_pattern pattern source a close-button label must match,
#   or nil to accept any label
# @param allow_empty_close_label [Boolean] accept label-less close buttons
# @return the result of safe_evaluate (the script yields true when a button
#   was clicked or a node removed)
def dismiss_overlay_dialog(page, close_selectors:, dialog_selectors:, dialog_pattern:, overlay_selectors: [], close_label_pattern: nil,
                           allow_empty_close_label: false)
  config_json = JSON.generate(
    closeSelectors: Array(close_selectors),
    dialogSelectors: Array(dialog_selectors),
    overlaySelectors: Array(overlay_selectors),
    dialogPattern: dialog_pattern,
    closeLabelPattern: close_label_pattern,
    allowEmptyCloseLabel: allow_empty_close_label
  )

  script = <<~JS
    (() => {
      const config = #{config_json};
      const dialogPattern = new RegExp(config.dialogPattern || '', 'i');
      const closeLabelPattern = config.closeLabelPattern ? new RegExp(config.closeLabelPattern, 'i') : null;
      #{js_dom_helpers}

      const matchingNodes = [];
      const collectMatchingNodes = (selectors, requireOverlayPrompt) => {
        const selectorText = (selectors || []).join(', ');
        if (!selectorText) return;

        queryAllRoots(selectorText).forEach((node) => {
          const text = textFor(node).slice(0, 2000);
          if (!dialogPattern.test(text)) return;
          if (requireOverlayPrompt && !/log in|sign up/i.test(text)) return;
          if (!matchingNodes.includes(node)) matchingNodes.push(node);
        });
      };

      collectMatchingNodes(config.dialogSelectors, false);
      collectMatchingNodes(config.overlaySelectors, true);

      const withinMatchingNode = (node) => {
        if (!node || matchingNodes.length === 0) return true;
        return matchingNodes.some((container) => container === node || container.contains(node));
      };

      const clickCloseButton = () => {
        const selectorText = (config.closeSelectors || []).join(', ');
        if (!selectorText) return false;
        const candidates = queryAllRoots(selectorText);

        for (const candidate of candidates) {
          const button = candidate.closest('button, [role="button"]') || candidate;
          if (!button || !visible(button)) continue;
          if (!withinMatchingNode(candidate) && !withinMatchingNode(button)) continue;

          const label = textFor(button).toLowerCase();
          const labelMatches = !closeLabelPattern || closeLabelPattern.test(label) || (config.allowEmptyCloseLabel && label === '');
          if (!labelMatches) continue;

          button.click();
          restoreScroll();
          return true;
        }

        return false;
      };

      if (clickCloseButton()) return true;

      let removed = false;
      matchingNodes.forEach((node) => {
        node.remove();
        removed = true;
      });

      if (removed) restoreScroll();
      return removed;
    })()
  JS

  safe_evaluate(page, script)
end
107
+
108
# Returns the JavaScript helper definitions shared by the injected scripts.
#
# Helpers defined in the returned snippet:
#   queryAllRoots(selectors) - querySelectorAll across the document and every
#                              open shadow root reachable from it
#   visible(el)              - non-empty box, not hidden, not display:none
#   textFor(el)              - first non-blank value among innerText,
#                              textContent, value, aria-label and title,
#                              with whitespace collapsed
#   restoreScroll()          - sets overflow to 'auto' on body and <html>
#
# Backslashes are doubled because this is an interpolating heredoc.
#
# @return [String] JavaScript source to interpolate into a larger script
def js_dom_helpers
  <<~JS
    const queryAllRoots = (selectors) => {
      const matches = [];
      const queue = [document];
      while (queue.length) {
        const root = queue.shift();
        if (!root || !root.querySelectorAll) continue;
        root.querySelectorAll(selectors).forEach((el) => matches.push(el));
        root.querySelectorAll('*').forEach((el) => {
          if (el.shadowRoot) queue.push(el.shadowRoot);
        });
      }
      return matches;
    };

    const visible = (el) => {
      const rect = el.getBoundingClientRect();
      const style = window.getComputedStyle(el);
      return rect.width > 0 && rect.height > 0 && style.visibility !== 'hidden' && style.display !== 'none';
    };

    const textFor = (el) => {
      // el.value is a number on <li value>, <meter> and <progress>; coerce
      // to a string so .replace cannot throw on such elements.
      const values = [el.innerText, el.textContent, el.value, el.getAttribute('aria-label'), el.getAttribute('title')]
        .filter(Boolean)
        .map((value) => String(value).replace(/\\s+/g, ' ').trim())
        .filter(Boolean);
      return values[0] || '';
    };

    const restoreScroll = () => {
      if (document.body) document.body.style.overflow = 'auto';
      if (document.documentElement) document.documentElement.style.overflow = 'auto';
    };
  JS
end
144
+
145
# Evaluates +script+ on +page+, swallowing browser-side failures.
#
# @param page object responding to #evaluate
# @param script [String] JavaScript source to run
# @param default value returned when evaluation raises
#   Ferrum::JavaScriptError or Ferrum::TimeoutError
# @return the evaluation result, or +default+ on those errors
def safe_evaluate(page, script, default: false)
  outcome = default
  begin
    outcome = page.evaluate(script)
  rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
    # Best effort: keep the caller-supplied default.
  end
  outcome
end
150
+
151
# Returns a new Hash with every key of +hash+ converted via #to_sym.
# Values are copied as-is (no deep conversion).
#
# @param hash [#each] key/value pairs
# @return [Hash] a fresh hash keyed by symbols
def symbolize_hash(hash)
  hash.each_with_object({}) do |(key, value), symbolized|
    symbolized[key.to_sym] = value
  end
end
158
+ end
159
+ # rubocop:enable Metrics/ModuleLength
160
+ end
161
+ end
162
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module InteractionHelpers
6
+ module TimingHelpers
7
+ private
8
+
9
# Calls the block repeatedly until it returns exactly +true+ or +timeout+
# seconds have elapsed on the monotonic clock.
#
# The block's return value steers the pause before the next attempt: a
# Numeric is used as the pause in seconds, anything else means +interval+.
# The pause is clamped to the time left before the deadline, and to zero
# from below, so a large or negative value can neither overshoot the timeout
# nor make +sleep+ raise.
#
# @param timeout [Numeric] overall budget in seconds
# @param interval [Numeric] default pause between attempts
# @yieldreturn [true, Numeric, Object] true to stop, Numeric to set the pause
# @return [Boolean] true when the block returned true, false on timeout
def retry_until_timeout(timeout, interval: 0.2)
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

  loop do
    result = yield
    return true if result == true

    remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
    return false unless remaining.positive?

    requested = result.is_a?(Numeric) ? result : interval
    sleep(requested.clamp(0, remaining))
  end
end
20
+
21
# Returns the configured @timeout, limited to at most +max_timeout+.
#
# @param max_timeout [Numeric] upper bound
# @return [Numeric] the smaller of @timeout and +max_timeout+
def capped_timeout(max_timeout)
  max_timeout < @timeout ? max_timeout : @timeout
end
24
+
25
# Sleeps for the configured @wait, limited to +max_wait+ seconds.
# Does nothing when @wait is zero or negative.
#
# @param max_wait [Numeric] upper bound for the pause
# @return the result of +sleep+, or nil when no pause was taken
def settle_after_stabilization(max_wait)
  return unless @wait.positive?

  pause = [@wait, max_wait].min
  sleep(pause)
end
28
+
29
# Pauses between social-login phases.
#
# With a positive @wait the pause is delegated to
# settle_after_stabilization, capped at SOCIAL_LOGIN_PHASE_WAIT. Otherwise
# the method sleeps for SOCIAL_LOGIN_PHASE_WAIT itself.
def social_login_phase_pause
  return settle_after_stabilization(SOCIAL_LOGIN_PHASE_WAIT) if @wait.positive?

  sleep SOCIAL_LOGIN_PHASE_WAIT
end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Browser
    # Umbrella mixin: registers the helper modules for lazy loading and then
    # includes each of them.
    module InteractionHelpers
      # Helper module name => file that defines it.
      {
        ConsentHelpers: "fetch_util/browser/interaction_helpers/consent_helpers",
        DomInteraction: "fetch_util/browser/interaction_helpers/dom_interaction",
        TimingHelpers: "fetch_util/browser/interaction_helpers/timing_helpers"
      }.each do |helper_name, helper_path|
        autoload helper_name, helper_path
      end

      # Included one at a time, in this order, to keep the ancestor chain
      # identical to three separate include statements.
      %i[ConsentHelpers DomInteraction TimingHelpers].each do |helper_name|
        include const_get(helper_name)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module Navigation
6
+ module HeadersAndReadiness
7
+ private
8
+
9
# Builds the default request headers from the configured user agent and
# language preference.
#
# @return [Hash{String => Object}] "User-Agent" and "Accept-Language" entries
def default_headers
  headers = {}
  headers["User-Agent"] = @user_agent
  headers["Accept-Language"] = @accept_language
  headers
end
15
+
16
# Reports whether the page body already contains any non-blank text.
#
# @param page object responding to #evaluate
# @return the script result (true when document.body has text), or false
#   when evaluation raises Ferrum::JavaScriptError or Ferrum::TimeoutError
def page_loaded_enough?(page)
  body_text_probe = <<~JS
    (() => !!(document && document.body && (document.body.innerText || document.body.textContent || '').trim().length > 0))()
  JS

  begin
    page.evaluate(body_text_probe)
  rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
    false
  end
end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FetchUtil
4
+ class Browser
5
+ module Navigation
6
+ module NavigatorPatch
7
+ private
8
+
9
# Builds the JavaScript snippet that overrides browser fingerprint surfaces
# (navigator, window.chrome, permissions, client hints, WebGL vendor and
# renderer strings, and screen size).
#
# Values come from instance state: the Chrome version is parsed out of
# @user_agent (with a fixed fallback), the language list from
# @accept_language, and the screen size from @viewport.
#
# Empty Accept-Language entries (e.g. "en,,de") are ignored instead of
# raising. The WebGL override is applied to both WebGLRenderingContext and
# WebGL2RenderingContext, and only when the class exists, so a missing
# global cannot abort the statements that follow it.
#
# @return [String] JavaScript source
def navigator_patch
  ua_version = @user_agent[%r{Chrome/([\d.]+)}, 1] || "136.0.7103.113"
  major = ua_version.split(".").first
  # Keep only the language tag of each entry; drop q-values and blanks.
  languages = @accept_language
              .split(",")
              .map { |part| part.split(";").first.to_s.strip }
              .reject(&:empty?)
  languages_json = JSON.generate(languages)
  <<~JS
    Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    Object.defineProperty(navigator, "languages", { get: () => #{languages_json} });
    Object.defineProperty(navigator, "platform", { get: () => "Linux x86_64" });

    Object.defineProperty(navigator, "plugins", {
      get: () => {
        const p = { 0: { name: "PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    1: { name: "Chrome PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    2: { name: "Chromium PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    length: 3 };
        p[Symbol.iterator] = function*() { yield p[0]; yield p[1]; yield p[2]; };
        return p;
      }
    });
    Object.defineProperty(navigator, "mimeTypes", {
      get: () => {
        const m = { 0: { type: "application/pdf", suffixes: "pdf", description: "Portable Document Format" },
                    length: 1 };
        m[Symbol.iterator] = function*() { yield m[0]; };
        return m;
      }
    });

    if (!window.chrome) {
      window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} };
    }

    const origQuery = window.Permissions && Permissions.prototype.query;
    if (origQuery) {
      Permissions.prototype.query = function(parameters) {
        return parameters.name === "notifications"
          ? Promise.resolve({ state: Notification.permission })
          : origQuery.call(this, parameters);
      };
    }

    Object.defineProperty(navigator, "hardwareConcurrency", { get: () => 4 });

    if (!navigator.deviceMemory) {
      Object.defineProperty(navigator, "deviceMemory", { get: () => 8 });
    }

    if (!navigator.connection) {
      Object.defineProperty(navigator, "connection", {
        get: () => ({ effectiveType: "4g", rtt: 50, downlink: 10, saveData: false })
      });
    }

    {
      const uaData = navigator.userAgentData;
      const missingUserAgentData = !uaData || !Array.isArray(uaData.brands) || uaData.brands.length === 0 || !uaData.platform;
      if (missingUserAgentData) {
        Object.defineProperty(navigator, "userAgentData", {
          get: () => ({
            brands: [
              { brand: "Chromium", version: "#{major}" },
              { brand: "Google Chrome", version: "#{major}" },
              { brand: "Not.A/Brand", version: "24" }
            ],
            mobile: false,
            platform: "Linux",
            getHighEntropyValues: function(hints) {
              return Promise.resolve({
                architecture: "x86",
                bitness: "64",
                brands: this.brands,
                fullVersionList: [
                  { brand: "Chromium", version: "#{ua_version}" },
                  { brand: "Google Chrome", version: "#{ua_version}" }
                ],
                mobile: false,
                model: "",
                platform: "Linux",
                platformVersion: "6.1.0",
                uaFullVersion: "#{ua_version}"
              });
            }
          })
        });
      }
    }

    // Report the same vendor/renderer strings from WebGL 1 and WebGL 2
    // contexts, skipping any context class this browser does not expose.
    for (const contextName of ["WebGLRenderingContext", "WebGL2RenderingContext"]) {
      const contextClass = window[contextName];
      if (!contextClass || !contextClass.prototype) continue;
      const getParameterProto = contextClass.prototype.getParameter;
      contextClass.prototype.getParameter = function(param) {
        const debugExt = this.getExtension('WEBGL_debug_renderer_info');
        if (debugExt) {
          if (param === debugExt.UNMASKED_VENDOR_WEBGL) return 'Google Inc. (Intel)';
          if (param === debugExt.UNMASKED_RENDERER_WEBGL)
            return 'ANGLE (Intel, Mesa Intel(R) UHD Graphics 630 (CFL GT2), OpenGL 4.6)';
        }
        return getParameterProto.call(this, param);
      };
    }

    Object.defineProperty(screen, "width", { get: () => #{@viewport.fetch(:width)} });
    Object.defineProperty(screen, "height", { get: () => #{@viewport.fetch(:height)} });
    Object.defineProperty(screen, "availWidth", { get: () => #{@viewport.fetch(:width)} });
    Object.defineProperty(screen, "availHeight", { get: () => #{@viewport.fetch(:height)} });
  JS
end
115
+ end
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module FetchUtil
  class Browser
    # Umbrella mixin: registers the navigation helper modules for lazy
    # loading and then includes each of them.
    module Navigation
      # Helper module name => file that defines it.
      {
        HeadersAndReadiness: "fetch_util/browser/navigation/headers_and_readiness",
        NavigatorPatch: "fetch_util/browser/navigation/navigator_patch"
      }.each do |helper_name, helper_path|
        autoload helper_name, helper_path
      end

      # Included one at a time, in this order, to keep the ancestor chain
      # identical to two separate include statements.
      %i[HeadersAndReadiness NavigatorPatch].each do |helper_name|
        include const_get(helper_name)
      end
    end
  end
end