fetch_util 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/CHANGELOG.md +48 -0
- data/LICENSE.txt +21 -0
- data/README.md +199 -0
- data/Rakefile +18 -0
- data/SKILL.md +92 -0
- data/exe/fetch_util +6 -0
- data/lib/fetch_util/assets/extract.js +1 -0
- data/lib/fetch_util/assets/vendor/readability.js +2314 -0
- data/lib/fetch_util/assets/vendor/turndown.js +974 -0
- data/lib/fetch_util/browser/interaction_helpers/consent_helpers.rb +224 -0
- data/lib/fetch_util/browser/interaction_helpers/dom_interaction.rb +162 -0
- data/lib/fetch_util/browser/interaction_helpers/timing_helpers.rb +39 -0
- data/lib/fetch_util/browser/interaction_helpers.rb +15 -0
- data/lib/fetch_util/browser/navigation/headers_and_readiness.rb +26 -0
- data/lib/fetch_util/browser/navigation/navigator_patch.rb +118 -0
- data/lib/fetch_util/browser/navigation.rb +13 -0
- data/lib/fetch_util/browser/site_stabilization/community_and_marketplace.rb +117 -0
- data/lib/fetch_util/browser/site_stabilization/social_platforms.rb +118 -0
- data/lib/fetch_util/browser/site_stabilization.rb +13 -0
- data/lib/fetch_util/browser/stabilization/page_flow.rb +80 -0
- data/lib/fetch_util/browser/stabilization/spa_hydration.rb +183 -0
- data/lib/fetch_util/browser/stabilization.rb +13 -0
- data/lib/fetch_util/browser.rb +135 -0
- data/lib/fetch_util/cli.rb +124 -0
- data/lib/fetch_util/extractor.rb +56 -0
- data/lib/fetch_util/fetcher.rb +242 -0
- data/lib/fetch_util/parallel_fetcher.rb +97 -0
- data/lib/fetch_util/raw_docs_fallback.rb +260 -0
- data/lib/fetch_util/regulatory/cache_store.rb +92 -0
- data/lib/fetch_util/regulatory/directives.rb +106 -0
- data/lib/fetch_util/regulatory/fetch_records.rb +108 -0
- data/lib/fetch_util/regulatory/headers.rb +39 -0
- data/lib/fetch_util/regulatory/http_client.rb +70 -0
- data/lib/fetch_util/regulatory/human.rb +104 -0
- data/lib/fetch_util/regulatory/orchestration.rb +82 -0
- data/lib/fetch_util/regulatory/page.rb +70 -0
- data/lib/fetch_util/regulatory/robot_globs.rb +17 -0
- data/lib/fetch_util/regulatory/robots.rb +117 -0
- data/lib/fetch_util/regulatory/signals.rb +106 -0
- data/lib/fetch_util/regulatory/source_selection.rb +60 -0
- data/lib/fetch_util/regulatory/tdm_page.rb +39 -0
- data/lib/fetch_util/regulatory/tdm_policy.rb +55 -0
- data/lib/fetch_util/regulatory/tdm_rep.rb +50 -0
- data/lib/fetch_util/regulatory/tdm_support.rb +94 -0
- data/lib/fetch_util/regulatory/trust_txt.rb +49 -0
- data/lib/fetch_util/regulatory/usage_preferences.rb +106 -0
- data/lib/fetch_util/regulatory.rb +74 -0
- data/lib/fetch_util/request_log.rb +24 -0
- data/lib/fetch_util/result.rb +58 -0
- data/lib/fetch_util/searcher/result_filtering.rb +102 -0
- data/lib/fetch_util/searcher.rb +332 -0
- data/lib/fetch_util/version.rb +5 -0
- data/lib/fetch_util.rb +115 -0
- metadata +145 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module InteractionHelpers
|
|
6
|
+
# rubocop:disable Metrics/ModuleLength
|
|
7
|
+
module ConsentHelpers
|
|
8
|
+
# Button captions that accept or confirm a consent choice. They are passed as
# the primary label group when dismissing a privacy-preference overlay.
# Entries are grouped by language; the order is significant only in that it
# is preserved from the original list.
CONSENT_ACTION_LABELS = [
  # English
  "Accept All",
  "Allow all",
  "Confirm My Choices",
  "Save preferences",
  "Customize Choices",
  # German
  "Alle akzeptieren",
  "Alles akzeptieren",
  "Alle zulassen",
  # French
  "Tout accepter",
  "Accepter tout",
  "Autoriser tout",
  # Spanish
  "Aceptar todo",
  "Aceptar todas",
  # Italian
  "Accetta tutto",
  "Accetta tutti",
  # Portuguese
  "Aceitar tudo",
  "Aceitar todos",
  "Aceitar todos os cookies",
  # Dutch
  "Alles accepteren",
  "Accepteer alles",
  # Norwegian
  "Godta alle",
  "Aksepter alle",
  "Godkjenn alle",
  # Swedish
  "Acceptera alla",
  "Godkänn alla",
  "Tillåt alla",
  # Danish
  "Accepter alle",
  "Tillad alle",
  # Finnish
  "Hyväksy kaikki",
  "Salli kaikki",
  # Lithuanian
  "Priimti visus",
  "Leisti visus",
  # Latvian
  "Pieņemt visus",
  "Atļaut visus",
  # Czech
  "Přijmout vše",
  "Povolit vše",
  # Polish
  "Zaakceptuj wszystkie",
  "Akceptuję",
  # Hungarian
  "Mindent elfogadok",
  "Összes elfogadása",
  "Elfogadom",
  # Romanian
  "Acceptă tot",
  "Acceptă toate",
  # Macedonian / Serbian
  "Прифати сите",
  "Прихвати све",
  # Japanese
  "すべて受け入れる",
  "すべて許可",
  # Korean / Chinese
  "모두 허용",
  "全部接受"
].freeze
|
|
30
|
+
# Button captions that decline optional cookies. They are passed as the
# fallback label group, tried only when no primary (accept) label matched.
CONSENT_FALLBACK_LABELS = [
  # English
  "Essential only",
  "Reject All",
  # German, French, Spanish
  "Alle ablehnen",
  "Tout refuser",
  "Rechazar todo",
  # Italian, Portuguese, Dutch
  "Rifiuta tutto",
  "Rejeitar tudo",
  "Alles weigeren",
  # Norwegian, Swedish, Danish
  "Avvis alle",
  "Avvisa alla",
  "Afvis alle",
  # Finnish, Lithuanian, Latvian
  "Hylkää kaikki",
  "Atmesti visus",
  "Noraidīt visus",
  # Czech, Polish
  "Odmítnout vše",
  "Odrzuć wszystkie",
  # Hungarian, Romanian
  "Összes elutasítása",
  "Respinge tot",
  # Macedonian, Serbian
  "Одбиј ги сите",
  "Одбиј све"
].freeze
|
|
40
|
+
private_constant :CONSENT_ACTION_LABELS, :CONSENT_FALLBACK_LABELS
|
|
41
|
+
|
|
42
|
+
private
|
|
43
|
+
|
|
44
|
+
# Runs a script in +page+ that tries to get a cookie/consent prompt out of
# the way and returns whatever safe_evaluate returns for it (the script
# itself yields true when it clicked or removed something, false otherwise).
#
# The injected script works in phases:
#   1. Bail out unless the page shows a consent indicator: a known vendor
#      id/class, a modal dialog, a <dialog>, or consent-like body text.
#   2. Click every visible button-like element whose caption matches the
#      accept pattern and which sits in a consent-related context.
#   3. If nothing was clicked, click matching short div/span captions inside
#      consent-looking containers.
#   4. Remove visible dialogs that look consent-related, then remove
#      fixed/sticky overlay elements matched by the overlay selector.
#   5. Re-enable page scrolling when anything was clicked or removed.
#
# NOTE: this heredoc interpolates, so Ruby processes backslash escapes
# before the browser ever sees the text. "\s" would become a plain space and
# "\(" would lose its backslash; every regex escape below is therefore
# written with a doubled backslash so the JavaScript receives \s and \(.
def accept_cookie_consent(page)
  safe_evaluate(page, <<~JS)
    (() => {
      const quickIndicator = document.querySelector(#{consent_quick_indicator_selector_js});
      const bodyPreview = ((document.body && (document.body.textContent || document.body.innerText)) || '')
        .replace(/\\s+/g, ' ')
        .trim()
        .slice(0, 4000)
        .toLowerCase();
      const bodyLooksLikeConsent = /(cookie|privacy|consent|personal data|personalized ads|personalized content|vendors want your permission|configurações avançadas de cookies|declaração de cookies|gerenciar cookies|utilizamos cookies|dados pessoais|informasjonskapsler|personvern|kakor|sekretess|samtycke|evästeet|evästeasetukset|tietosuoja|slapukai|slapukų|privatumas|sīkdatnes|sīkfailus|privātums|sütiket|adatvédelem|cookie-uri|confidențialitate|колачиња|приватност)/.test(bodyPreview) &&
        /(accept|reject|manage|allow|agree|consent|cookies|privacy|more options|aceitar|rejeitar|gerenciar|godta|aksepter|godkjenn|godkänn|acceptera|tillåt|hyväksy|salli|priimti|sutinku|leisti|pieņemt|piekrītu|atļaut|elfogad|összes|acceptă|sunt de acord|прифати|се согласувам)/.test(bodyPreview);
      const hasConsentDialog = quickIndicator ||
        document.querySelector('[role="dialog"][aria-modal="true"]') ||
        document.querySelector('dialog') ||
        bodyLooksLikeConsent;
      if (!hasConsentDialog) return false;

      const pattern = /^(accept(?: all(?: cookies?)?)?|allow all(?: cookies)?|allow cookies|agree(?: to cookies| and continue| & continue)?|i agree|ok(?:ay)?|accept & continue|continue with cookies|consent|got it|i understand|continue|accept and close|close|accept recommended settings|przejdź do serwisu|zaakceptuj(?:\\s+(?:wszystkie|wszystko))?|akceptuję|zgadzam się|zgoda|akzeptieren|alle akzeptieren|zustimmen|accepter (?:tout|les cookies|et continuer)|tout accepter|j'accepte|accepter|aceptar (?:todo|todas|cookies)|acepto|accetta (?:tutto|tutti)|accetto|aceitar(?:\\s+(?:tudo|todos|cookies|todos os cookies))?|aceitar cookies|aceitar todos os cookies|aceitar e continuar|aceitar e fechar|aceito|accetta e continua|akkoord|ga akkoord|alles accepteren|accepteer alles|すべて受け入れる|すべて許可|同意する|同意して閉じる|쿠키 허용|모두 허용|동의하고 계속|동의|接受全部|全部接受|同意并继续|接受并继续|принять все|согласен|согласиться|souhlasím|přijmout vše|přijmout(?:\\s+(?:všechny|vše))?|povolit vše|souhlasit|godkänn alla|acceptera alla|tillåt alla|jag godkänner|godkänn|accepter alle|tillad alle|jeg accepterer|acceptér|godta alle|tanggap semua|setuju|terima semua|godkjenn alle|aksepter alle|aksepter|jeg godtar|godta|accepta-ho tot|accepta|accepto|d'acord|razumem|prihvatam|прихватам|прихвати све|qəbul edirəm|ყველას მიღება|සියල්ල පිළිගන්න|පිළිගන්න|همه را بپذیرید|ተቀበል|ሁሉንም ተቀበል|allow all|confirm my choices|pokračovat|hyväksy(?:\\s+kaikki)?|salli kaikki|salli evästeet|hyväksyn|priimti visus|sutinku|leisti visus|прифати(?:\\s+(?:ги\\s+)?сите)?|се согласувам|acceptă(?:\\s+tot(?:ul)?)?|accept toate|acceptă toate|sunt de acord|pieņemt(?:\\s+visus)?|piekrītu|atļaut(?:\\s+visus)?|apstiprināt|elfogad(?:om)?|mindent elfogad(?:ok)?|összes elfogadása|elfogadom az összeset|hozzájárulok)(?:\\s+and\\s+.*)?$/i;
      const consentPattern = /(cookie|cookies|privacy|consent|gdpr|ccpa|onetrust|before you continue|we use cookies and data|device identifiers|personalized ads|personalized content|trusted third party partners?|privacy preference center|your privacy settings|your privacy choices|manage privacy preferences|manage consent preferences|cookie information|cookie list|cookies details|list of partners(?: \\(vendors\\))?|configurações avançadas de cookies|declaração de cookies|gerenciar cookies|utilizamos cookies|dados pessoais|pliki cookie|datenschutz|données personnelles|datos personales|dati personali|wish to store|access information on your devices|preferenze cookie|クッキー|Cookieプリファレンス|Cookie設定|同意設定|쿠키|동의|개인정보|接受|隐私设置|cookie 偏好设置|файлы cookie|настройки cookie|souhlas|personalizac|soukromí|nastavení souhlasu|kakor|sekretess|samtycke|cookies og data|privatlivs|samtykke|privatliv|personvern|informasjonskapsler|informasjonskapslar|aller media|dine data|galetes|protecció de dades|política de privadesa|ግላዊነት|ኩኪ|ኩኪዎች|pro pokračování vyberte|technické cookies|jakou formou vám máme zobrazovat obsah|evästeet|evästeasetukset|tietosuoja|yksityisyys|hyväksy evästeet|slapukai|privatumas|slapukų nustatymai|kolačinji|приватност|поставки за колачиња|cookie-uri|confidențialitate|setări cookie|protecția datelor|sīkdatnes|sīkfailus|privātums|privātuma iestatījumi|sīkdatņu iestatījumi|sütiket|adatvédelem|adatvédelmi beállítások|süti beállítások)/i;
      const containerPattern = /(cookie|consent|privacy|onetrust|cookiebot|usercentrics|trustarc|didomi|quantcast|gdpr|ccpa)/i;
      #{js_dom_helpers}
      const parentOrHost = (node) => {
        if (!node) return null;
        if (node.parentElement) return node.parentElement;
        const root = node.getRootNode && node.getRootNode();
        return root && root.host ? root.host : null;
      };
      const candidates = queryAllRoots('button, [role="button"], a, input[type="button"], input[type="submit"]');

      const consentContext = (el) => {
        let node = el;
        for (let depth = 0; node && depth < 8; depth += 1, node = parentOrHost(node)) {
          const text = textFor(node);
          const attrs = [node.id, node.className, node.getAttribute('aria-label'), node.getAttribute('data-testid')]
            .filter(Boolean)
            .join(' ');
          if (consentPattern.test(text + ' ' + attrs)) return true;
        }
        return bodyLooksLikeConsent;
      };

      const consentAttrs = (el) => [el.id, el.className, el.getAttribute('aria-label'), el.getAttribute('data-testid')]
        .filter(Boolean)
        .join(' ');

      let clicked = false;
      for (const el of candidates) {
        const text = textFor(el);
        if (!text || !visible(el) || !pattern.test(text) || !consentContext(el)) continue;
        el.click();
        clicked = true;
      }

      if (!clicked) {
        const consentContainerSel = #{consent_container_selector_js};
        for (const container of queryAllRoots(consentContainerSel)) {
          if (!visible(container)) continue;
          for (const el of container.querySelectorAll('div, span')) {
            const text = textFor(el);
            if (!text || !visible(el) || !pattern.test(text)) continue;
            if (text.length > 120) continue;
            el.click();
            clicked = true;
          }
          if (clicked) break;
        }
      }

      for (const el of queryAllRoots('[role="dialog"], [aria-modal="true"], dialog')) {
        const attrs = consentAttrs(el);
        if (!consentContext(el) && !containerPattern.test(attrs)) continue;
        if (!visible(el)) continue;
        el.remove();
        clicked = true;
      }

      const consentOverlaySelector = #{consent_overlay_selector_js};
      for (const el of queryAllRoots(consentOverlaySelector)) {
        if (el.matches('[role="dialog"], [aria-modal="true"], dialog')) continue;
        const style = window.getComputedStyle(el);
        if (!/fixed|sticky/.test(style.position)) continue;
        if (!visible(el)) continue;
        el.remove();
        clicked = true;
      }

      if (clicked) {
        restoreScroll();
      }

      return clicked;
    })()
  JS
end
|
|
138
|
+
|
|
139
|
+
# Dismisses a privacy-preference-center style overlay on +page+.
#
# A probe script first checks that the document contains an element whose
# id/class looks like a consent widget AND that the body text mentions one
# of the known preference-center phrases. When the probe is negative (or
# fails, in which case safe_evaluate yields the +false+ default) the method
# returns false. Otherwise it delegates to click_visible_button_by_text,
# offering the accept labels first and the reject labels as fallback, and
# returns that call's result.
def dismiss_privacy_preference_overlay(page)
  # Raw heredoc (<<~'JS'): backslashes reach the browser untouched.
  detection_script = <<~'JS'
    (() => {
      if (!document.querySelector(
        "[id*='onetrust' i], [class*='onetrust' i], " +
        "[id*='cookiebot' i], [class*='cookiebot' i], " +
        "[id*='cookie-consent' i], [class*='cookie-consent' i], " +
        "[id*='cookie_consent' i], [class*='cookie_consent' i], " +
        "[id*='cookieconsent' i], [class*='cookieconsent' i], " +
        "[id*='cookie-banner' i], [class*='cookie-banner' i], " +
        "[id*='cookie-notice' i], [class*='cookie-notice' i], " +
        "[id*='privacy-banner' i], [class*='privacy-banner' i], " +
        "[id*='privacy_banner' i], [class*='privacy_banner' i], " +
        "[id*='privacy-preference' i], [class*='privacy-preference' i], " +
        "[id*='gdpr' i], [class*='gdpr' i]"
      )) return false;
      const text = ((document.body && document.body.innerText) || '').toLowerCase()
      if (!text) return false
      if (!/(privacy preference center|your privacy settings|your privacy choices|manage privacy preferences|manage consent preferences|cookie information|cookie list|cookies details|list of partners(?: \(vendors\))?|personverninnstillinger|informasjonskapsler|sekretessinställningar|kakor|evästeasetukset|tietosuoja|slapukų nustatymai|privatumo nustatymai|sīkdatņu iestatījumi|privātuma iestatījumi|adatvédelmi beállítások|süti beállítások|setări cookie|nastavení souhlasu)/i.test(text)) return false
      return true
    })()
  JS

  overlay_detected = safe_evaluate(page, detection_script, default: false)
  return false unless overlay_detected

  clickable_selector = 'button, [role="button"], a, input[type="button"], input[type="submit"]'
  click_visible_button_by_text(
    page,
    CONSENT_ACTION_LABELS,
    CONSENT_FALLBACK_LABELS,
    selectors: clickable_selector
  )
end
|
|
170
|
+
|
|
171
|
+
# Returns a double-quoted string literal (ready to be spliced into
# JavaScript) holding a CSS selector list. For each vendor/banner token the
# list contains a case-insensitive substring match on both the id and the
# class attribute.
def consent_quick_indicator_selector_js
  tokens = %w[
    onetrust cookiebot usercentrics trustarc didomi quantcast
    cookie-consent cookie_consent cookieconsent
    cookie-banner cookie_banner cookie-notice
    gdpr ccpa
    privacy-banner privacy_banner
  ]
  selector_list = tokens.map { |token| %([id*="#{token}" i], [class*="#{token}" i]) }.join(", ")
  selector_list.inspect
end
|
|
192
|
+
|
|
193
|
+
# Returns a double-quoted string literal (for splicing into JavaScript)
# with a CSS selector list matching elements whose id or class contains one
# of the generic consent keywords, case-insensitively.
def consent_container_selector_js
  keywords = %w[cookie consent privacy gdpr ccpa]
  keywords
    .flat_map { |keyword| [%([id*="#{keyword}" i]), %([class*="#{keyword}" i])] }
    .join(", ")
    .inspect
end
|
|
203
|
+
|
|
204
|
+
# Returns a double-quoted string literal (for splicing into JavaScript)
# with a CSS selector list for candidate consent overlays: generic consent
# keywords plus known consent-vendor names, each matched case-insensitively
# against both the id and the class attribute.
#
# The original list matched "quantcast" on the id attribute only, unlike
# every other token here and unlike the quick-indicator selector; the class
# variant is included so all tokens are treated alike.
def consent_overlay_selector_js
  selector = <<~JS
    [id*="cookie" i], [class*="cookie" i],
    [id*="consent" i], [class*="consent" i],
    [id*="privacy" i], [class*="privacy" i],
    [id*="gdpr" i], [class*="gdpr" i],
    [id*="ccpa" i], [class*="ccpa" i],
    [id*="onetrust" i], [class*="onetrust" i],
    [id*="cookiebot" i], [class*="cookiebot" i],
    [id*="usercentrics" i], [class*="usercentrics" i],
    [id*="trustarc" i], [class*="trustarc" i],
    [id*="didomi" i], [class*="didomi" i],
    [id*="quantcast" i], [class*="quantcast" i]
  JS
  # Collapse the multi-line list to one line, then quote it for JavaScript.
  selector.gsub(/\s+/, " ").strip.inspect
end
|
|
220
|
+
end
|
|
221
|
+
# rubocop:enable Metrics/ModuleLength
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module InteractionHelpers
|
|
6
|
+
# rubocop:disable Metrics/ModuleLength
|
|
7
|
+
module DomInteraction
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
# Clicks the first visible element matching +selectors+ whose caption
# equals (case-insensitively) one of the given labels.
#
# +primary_labels+ are tried against every candidate before any of the
# +fallback_labels+; empty groups are skipped. The injected script yields
# true after the first click and false when nothing matched; the method
# returns whatever safe_evaluate returns for that script.
def click_visible_button_by_text(page, primary_labels, fallback_labels = [], selectors: 'button, [role="button"]')
  label_groups = [primary_labels, fallback_labels].map { |labels| Array(labels) }.reject(&:empty?)

  click_script = <<~JS
    (() => {
      const labelGroups = #{JSON.generate(label_groups)};
      #{js_dom_helpers}
      const buttons = queryAllRoots(#{selectors.to_json});

      for (const labels of labelGroups) {
        const allowed = new Set(labels.map((label) => String(label).toLowerCase()));
        for (const button of buttons) {
          if (!visible(button) || !allowed.has(textFor(button).toLowerCase())) continue;
          button.click();
          return true;
        }
      }

      return false;
    })()
  JS

  safe_evaluate(page, click_script)
end
|
|
32
|
+
|
|
33
|
+
# Tries to dismiss an overlay/dialog on +page+, preferring a click on a
# close control and falling back to removing the matched nodes.
#
# close_selectors::         CSS selectors for candidate close controls.
# dialog_selectors::        CSS selectors for dialogs; a node counts as a
#                           match when the first 2000 chars of its text
#                           match +dialog_pattern+.
# dialog_pattern::          Regex source, compiled case-insensitively in JS.
# overlay_selectors::       Extra CSS selectors; these nodes must also
#                           contain "log in" or "sign up".
# close_label_pattern::     Optional regex source a close control's caption
#                           must match.
# allow_empty_close_label:: Also accept close controls with an empty caption.
#
# A close control is clicked only if it lies inside a matched node, or if no
# node matched at all. The script yields true when it clicked or removed
# something; the method returns whatever safe_evaluate returns for it.
def dismiss_overlay_dialog(
  page,
  close_selectors:,
  dialog_selectors:,
  dialog_pattern:,
  overlay_selectors: [],
  close_label_pattern: nil,
  allow_empty_close_label: false
)
  settings = {
    closeSelectors: Array(close_selectors),
    dialogSelectors: Array(dialog_selectors),
    overlaySelectors: Array(overlay_selectors),
    dialogPattern: dialog_pattern,
    closeLabelPattern: close_label_pattern,
    allowEmptyCloseLabel: allow_empty_close_label
  }

  dismiss_script = <<~JS
    (() => {
      const config = #{JSON.generate(settings)};
      const dialogPattern = new RegExp(config.dialogPattern || '', 'i');
      const closeLabelPattern = config.closeLabelPattern ? new RegExp(config.closeLabelPattern, 'i') : null;
      #{js_dom_helpers}

      const matchingNodes = [];
      const collectMatchingNodes = (selectors, requireOverlayPrompt) => {
        const selectorText = (selectors || []).join(', ');
        if (!selectorText) return;

        queryAllRoots(selectorText).forEach((node) => {
          const text = textFor(node).slice(0, 2000);
          if (!dialogPattern.test(text)) return;
          if (requireOverlayPrompt && !/log in|sign up/i.test(text)) return;
          if (!matchingNodes.includes(node)) matchingNodes.push(node);
        });
      };

      collectMatchingNodes(config.dialogSelectors, false);
      collectMatchingNodes(config.overlaySelectors, true);

      const withinMatchingNode = (node) => {
        if (!node || matchingNodes.length === 0) return true;
        return matchingNodes.some((container) => container === node || container.contains(node));
      };

      const clickCloseButton = () => {
        const selectorText = (config.closeSelectors || []).join(', ');
        if (!selectorText) return false;
        const candidates = queryAllRoots(selectorText);

        for (const candidate of candidates) {
          const button = candidate.closest('button, [role="button"]') || candidate;
          if (!button || !visible(button)) continue;
          if (!withinMatchingNode(candidate) && !withinMatchingNode(button)) continue;

          const label = textFor(button).toLowerCase();
          const labelMatches = !closeLabelPattern || closeLabelPattern.test(label) || (config.allowEmptyCloseLabel && label === '');
          if (!labelMatches) continue;

          button.click();
          restoreScroll();
          return true;
        }

        return false;
      };

      if (clickCloseButton()) return true;

      let removed = false;
      matchingNodes.forEach((node) => {
        node.remove();
        removed = true;
      });

      if (removed) restoreScroll();
      return removed;
    })()
  JS

  safe_evaluate(page, dismiss_script)
end
|
|
107
|
+
|
|
108
|
+
# Returns JavaScript source defining the helpers shared by the injected
# scripts in this module:
#
# queryAllRoots(selectors):: all matches in the document and in every
#                            shadow root reachable through el.shadowRoot.
# visible(el)::              true when the element has a non-empty box and
#                            is neither visibility:hidden nor display:none.
# textFor(el)::              the first non-blank value among innerText,
#                            textContent, value, aria-label and title, with
#                            whitespace collapsed.
# restoreScroll()::          sets overflow to 'auto' on body and html.
#
# The heredoc interpolates, so regex escapes are written with a doubled
# backslash ("\\s" reaches the browser as \s).
def js_dom_helpers
  <<~JS
    const queryAllRoots = (selectors) => {
      const matches = [];
      const queue = [document];
      while (queue.length) {
        const root = queue.shift();
        if (!root || !root.querySelectorAll) continue;
        root.querySelectorAll(selectors).forEach((el) => matches.push(el));
        root.querySelectorAll('*').forEach((el) => {
          if (el.shadowRoot) queue.push(el.shadowRoot);
        });
      }
      return matches;
    };

    const visible = (el) => {
      const rect = el.getBoundingClientRect();
      const style = window.getComputedStyle(el);
      return rect.width > 0 && rect.height > 0 && style.visibility !== 'hidden' && style.display !== 'none';
    };

    const textFor = (el) => {
      // el.value is a number on some elements (li, progress, meter); coerce
      // to string so .replace cannot throw and abort the whole script.
      const values = [el.innerText, el.textContent, el.value, el.getAttribute('aria-label'), el.getAttribute('title')]
        .filter(Boolean)
        .map((value) => String(value).replace(/\\s+/g, ' ').trim())
        .filter(Boolean);
      return values[0] || '';
    };

    const restoreScroll = () => {
      if (document.body) document.body.style.overflow = 'auto';
      if (document.documentElement) document.documentElement.style.overflow = 'auto';
    };
  JS
end
|
|
144
|
+
|
|
145
|
+
# Evaluates +script+ on +page+ and returns its result. A
# Ferrum::JavaScriptError or Ferrum::TimeoutError raised by the evaluation
# is swallowed and +default+ is returned instead; other errors propagate.
def safe_evaluate(page, script, default: false)
  begin
    page.evaluate(script)
  rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
    default
  end
end
|
|
150
|
+
|
|
151
|
+
# Returns a new Hash with the same values as +hash+ and every top-level key
# converted with #to_sym. Nested hashes are not touched.
def symbolize_hash(hash)
  hash.each_with_object({}) do |(key, value), symbolized|
    symbolized[key.to_sym] = value
  end
end
|
|
158
|
+
end
|
|
159
|
+
# rubocop:enable Metrics/ModuleLength
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module InteractionHelpers
|
|
6
|
+
module TimingHelpers
|
|
7
|
+
private
|
|
8
|
+
|
|
9
|
+
# Calls the given block repeatedly until it returns exactly +true+ or
# +timeout+ seconds have elapsed (measured on the monotonic clock).
#
# timeout::  Overall time budget in seconds.
# interval:: Default pause between attempts, in seconds.
#
# When the block returns a Numeric, that value is used as the pause before
# the next attempt instead of +interval+. The pause is clamped to
# [0, remaining time]: the method never sleeps past its own deadline and a
# negative request cannot make Kernel#sleep raise.
#
# Returns true as soon as the block returns true, false once the deadline
# has passed. The block always runs at least once.
def retry_until_timeout(timeout, interval: 0.2)
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

  loop do
    result = yield
    return true if result == true

    remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
    return false unless remaining.positive?

    requested = result.is_a?(Numeric) ? result : interval
    sleep(requested.clamp(0, remaining))
  end
end
|
|
20
|
+
|
|
21
|
+
# Returns the smaller of the instance-level @timeout and +max_timeout+.
def capped_timeout(max_timeout)
  limits = [@timeout, max_timeout]
  limits.min
end
|
|
24
|
+
|
|
25
|
+
# Pauses for the smaller of @wait and +max_wait+ seconds, but only when
# @wait is positive. Returns nil without sleeping otherwise.
def settle_after_stabilization(max_wait)
  return unless @wait.positive?

  pause = [@wait, max_wait].min
  sleep pause
end
|
|
28
|
+
|
|
29
|
+
# Pauses between social-login handling phases. With a positive @wait the
# pause goes through settle_after_stabilization, capped at
# SOCIAL_LOGIN_PHASE_WAIT; otherwise it sleeps for the full
# SOCIAL_LOGIN_PHASE_WAIT.
def social_login_phase_pause
  return settle_after_stabilization(SOCIAL_LOGIN_PHASE_WAIT) if @wait.positive?

  sleep SOCIAL_LOGIN_PHASE_WAIT
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
  class Browser
    # Aggregates the browser interaction mixins. Each sub-module is
    # registered for autoloading and then included, so including
    # InteractionHelpers brings in all three.
    module InteractionHelpers
      {
        ConsentHelpers: "fetch_util/browser/interaction_helpers/consent_helpers",
        DomInteraction: "fetch_util/browser/interaction_helpers/dom_interaction",
        TimingHelpers: "fetch_util/browser/interaction_helpers/timing_helpers"
      }.each { |const_name, path| autoload const_name, path }

      include ConsentHelpers
      include DomInteraction
      include TimingHelpers
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module Navigation
|
|
6
|
+
module HeadersAndReadiness
|
|
7
|
+
private
|
|
8
|
+
|
|
9
|
+
# Returns the request headers derived from instance state: the configured
# user agent and accept-language values.
def default_headers
  headers = {}
  headers["User-Agent"] = @user_agent
  headers["Accept-Language"] = @accept_language
  headers
end
|
|
15
|
+
|
|
16
|
+
# Reports whether +page+ already has a body containing non-blank text.
# Returns false when the evaluation raises Ferrum::JavaScriptError or
# Ferrum::TimeoutError.
def page_loaded_enough?(page)
  readiness_probe = <<~JS
    (() => !!(document && document.body && (document.body.innerText || document.body.textContent || '').trim().length > 0))()
  JS

  begin
    page.evaluate(readiness_probe)
  rescue Ferrum::JavaScriptError, Ferrum::TimeoutError
    false
  end
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
|
|
4
|
+
class Browser
|
|
5
|
+
module Navigation
|
|
6
|
+
module NavigatorPatch
|
|
7
|
+
private
|
|
8
|
+
|
|
9
|
+
# Returns JavaScript source (a String) that overrides a set of navigator,
# window, WebGL and screen properties with fixed values.
#
# Values taken from instance state:
# * the Chrome version is parsed from @user_agent ("Chrome/<version>"),
#   falling back to "136.0.7103.113" when the pattern is absent;
# * navigator.languages is built from @accept_language with the ";q=..."
#   weights stripped;
# * the screen dimensions come from @viewport[:width] / @viewport[:height].
#
# The WebGL override is wrapped in a typeof check: without it, a missing
# WebGLRenderingContext global would raise a ReferenceError and abort the
# script before the screen overrides at the end are applied.
def navigator_patch
  ua_version = @user_agent[%r{Chrome/([\d.]+)}, 1] || "136.0.7103.113"
  major = ua_version.split(".").first
  languages_json = JSON.generate(@accept_language.split(",").map { |part| part.split(";").first.strip })
  <<~JS
    Object.defineProperty(navigator, "webdriver", { get: () => undefined });
    Object.defineProperty(navigator, "languages", { get: () => #{languages_json} });
    Object.defineProperty(navigator, "platform", { get: () => "Linux x86_64" });

    Object.defineProperty(navigator, "plugins", {
      get: () => {
        const p = { 0: { name: "PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    1: { name: "Chrome PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    2: { name: "Chromium PDF Viewer", filename: "internal-pdf-viewer", description: "Portable Document Format" },
                    length: 3 };
        p[Symbol.iterator] = function*() { yield p[0]; yield p[1]; yield p[2]; };
        return p;
      }
    });
    Object.defineProperty(navigator, "mimeTypes", {
      get: () => {
        const m = { 0: { type: "application/pdf", suffixes: "pdf", description: "Portable Document Format" },
                    length: 1 };
        m[Symbol.iterator] = function*() { yield m[0]; };
        return m;
      }
    });

    if (!window.chrome) {
      window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} };
    }

    const origQuery = window.Permissions && Permissions.prototype.query;
    if (origQuery) {
      Permissions.prototype.query = function(parameters) {
        return parameters.name === "notifications"
          ? Promise.resolve({ state: Notification.permission })
          : origQuery.call(this, parameters);
      };
    }

    Object.defineProperty(navigator, "hardwareConcurrency", { get: () => 4 });

    if (!navigator.deviceMemory) {
      Object.defineProperty(navigator, "deviceMemory", { get: () => 8 });
    }

    if (!navigator.connection) {
      Object.defineProperty(navigator, "connection", {
        get: () => ({ effectiveType: "4g", rtt: 50, downlink: 10, saveData: false })
      });
    }

    {
      const uaData = navigator.userAgentData;
      const missingUserAgentData = !uaData || !Array.isArray(uaData.brands) || uaData.brands.length === 0 || !uaData.platform;
      if (missingUserAgentData) {
        Object.defineProperty(navigator, "userAgentData", {
          get: () => ({
            brands: [
              { brand: "Chromium", version: "#{major}" },
              { brand: "Google Chrome", version: "#{major}" },
              { brand: "Not.A/Brand", version: "24" }
            ],
            mobile: false,
            platform: "Linux",
            getHighEntropyValues: function(hints) {
              return Promise.resolve({
                architecture: "x86",
                bitness: "64",
                brands: this.brands,
                fullVersionList: [
                  { brand: "Chromium", version: "#{ua_version}" },
                  { brand: "Google Chrome", version: "#{ua_version}" }
                ],
                mobile: false,
                model: "",
                platform: "Linux",
                platformVersion: "6.1.0",
                uaFullVersion: "#{ua_version}"
              });
            }
          })
        });
      }
    }

    if (typeof WebGLRenderingContext !== "undefined") {
      const getParameterProto = WebGLRenderingContext.prototype.getParameter;
      WebGLRenderingContext.prototype.getParameter = function(param) {
        const debugExt = this.getExtension('WEBGL_debug_renderer_info');
        if (debugExt) {
          if (param === debugExt.UNMASKED_VENDOR_WEBGL) return 'Google Inc. (Intel)';
          if (param === debugExt.UNMASKED_RENDERER_WEBGL)
            return 'ANGLE (Intel, Mesa Intel(R) UHD Graphics 630 (CFL GT2), OpenGL 4.6)';
        }
        return getParameterProto.call(this, param);
      };
    }

    Object.defineProperty(screen, "width", { get: () => #{@viewport.fetch(:width)} });
    Object.defineProperty(screen, "height", { get: () => #{@viewport.fetch(:height)} });
    Object.defineProperty(screen, "availWidth", { get: () => #{@viewport.fetch(:width)} });
    Object.defineProperty(screen, "availHeight", { get: () => #{@viewport.fetch(:height)} });
  JS
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FetchUtil
  class Browser
    # Aggregates the navigation mixins. Each sub-module is registered for
    # autoloading and then included, so including Navigation brings in both.
    module Navigation
      {
        HeadersAndReadiness: "fetch_util/browser/navigation/headers_and_readiness",
        NavigatorPatch: "fetch_util/browser/navigation/navigator_patch"
      }.each { |const_name, path| autoload const_name, path }

      include HeadersAndReadiness
      include NavigatorPatch
    end
  end
end
|