auspex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/agent/actions.d.ts +5 -0
- package/dist/agent/actions.d.ts.map +1 -0
- package/dist/agent/actions.js +26 -0
- package/dist/agent/actions.js.map +1 -0
- package/dist/agent/agent.d.ts +12 -0
- package/dist/agent/agent.d.ts.map +1 -0
- package/dist/agent/agent.js +147 -0
- package/dist/agent/agent.js.map +1 -0
- package/dist/agent/loop.d.ts +6 -0
- package/dist/agent/loop.d.ts.map +1 -0
- package/dist/agent/loop.js +165 -0
- package/dist/agent/loop.js.map +1 -0
- package/dist/agent/report.d.ts +3 -0
- package/dist/agent/report.d.ts.map +1 -0
- package/dist/agent/report.js +90 -0
- package/dist/agent/report.js.map +1 -0
- package/dist/browser/executor.d.ts +5 -0
- package/dist/browser/executor.d.ts.map +1 -0
- package/dist/browser/executor.js +33 -0
- package/dist/browser/executor.js.map +1 -0
- package/dist/browser/snapshot.d.ts +6 -0
- package/dist/browser/snapshot.d.ts.map +1 -0
- package/dist/browser/snapshot.js +145 -0
- package/dist/browser/snapshot.js.map +1 -0
- package/dist/config/defaults.d.ts +10 -0
- package/dist/config/defaults.d.ts.map +1 -0
- package/dist/config/defaults.js +10 -0
- package/dist/config/defaults.js.map +1 -0
- package/dist/config/schema.d.ts +59 -0
- package/dist/config/schema.d.ts.map +1 -0
- package/dist/config/schema.js +23 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/client.d.ts +23 -0
- package/dist/llm/client.d.ts.map +1 -0
- package/dist/llm/client.js +51 -0
- package/dist/llm/client.js.map +1 -0
- package/dist/llm/prompt.d.ts +3 -0
- package/dist/llm/prompt.d.ts.map +1 -0
- package/dist/llm/prompt.js +36 -0
- package/dist/llm/prompt.js.map +1 -0
- package/dist/scraper/extractors/content.d.ts +22 -0
- package/dist/scraper/extractors/content.d.ts.map +1 -0
- package/dist/scraper/extractors/content.js +237 -0
- package/dist/scraper/extractors/content.js.map +1 -0
- package/dist/scraper/extractors/ssr.d.ts +17 -0
- package/dist/scraper/extractors/ssr.d.ts.map +1 -0
- package/dist/scraper/extractors/ssr.js +162 -0
- package/dist/scraper/extractors/ssr.js.map +1 -0
- package/dist/scraper/extractors/to-markdown.d.ts +5 -0
- package/dist/scraper/extractors/to-markdown.d.ts.map +1 -0
- package/dist/scraper/extractors/to-markdown.js +103 -0
- package/dist/scraper/extractors/to-markdown.js.map +1 -0
- package/dist/scraper/index.d.ts +27 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +178 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/scraper/tiers/tier1-http.d.ts +5 -0
- package/dist/scraper/tiers/tier1-http.d.ts.map +1 -0
- package/dist/scraper/tiers/tier1-http.js +120 -0
- package/dist/scraper/tiers/tier1-http.js.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts +5 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.js +106 -0
- package/dist/scraper/tiers/tier2-stealth.js.map +1 -0
- package/dist/scraper/tiers/tier3-browser.d.ts +10 -0
- package/dist/scraper/tiers/tier3-browser.d.ts.map +1 -0
- package/dist/scraper/tiers/tier3-browser.js +504 -0
- package/dist/scraper/tiers/tier3-browser.js.map +1 -0
- package/dist/scraper/types.d.ts +130 -0
- package/dist/scraper/types.d.ts.map +1 -0
- package/dist/scraper/types.js +3 -0
- package/dist/scraper/types.js.map +1 -0
- package/dist/security/action-validator.d.ts +83 -0
- package/dist/security/action-validator.d.ts.map +1 -0
- package/dist/security/action-validator.js +36 -0
- package/dist/security/action-validator.js.map +1 -0
- package/dist/security/url-validator.d.ts +9 -0
- package/dist/security/url-validator.d.ts.map +1 -0
- package/dist/security/url-validator.js +69 -0
- package/dist/security/url-validator.js.map +1 -0
- package/dist/types.d.ts +95 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +54 -0
- package/readme.md +760 -0
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
import { chromium } from "playwright";
|
|
2
|
+
import { extractContent } from "../extractors/content.js";
|
|
3
|
+
import { htmlToMarkdown } from "../extractors/to-markdown.js";
|
|
4
|
+
// ─── Tier 3: Playwright Chromium (fallback final) ──────────────────────────
|
|
5
|
+
//
|
|
6
|
+
// Acionado quando Tier 1 (HTTP) e Tier 2 (Stealth HTTP) falham.
|
|
7
|
+
// Casos típicos: SPAs complexas, anti-bot pesado (Cloudflare, Akamai, etc.).
|
|
8
|
+
//
|
|
9
|
+
// Estratégias aplicadas:
|
|
10
|
+
// 1. Stealth scripts injetados antes de qualquer script da página
|
|
11
|
+
// 2. Interceptar chamadas de API JSON (melhor para SPAs — dados diretos)
|
|
12
|
+
// 3. Bloquear recursos desnecessários (fonts, media, analytics)
|
|
13
|
+
// 4. Aguardar networkidle ou seletor específico
|
|
14
|
+
// 5. Extrair DOM completo e converter para Markdown
|
|
15
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
16
|
+
// User-Agent string of a desktop Chrome on Windows; sent on every browser
// context created by this tier. The Chrome major version is hard-coded, so it
// needs a manual bump every few Chrome releases.
const CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36";

// Command-line switches handed to Chromium at launch. Order is preserved from
// the original list; the grouping comments are descriptive only.
const STEALTH_ARGS = [
    // Automation / isolation related switches.
    "--disable-blink-features=AutomationControlled",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-infobars",
    "--no-first-run",

    // Process model and sandbox (NOTE(review): these turn the Chromium sandbox
    // off — presumably for container/root environments; confirm that is intended).
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--no-zygote",
    "--disable-gpu",

    // Window geometry; same size as the context viewport used when scraping.
    "--window-size=1920,1080",

    // Background services and UI features that are switched off.
    "--disable-background-networking",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-default-apps",
    "--disable-domain-reliability",
    "--disable-extensions",
    "--disable-hang-monitor",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-sync",
    "--metrics-recording-only",
    "--safebrowsing-disable-auto-update",
];
|
|
45
|
+
// ─── Init script (registered with page.addInitScript before navigation) ──────
//
// Source text of an IIFE that is evaluated inside the page. It patches, in
// this order:
//
//  1. navigator.webdriver            → getter returning undefined
//  2. navigator.plugins              → PluginArray-shaped object with 3 plugins
//  3. navigator hardware/locale props → languages, hardwareConcurrency,
//                                       deviceMemory, maxTouchPoints, vendor, platform
//  4. window.chrome                  → app, runtime, loadTimes, csi (only if missing)
//  5. Notification.permission        → 'default'
//  6. navigator.permissions.query    → 'prompt' for { name: 'notifications' }
//  7. HTMLCanvasElement.toDataURL    → flips the low bit of one base64 symbol
//                                       near the end of the payload. The change is
//                                       deterministic (same input → same output) and
//                                       always stays inside the base64 alphabet.
//  8. WebGL getParameter             → fixed UNMASKED_VENDOR/RENDERER strings
//  9. screen.colorDepth / pixelDepth → 24
// 10. Deletes a fixed list of globals left by other automation tools
//
// Sections 1-4 and 6-8 are not wrapped in try/catch: if one of them throws,
// the later sections do not run.
// ──────────────────────────────────────────────────────────────────────────────
const STEALTH_INIT_SCRIPT = /* language=javascript */ `
(function () {
  // ── 1. Remove a flag mais básica de automação ─────────────────────────
  Object.defineProperty(navigator, 'webdriver', {
    get: () => undefined,
    configurable: true,
  });

  // ── 2. Plugins realistas de um Chrome normal ──────────────────────────
  // navigator.plugins.length === 0 é o maior red-flag de headless.
  const makeMime = (type, suffixes, desc, plugin) => {
    const mt = Object.create(MimeType.prototype);
    Object.defineProperties(mt, {
      type: { value: type, enumerable: true },
      suffixes: { value: suffixes, enumerable: true },
      description: { value: desc, enumerable: true },
      enabledPlugin: { value: plugin, enumerable: true },
    });
    return mt;
  };

  const makePlugin = (name, desc, filename, mimeSpecs) => {
    const p = Object.create(Plugin.prototype);
    Object.defineProperties(p, {
      name: { value: name, enumerable: true },
      description: { value: desc, enumerable: true },
      filename: { value: filename, enumerable: true },
      length: { value: mimeSpecs.length },
    });
    mimeSpecs.forEach((spec, i) => {
      const mt = makeMime(spec.type, spec.suffixes, spec.desc, p);
      Object.defineProperty(p, i, { value: mt, enumerable: true });
      Object.defineProperty(p, spec.type, { value: mt });
    });
    p.item = (i) => p[i] ?? null;
    p.namedItem = (n) => p[n] ?? null;
    return p;
  };

  const pdfViewer = makePlugin(
    'PDF Viewer', 'Portable Document Format', 'internal-pdf-viewer',
    [
      { type: 'application/pdf', suffixes: 'pdf', desc: '' },
      { type: 'text/pdf', suffixes: 'pdf', desc: '' },
    ],
  );
  const chromePDF = makePlugin(
    'Chrome PDF Viewer', '', 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
    [{ type: 'application/pdf', suffixes: 'pdf', desc: '' }],
  );
  const nacl = makePlugin(
    'Native Client', '', 'internal-nacl-plugin',
    [
      { type: 'application/x-nacl', suffixes: '', desc: 'Native Client Executable' },
      { type: 'application/x-pnacl', suffixes: '', desc: 'Portable Native Client Executable' },
    ],
  );

  const pluginList = [pdfViewer, chromePDF, nacl];
  const pa = Object.create(PluginArray.prototype);
  Object.defineProperty(pa, 'length', { value: pluginList.length });
  pluginList.forEach((plug, i) => {
    Object.defineProperty(pa, i, { value: plug, enumerable: true });
    Object.defineProperty(pa, plug.name, { value: plug });
  });
  pa.item = (i) => pluginList[i] ?? null;
  pa.namedItem = (n) => pa[n] ?? null;
  pa.refresh = () => {};

  Object.defineProperty(navigator, 'plugins', { get: () => pa });

  // ── 3. Propriedades de hardware realistas ─────────────────────────────
  Object.defineProperty(navigator, 'languages', { get: () => ['pt-BR', 'pt', 'en-US', 'en'] });
  Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
  Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
  Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });
  Object.defineProperty(navigator, 'vendor', { get: () => 'Google Inc.' });
  Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });

  // ── 4. window.chrome — objeto completo como Chrome real ──────────────
  // Automação headless deixa window.chrome undefined ou com .runtime vazio.
  if (!window.chrome) window.chrome = {};

  if (!window.chrome.app) {
    window.chrome.app = {
      isInstalled: false,
      getDetails: () => null,
      getIsInstalled: () => false,
      InstallState: { DISABLED: 'disabled', INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
      RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' },
    };
  }

  if (!window.chrome.runtime) {
    window.chrome.runtime = {
      id: undefined,
      connect: () => { throw Object.assign(new Error('Could not establish connection.'), { message: 'Could not establish connection. Receiving end does not exist.' }); },
      sendMessage: () => { throw Object.assign(new Error('Could not establish connection.'), { message: 'Could not establish connection. Receiving end does not exist.' }); },
      PlatformOs: { MAC: 'mac', WIN: 'win', ANDROID: 'android', CROS: 'cros', LINUX: 'linux', OPENBSD: 'openbsd' },
      PlatformArch: { ARM: 'arm', ARM64: 'arm64', X86_32: 'x86-32', X86_64: 'x86-64', MIPS: 'mips', MIPS64: 'mips64' },
    };
  }

  if (!window.chrome.loadTimes) {
    window.chrome.loadTimes = () => {
      const now = Date.now() / 1000;
      return {
        requestTime: now - 1.5 - Math.random() * 0.5,
        startLoadTime: now - 1.2 - Math.random() * 0.3,
        commitLoadTime: now - 0.8 - Math.random() * 0.2,
        finishDocumentLoadTime: now - 0.3 - Math.random() * 0.1,
        finishLoadTime: now - 0.1 - Math.random() * 0.05,
        firstPaintTime: now - 0.9 - Math.random() * 0.2,
        firstPaintAfterLoadTime: now - 0.05,
        navigationType: 'Other',
        wasFetchedViaSpdy: true,
        wasNpnNegotiated: true,
        npnNegotiatedProtocol: 'h2',
        wasAlternateProtocolAvailable: false,
        connectionInfo: 'h2',
      };
    };
  }

  if (!window.chrome.csi) {
    window.chrome.csi = () => ({
      startE: Date.now() - 1000,
      onloadT: Date.now(),
      pageT: 500 + Math.random() * 1000,
      tran: 15,
    });
  }

  // ── 5. Notification API — headless retorna 'denied', real retorna 'default' ─
  try {
    if (typeof Notification !== 'undefined') {
      Object.defineProperty(Notification, 'permission', { get: () => 'default' });
    }
  } catch (_) {}

  // ── 6. Permission API — 'notifications' deve retornar 'prompt' ────────
  if (navigator.permissions) {
    const origQuery = navigator.permissions.query.bind(navigator.permissions);
    navigator.permissions.query = (params) => {
      if (params && params.name === 'notifications') {
        return Promise.resolve({ state: 'prompt', onchange: null, addEventListener: () => {}, removeEventListener: () => {}, dispatchEvent: () => true });
      }
      return origQuery(params);
    };
  }

  // ── 7. Canvas fingerprint — perturb one base64 symbol near the end ────
  // The low bit of the symbol's 6-bit VALUE is flipped (not the raw char
  // code), and '=' padding is skipped, so the result is always valid base64.
  // XOR-ing the char code could emit '<', '@', '[' etc. and break the data URL.
  // The perturbation is deterministic: the same canvas yields the same output.
  const B64_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
  const _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
  HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
    const data = _origToDataURL.call(this, type, quality);
    if (data.length < 12) return data;
    let idx = data.length - 2;
    while (idx > 0 && data[idx] === '=') idx--;
    const sym = B64_ALPHABET.indexOf(data[idx]);
    if (sym === -1) return data;
    return data.slice(0, idx) + B64_ALPHABET[sym ^ 0x01] + data.slice(idx + 1);
  };

  // ── 8. WebGL — GPU Intel realista em vez de llvmpipe/SwiftShader ─────
  // llvmpipe/SwiftShader = fingerprint de VM detectado por todos os anti-bots.
  const WEBGL_VENDOR = 'Google Inc. (Intel)';
  const WEBGL_RENDERER = 'ANGLE (Intel, Intel(R) UHD Graphics 620 Direct3D11 vs_5_0 ps_5_0, D3D11)';

  const patchWebGL = (Ctx) => {
    if (!Ctx) return;
    const orig = Ctx.prototype.getParameter;
    Ctx.prototype.getParameter = function (param) {
      if (param === 37445) return WEBGL_VENDOR; // UNMASKED_VENDOR_WEBGL
      if (param === 37446) return WEBGL_RENDERER; // UNMASKED_RENDERER_WEBGL
      return orig.call(this, param);
    };
  };

  patchWebGL(typeof WebGLRenderingContext !== 'undefined' ? WebGLRenderingContext : null);
  patchWebGL(typeof WebGL2RenderingContext !== 'undefined' ? WebGL2RenderingContext : null);

  // ── 9. Screen depth realista ──────────────────────────────────────────
  try {
    Object.defineProperty(screen, 'colorDepth', { get: () => 24 });
    Object.defineProperty(screen, 'pixelDepth', { get: () => 24 });
  } catch (_) {}

  // ── 10. Remove artefatos de outras ferramentas de automação ──────────
  const automationVars = ['__nightmare', '_phantom', 'callPhantom',
    '__selenium_evaluate', '__webdriver_evaluate', '_Selenium_IDE_Recorder',
    '__webdriver_script_fn', '__lastWatirAlert', '__lastWatirConfirm'];
  automationVars.forEach(v => { try { delete window[v]; } catch (_) {} });

})();
`;
|
|
255
|
+
// Playwright resource types that the request router aborts outright.
// Images are included because the scraper only extracts text/markdown.
const BLOCKED_RESOURCE_TYPES = new Set([
    "font",
    "media",
    "image",
]);

// URL substrings identifying analytics, tracking and ad endpoints. They are
// matched with String.prototype.includes against the full request URL: scripts
// from these hosts are aborted, and JSON responses from them are not recorded.
const BLOCKED_URL_PATTERNS = [
    // Analytics and tag managers
    "google-analytics.com",
    "googletagmanager.com",
    "facebook.net/en_US/fbevents.js",
    "connect.facebook.net",
    // Session replay and product analytics
    "hotjar.com",
    "fullstory.com",
    "segment.com",
    "mixpanel.com",
    "amplitude.com",
    // Error reporting and heatmaps
    "sentry.io",
    "clarity.ms",
    // Advertising networks
    "doubleclick.net",
    "adnxs.com",
    "criteo.com",
    "taboola.com",
    "outbrain.com",
];
|
|
277
|
+
/**
 * Tier 3 scraper: loads a page in a Playwright-driven Chromium and returns its
 * extracted content.
 *
 * One browser process is launched lazily and reused across scrape() calls;
 * every scrape() runs in its own browser context, which is always closed
 * before scrape() settles. Call close() to shut the browser down.
 */
export class Tier3Browser {
    /** Shared browser instance; null until the first launch and after close(). */
    browser = null;
    /** Caller-supplied launch settings. Reads: headless, executablePath, channel. */
    browserConfig;
    /**
     * Promise of the launch currently in progress, or null. Lets concurrent
     * getBrowser() callers share one launch instead of each starting (and all
     * but the last leaking) a browser process.
     */
    #pendingLaunch = null;

    /**
     * @param {object} [browserConfig] Launch settings (headless, executablePath, channel).
     */
    constructor(browserConfig = {}) {
        this.browserConfig = browserConfig;
    }

    // ── Browser lifecycle (lazily launched, reused) ────────────────────────

    /**
     * Returns the shared browser, launching it when there is none or the
     * existing one is no longer connected.
     *
     * @returns {Promise<object>} The Playwright browser.
     * @throws Whatever chromium.launch rejects with when the final launch fails.
     */
    async getBrowser() {
        if (this.browser?.isConnected())
            return this.browser;
        // Share a launch that is already in flight rather than starting another.
        this.#pendingLaunch ??= this.#launchBrowser().finally(() => {
            this.#pendingLaunch = null;
        });
        return this.#pendingLaunch;
    }

    /**
     * Launches Chromium and stores it in this.browser.
     *
     * Executable resolution order:
     *   1. browserConfig.executablePath, when set;
     *   2. browserConfig.channel, when set;
     *   3. otherwise try channel "chrome" first and, if that launch fails for
     *      any reason, fall back to a launch without channel/executablePath.
     */
    async #launchBrowser() {
        const launchOptions = {
            headless: this.browserConfig.headless ?? true,
            args: STEALTH_ARGS,
        };
        if (this.browserConfig.executablePath) {
            launchOptions.executablePath = this.browserConfig.executablePath;
        }
        else if (this.browserConfig.channel) {
            launchOptions.channel = this.browserConfig.channel;
        }
        else {
            try {
                this.browser = await chromium.launch({ ...launchOptions, channel: "chrome" });
                return this.browser;
            }
            catch {
                // Deliberate best-effort: the "chrome" channel launch failed,
                // so fall through to the default launch below.
            }
        }
        this.browser = await chromium.launch(launchOptions);
        return this.browser;
    }

    // ── Main scrape ────────────────────────────────────────────────────────

    /**
     * Loads `url` in a fresh browser context and extracts its content.
     *
     * Options read by this method:
     *   - headers: extra HTTP headers, merged over the default accept-language
     *   - interceptAPIs: pass false to skip recording JSON responses
     *   - timeout: base timeout in ms (default 30000); navigation is capped at
     *     30000 ms and the network-idle wait at min(timeout / 2, 15000) ms
     *   - waitForSelector: selector to wait for (visible, up to 10000 ms)
     *   - formats: any of "markdown", "html", "text" (default markdown + text)
     *   - onlyMainContent: forwarded to extractContent (default true)
     *
     * @param {string} url Address to load.
     * @param {object} [options] See above.
     * @returns {Promise<object>} Result with url, statusCode, title, description,
     *   tier ("browser"), durationMs, links, interceptedAPIs and the requested formats.
     * @throws {Error} When both navigation attempts fail; the last navigation
     *   error is attached as `cause`.
     */
    async scrape(url, options = {}) {
        const startTime = Date.now();
        const browser = await this.getBrowser();
        let context = null;
        try {
            // NOTE(review): the original comment here said WebRTC is disabled
            // via the context, but no WebRTC-related option is actually set.
            context = await browser.newContext({
                userAgent: CHROME_UA,
                viewport: { width: 1920, height: 1080 },
                locale: "pt-BR",
                timezoneId: "America/Sao_Paulo",
                extraHTTPHeaders: {
                    "accept-language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
                    ...(options.headers ?? {}),
                },
                javaScriptEnabled: true,
            });
            const page = await context.newPage();

            // Everything below must be registered before navigation starts.
            await page.addInitScript(STEALTH_INIT_SCRIPT);
            await Tier3Browser.#blockUnneededRequests(page);
            const interceptedAPIs = [];
            if (options.interceptAPIs !== false) {
                Tier3Browser.#captureJsonResponses(page, interceptedAPIs);
            }
            // Auto-dismiss alert/confirm/prompt so they cannot stall the page.
            page.on("dialog", (dialog) => dialog.dismiss().catch(() => { }));

            const timeout = options.timeout ?? 30_000;
            const statusCode = await Tier3Browser.#navigateWithRetry(page, url, timeout);

            // Wait for the network to go quiet; a timeout here is acceptable
            // and we continue with whatever has loaded.
            await page
                .waitForLoadState("networkidle", {
                timeout: Math.min(timeout * 0.5, 15_000),
            })
                .catch(() => { });

            // Optional caller-provided selector; not appearing is not fatal.
            if (options.waitForSelector) {
                await page
                    .waitForSelector(options.waitForSelector, {
                    state: "visible",
                    timeout: 10_000,
                })
                    .catch(() => { });
            }

            await Tier3Browser.#scrollThroughPage(page);

            // ── Content extraction ────────────────────────────────────────
            const [html, pageTitle] = await Promise.all([
                page.content(),
                page.title(),
            ]);
            const finalUrl = page.url();
            const formats = options.formats ?? ["markdown", "text"];
            const extracted = extractContent(html, options.onlyMainContent ?? true, finalUrl);
            const result = {
                url: finalUrl,
                statusCode,
                title: pageTitle || extracted.title,
                description: extracted.description || undefined,
                tier: "browser",
                durationMs: Date.now() - startTime,
                links: extracted.links.length > 0 ? extracted.links : undefined,
                interceptedAPIs: interceptedAPIs.length > 0 ? interceptedAPIs : undefined,
            };
            if (formats.includes("markdown"))
                result.markdown = htmlToMarkdown(extracted.html);
            if (formats.includes("html"))
                result.html = extracted.html;
            if (formats.includes("text"))
                result.text = extracted.text;
            return result;
        }
        finally {
            await context?.close().catch(() => { });
        }
    }

    /**
     * Installs a catch-all route that aborts requests whose resource type is
     * in BLOCKED_RESOURCE_TYPES, and script requests whose URL contains one of
     * BLOCKED_URL_PATTERNS. Everything else continues unchanged.
     */
    static async #blockUnneededRequests(page) {
        await page.route("**/*", (route) => {
            const req = route.request();
            const type = req.resourceType();
            if (BLOCKED_RESOURCE_TYPES.has(type)) {
                return route.abort();
            }
            const reqUrl = req.url();
            if (type === "script" &&
                BLOCKED_URL_PATTERNS.some((p) => reqUrl.includes(p))) {
                return route.abort();
            }
            return route.continue();
        });
    }

    /**
     * Records JSON responses seen by `page` into `sink`.
     *
     * A response is skipped when its content-type lacks "application/json",
     * its URL contains a blocked pattern or points at a static asset, its
     * declared content-length exceeds 500000 bytes, or its body is empty or
     * fails to parse. Errors inside the listener are ignored on purpose
     * (best-effort capture).
     */
    static #captureJsonResponses(page, sink) {
        page.on("response", async (response) => {
            try {
                const headers = response.headers();
                const contentType = headers["content-type"] ?? "";
                if (!contentType.includes("application/json"))
                    return;
                const apiUrl = response.url();
                if (BLOCKED_URL_PATTERNS.some((p) => apiUrl.includes(p)))
                    return;
                if (Tier3Browser.#isStaticAssetUrl(apiUrl))
                    return;
                // A missing content-length is treated as 0 and passes this check.
                const contentLength = Number.parseInt(headers["content-length"] ?? "0", 10);
                if (contentLength > 500_000)
                    return;
                const data = await response.json().catch(() => null);
                if (!data)
                    return;
                sink.push({
                    url: apiUrl,
                    method: response.request().method(),
                    statusCode: response.status(),
                    contentType,
                    data,
                });
            }
            catch {
                // Response unavailable or unparsable: ignore.
            }
        });
    }

    /**
     * True when the URL's path ends in a static-asset extension.
     *
     * Only the pathname is tested and the match is anchored at its end, so
     * "/data.json" or a host such as "cdn.jsdelivr.net" is no longer mistaken
     * for a ".js" asset (the previous unanchored check dropped those).
     */
    static #isStaticAssetUrl(rawUrl) {
        let path;
        try {
            path = new URL(rawUrl).pathname;
        }
        catch {
            // Not an absolute URL: strip query string and fragment by hand.
            path = rawUrl.split(/[?#]/, 1)[0];
        }
        return /\.(?:js|css|png|jpg|gif|svg|woff2?)$/i.test(path);
    }

    /**
     * Navigates to `url`, retrying once after a 1500 ms pause when the first
     * attempt throws.
     *
     * @returns {Promise<number>} HTTP status of the navigation response, or
     *   200 when goto resolves without a response object.
     * @throws {Error} When both attempts fail (last error attached as `cause`).
     */
    static async #navigateWithRetry(page, url, timeout) {
        let lastNavError = null;
        for (let attempt = 1; attempt <= 2; attempt++) {
            try {
                const navResponse = await page.goto(url, {
                    waitUntil: "domcontentloaded",
                    timeout: Math.min(timeout, 30_000),
                });
                return navResponse?.status() ?? 200;
            }
            catch (navErr) {
                lastNavError = navErr instanceof Error ? navErr : new Error(String(navErr));
                if (attempt < 2) {
                    await page.waitForTimeout(1_500).catch(() => { });
                }
            }
        }
        throw new Error(`Tier3 Browser: falha na navegação — ${lastNavError.message}`, { cause: lastNavError });
    }

    /**
     * Scrolls the page from top to bottom in steps and then back to the top,
     * so content that loads on scroll gets a chance to load before extraction.
     * Failures of the in-page script are ignored.
     */
    static async #scrollThroughPage(page) {
        await page
            .evaluate(() => {
            return new Promise((resolve) => {
                const totalHeight = document.body.scrollHeight;
                if (totalHeight <= window.innerHeight) {
                    resolve();
                    return;
                }
                // About six steps per page, never smaller than 300px.
                const step = Math.max(Math.floor(totalHeight / 6), 300);
                let scrolled = 0;
                const tick = () => {
                    scrolled += step;
                    window.scrollTo({ top: scrolled, behavior: "smooth" });
                    if (scrolled < totalHeight) {
                        // Randomised 120-249 ms pause between steps.
                        setTimeout(tick, 120 + Math.floor(Math.random() * 130));
                    }
                    else {
                        window.scrollTo({ top: 0, behavior: "instant" });
                        resolve();
                    }
                };
                setTimeout(tick, 400);
            });
        })
            .catch(() => { });
    }

    // ── Shutdown ───────────────────────────────────────────────────────────

    /**
     * Closes the shared browser, if any, and clears the reference. Errors
     * raised while closing are ignored.
     */
    async close() {
        if (this.browser) {
            await this.browser.close().catch(() => { });
            this.browser = null;
        }
    }
}
|
|
504
|
+
//# sourceMappingURL=tier3-browser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier3-browser.js","sourceRoot":"","sources":["../../../src/scraper/tiers/tier3-browser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAqC,MAAM,YAAY,CAAC;AAOzE,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAE9D,8EAA8E;AAC9E,EAAE;AACF,gEAAgE;AAChE,6EAA6E;AAC7E,EAAE;AACF,yBAAyB;AACzB,oEAAoE;AACpE,2EAA2E;AAC3E,kEAAkE;AAClE,kDAAkD;AAClD,sDAAsD;AACtD,6EAA6E;AAE7E,2EAA2E;AAC3E,gDAAgD;AAChD,MAAM,SAAS,GACb,iHAAiH,CAAC;AAEpH,mDAAmD;AACnD,MAAM,YAAY,GAAG;IACnB,+CAA+C;IAC/C,oDAAoD;IACpD,oBAAoB;IACpB,gBAAgB;IAChB,cAAc;IACd,0BAA0B;IAC1B,yBAAyB;IACzB,iCAAiC;IACjC,aAAa;IACb,eAAe;IACf,yBAAyB;IACzB,iCAAiC;IACjC,0CAA0C;IAC1C,4BAA4B;IAC5B,wBAAwB;IACxB,8BAA8B;IAC9B,sBAAsB;IACtB,wBAAwB;IACxB,0BAA0B;IAC1B,4BAA4B;IAC5B,gBAAgB;IAChB,0BAA0B;IAC1B,oCAAoC;CACrC,CAAC;AAEF,iFAAiF;AACjF,EAAE;AACF,6DAA6D;AAC7D,8DAA8D;AAC9D,EAAE;AACF,8DAA8D;AAC9D,yEAAyE;AACzE,2FAA2F;AAC3F,oFAAoF;AACpF,2EAA2E;AAC3E,iEAAiE;AACjE,yFAAyF;AACzF,2FAA2F;AAC3F,6CAA6C;AAC7C,6FAA6F;AAC7F,iFAAiF;AACjF,MAAM,mBAAmB,GAAG,yBAAyB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAiMrD,CAAC;AAEF,uDAAuD;AACvD,4EAA4E;AAC5E,MAAM,sBAAsB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;AAEnE,+CAA+C;AAC/C,MAAM,oBAAoB,GAAG;IAC3B,sBAAsB;IACtB,sBAAsB;IACtB,gCAAgC;IAChC,sBAAsB;IACtB,YAAY;IACZ,eAAe;IACf,aAAa;IACb,cAAc;IACd,eAAe;IACf,WAAW;IACX,YAAY;IACZ,iBAAiB;IACjB,WAAW;IACX,YAAY;IACZ,aAAa;IACb,cAAc;CACf,CAAC;AAEF,MAAM,OAAO,YAAY;IACf,OAAO,GAAmB,IAAI,CAAC;IACtB,aAAa,CAAgD;IAE9E,YAAY,gBAAkD,EAAE;QAC9D,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;IACrC,CAAC;IAED,0EAA0E;IAElE,KAAK,CAAC,UAAU;QACtB,IAAI,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE;YAAE,OAAO,IAAI,CAAC,OAAO,CAAC;QAErD,MAAM,aAAa,GAA0C;YAC3D,QAAQ,EAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,IAAI,IAAI;YAC7C,IAAI,EAAE,YAAY;SACnB,CAAC;QAEF,yCAAyC;QACzC,uDAAuD;QACvD,sDAAsD;QACtD,oEAAoE;QACpE,IAAI,IAAI,CAAC,aAAa,CAAC
,cAAc,EAAE,CAAC;YACtC,aAAa,CAAC,cAAc,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC;QACnE,CAAC;aAAM,IAAI,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;YACtC,aAAa,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,0EAA0E;YAC1E,IAAI,CAAC;gBACH,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,GAAG,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;gBAC9E,OAAO,IAAI,CAAC,OAAO,CAAC;YACtB,CAAC;YAAC,MAAM,CAAC;gBACP,kEAAkE;YACpE,CAAC;QACH,CAAC;QAED,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;QACpD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED,0EAA0E;IAE1E,KAAK,CAAC,MAAM,CAAC,GAAW,EAAE,UAAyB,EAAE;QACnD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QAExC,IAAI,OAAO,GAA0B,IAAI,CAAC;QAE1C,IAAI,CAAC;YACH,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;gBACjC,SAAS,EAAE,SAAS;gBACpB,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;gBACvC,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,mBAAmB;gBAC/B,gBAAgB,EAAE;oBAChB,iBAAiB,EAAE,qCAAqC;oBACxD,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC;iBAC3B;gBACD,iBAAiB,EAAE,IAAI;gBACvB,4EAA4E;gBAC5E,6DAA6D;aAC9D,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAErC,qEAAqE;YACrE,MAAM,IAAI,CAAC,aAAa,CAAC,mBAAmB,CAAC,CAAC;YAE9C,qEAAqE;YACrE,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;gBACjC,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;gBAC5B,MAAM,IAAI,GAAG,GAAG,CAAC,YAAY,EAAE,CAAC;gBAChC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC;gBAEzB,IAAI,sBAAsB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACrC,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;gBACvB,CAAC;gBACD,IACE,IAAI,KAAK,QAAQ;oBACjB,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EACpD,CAAC;oBACD,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;gBACvB,CAAC;gBAED,OAAO,KAAK,CAAC,QAAQ,EAAE,CAAC;YAC1B,CAAC,CAAC,CAAC;YAEH,qEAAqE;YACrE,MAAM,eAAe,GAAqB,EAAE,CAAC;YAC7C,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK,CAAC;YAExD,IAAI,eAAe,EAAE,CAAC;gBACpB,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;oBACrC,IAAI,CAAC;wBACH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,CA
AC;wBAC7D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,kBAAkB,CAAC;4BAAE,OAAO;wBAEtD,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;wBAC9B,qCAAqC;wBACrC,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;4BAAE,OAAO;wBACjE,IAAI,iCAAiC,CAAC,IAAI,CAAC,MAAM,CAAC;4BAAE,OAAO;wBAE3D,uEAAuE;wBACvE,MAAM,aAAa,GAAG,QAAQ,CAC5B,QAAQ,CAAC,OAAO,EAAE,CAAC,gBAAgB,CAAC,IAAI,GAAG,EAC3C,EAAE,CACH,CAAC;wBACF,IAAI,aAAa,GAAG,OAAO;4BAAE,OAAO;wBAEpC,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC;wBACrD,IAAI,CAAC,IAAI;4BAAE,OAAO;wBAElB,eAAe,CAAC,IAAI,CAAC;4BACnB,GAAG,EAAE,MAAM;4BACX,MAAM,EAAE,QAAQ,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE;4BACnC,UAAU,EAAE,QAAQ,CAAC,MAAM,EAAE;4BAC7B,WAAW;4BACX,IAAI;yBACL,CAAC,CAAC;oBACL,CAAC;oBAAC,MAAM,CAAC;wBACP,mEAAmE;oBACrE,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC;YAED,0EAA0E;YAC1E,IAAI,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC;YAEhE,sEAAsE;YACtE,6EAA6E;YAC7E,sEAAsE;YACtE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,MAAM,CAAC;YAC1C,IAAI,UAAU,GAAG,GAAG,CAAC;YACrB,IAAI,YAAY,GAAiB,IAAI,CAAC;YAEtC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,OAAO,EAAE,EAAE,CAAC;gBAC9C,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;wBACvC,SAAS,EAAE,kBAAkB;wBAC7B,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC;qBACnC,CAAC,CAAC;oBACH,UAAU,GAAG,WAAW,EAAE,MAAM,EAAE,IAAI,GAAG,CAAC;oBAC1C,YAAY,GAAG,IAAI,CAAC;oBACpB,MAAM,CAAC,wBAAwB;gBACjC,CAAC;gBAAC,OAAO,MAAM,EAAE,CAAC;oBAChB,YAAY,GAAG,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;oBAC5E,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;oBACnD,CAAC;gBACH,CAAC;YACH,CAAC;YAED,IAAI,YAAY,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,uCAAuC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC;YACjF,CAAC;YAED,qEAAqE;YACrE,sDAAsD;YACtD,MAAM,IAAI;iBACP,gBAAgB,CAAC,aAAa,EAAE;gBAC/B,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,EAAE,MA
AM,CAAC;aACzC,CAAC;iBACD,KAAK,CAAC,GAAG,EAAE;gBACV,qDAAqD;YACvD,CAAC,CAAC,CAAC;YAEL,8DAA8D;YAC9D,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,MAAM,IAAI;qBACP,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;oBACxC,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,MAAM;iBAChB,CAAC;qBACD,KAAK,CAAC,GAAG,EAAE;oBACV,kDAAkD;gBACpD,CAAC,CAAC,CAAC;YACP,CAAC;YAED,qEAAqE;YACrE,uEAAuE;YACvE,2EAA2E;YAC3E,kEAAkE;YAClE,MAAM,IAAI;iBACP,QAAQ,CAAC,GAAG,EAAE;gBACb,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;oBACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC;oBAC/C,IAAI,WAAW,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;wBACtC,OAAO,EAAE,CAAC;wBACV,OAAO;oBACT,CAAC;oBAED,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;oBACxD,IAAI,QAAQ,GAAG,CAAC,CAAC;oBAEjB,MAAM,IAAI,GAAG,GAAG,EAAE;wBAChB,QAAQ,IAAI,IAAI,CAAC;wBACjB,MAAM,CAAC,QAAQ,CAAC,EAAE,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAC;wBACvD,IAAI,QAAQ,GAAG,WAAW,EAAE,CAAC;4BAC3B,8DAA8D;4BAC9D,uDAAuD;4BACvD,UAAU,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC;wBAC1D,CAAC;6BAAM,CAAC;4BACN,MAAM,CAAC,QAAQ,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;4BACjD,OAAO,EAAE,CAAC;wBACZ,CAAC;oBACH,CAAC,CAAC;oBAEF,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBACxB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC;iBACD,KAAK,CAAC,GAAG,EAAE;gBACV,2DAA2D;YAC7D,CAAC,CAAC,CAAC;YAEL,qEAAqE;YACrE,MAAM,CAAC,IAAI,EAAE,SAAS,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAC1C,IAAI,CAAC,OAAO,EAAE;gBACd,IAAI,CAAC,KAAK,EAAE;aACb,CAAC,CAAC;YAEH,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC5B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;YACxD,MAAM,SAAS,GAAG,cAAc,CAC9B,IAAI,EACJ,OAAO,CAAC,eAAe,IAAI,IAAI,EAC/B,QAAQ,CACT,CAAC;YAEF,MAAM,MAAM,GAAiB;gBAC3B,GAAG,EAAE,QAAQ;gBACb,UAAU;gBACV,KAAK,EAAE,SAAS,IAAI,SAAS,CAAC,KAAK;gBACnC,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS;gBAC/C,IAAI,EAAE,SAAS;gBACf,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;gBAClC,KAAK,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBAC/D,eAAe,EACb,eAAe,
CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,SAAS;aAC3D,CAAC;YAEF,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC;gBAAE,MAAM,CAAC,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACnF,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;YAC/D,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;YAE/D,OAAO,MAAM,CAAC;QAChB,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,EAAE,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IAED,0EAA0E;IAE1E,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAC3C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Which tier was used to perform the scrape:
|
|
3
|
+
* - "http" → Tier 1: native fetch + Cheerio (no browser, ~100-500ms)
|
|
4
|
+
* - "stealth" → Tier 2: got-scraping with TLS fingerprint spoofing (~200-800ms)
|
|
5
|
+
* - "browser" → Tier 3: Playwright Chromium (full browser, final fallback)
|
|
6
|
+
*/
|
|
7
|
+
export type ScrapeTier = "http" | "stealth" | "browser";
|
|
8
|
+
/** Internal result returned by each tier (not exposed to the end user) */
|
|
9
|
+
export interface TierRawResult {
|
|
10
|
+
/** Full HTML of the page */
|
|
11
|
+
html: string;
|
|
12
|
+
/** Final URL after redirects */
|
|
13
|
+
finalUrl: string;
|
|
14
|
+
/** HTTP status code */
|
|
15
|
+
statusCode: number;
|
|
16
|
+
/**
|
|
17
|
+
* true = content is sufficient (no loading screen / anti-bot page).
|
|
18
|
+
* false = the next tier in the cascade should be tried.
|
|
19
|
+
*/
|
|
20
|
+
sufficient: boolean;
|
|
21
|
+
}
|
|
22
|
+
/** Global Firecrawl configuration */
|
|
23
|
+
export interface FirecrawlConfig {
|
|
24
|
+
/** Default timeout in ms. Default: 30_000 */
|
|
25
|
+
timeout?: number;
|
|
26
|
+
/**
|
|
27
|
+
* Force a specific tier for every scrape made by this instance.
|
|
28
|
+
* Can be overridden by an option passed to the `scrape()` call.
|
|
29
|
+
*/
|
|
30
|
+
forceTier?: ScrapeTier;
|
|
31
|
+
/** Allowed domains (anti-SSRF allowlist). Any URL outside the list is rejected. */
|
|
32
|
+
allowedDomains?: string[];
|
|
33
|
+
/** Blocked domains (anti-SSRF blocklist). */
|
|
34
|
+
blockedDomains?: string[];
|
|
35
|
+
/** Verbose logging showing which tier was used and why. Default: false */
|
|
36
|
+
verbose?: boolean;
|
|
37
|
+
/** Chromium browser settings (Tier 3) */
|
|
38
|
+
browserConfig?: {
|
|
39
|
+
/** Run in headless mode. Default: true */
|
|
40
|
+
headless?: boolean;
|
|
41
|
+
/**
|
|
42
|
+
* Channel of the browser installed on the system.
|
|
43
|
+
* Examples: 'chrome', 'chromium', 'msedge'.
|
|
44
|
+
* If omitted, tries 'chrome' (system) and then playwright-chromium.
|
|
45
|
+
*/
|
|
46
|
+
channel?: string;
|
|
47
|
+
/** Explicit path to the browser executable */
|
|
48
|
+
executablePath?: string;
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
/** Output formats for the content */
|
|
52
|
+
export type ContentFormat = "markdown" | "html" | "text";
|
|
53
|
+
/** Scraping options supplied by the user */
|
|
54
|
+
export interface ScrapeOptions {
|
|
55
|
+
/** Desired output formats. Default: ['markdown', 'text'] */
|
|
56
|
+
formats?: ContentFormat[];
|
|
57
|
+
/** Timeout in ms. Default: 30_000 */
|
|
58
|
+
timeout?: number;
|
|
59
|
+
/** Only try to extract the main content (removes nav, footer, ads). Default: true */
|
|
60
|
+
onlyMainContent?: boolean;
|
|
61
|
+
/** Extra HTTP headers */
|
|
62
|
+
headers?: Record<string, string>;
|
|
63
|
+
/**
|
|
64
|
+
* Force a specific tier (bypasses the automatic cascade).
|
|
65
|
+
* - "http" → HTTP + Cheerio only
|
|
66
|
+
* - "stealth" → skips HTTP, uses got-scraping directly
|
|
67
|
+
* - "browser" → goes straight to Playwright Chromium
|
|
68
|
+
*/
|
|
69
|
+
forceTier?: ScrapeTier;
|
|
70
|
+
/** Wait for this CSS selector to appear and be visible before extracting */
|
|
71
|
+
waitForSelector?: string;
|
|
72
|
+
/** Intercept JSON responses from the APIs called by the SPA */
|
|
73
|
+
interceptAPIs?: boolean;
|
|
74
|
+
}
|
|
75
|
+
/** SSR data embedded in <script> tags by the framework */
|
|
76
|
+
export interface SSRData {
|
|
77
|
+
/**
|
|
78
|
+
* Framework that produced the data:
|
|
79
|
+
* - "next" → Next.js (#__NEXT_DATA__)
|
|
80
|
+
* - "nuxt" → Nuxt 2/3 (window.__NUXT__)
|
|
81
|
+
* - "gatsby" → Gatsby (window.___gatsby)
|
|
82
|
+
* - "remix" → Remix (window.__remixContext)
|
|
83
|
+
* - "sveltekit" → SvelteKit (script[data-sveltekit-fetched] or window.__SVELTEKIT__)
|
|
84
|
+
* - "vue" → Vue SSR (window.__VUE_SSR_CONTEXT__ / window.__VUE_STORE__)
|
|
85
|
+
* - "angular" → Angular Universal (script#ng-state)
|
|
86
|
+
* - "tanstack" → TanStack Router / Start (window.__TSR_DEHYDRATED__)
|
|
87
|
+
* - "generic" → Others (window.__INITIAL_STATE__, __APP_STATE__, etc.)
|
|
88
|
+
*/
|
|
89
|
+
type: "next" | "nuxt" | "gatsby" | "remix" | "sveltekit" | "vue" | "angular" | "tanstack" | "generic";
|
|
90
|
+
/** Extracted JSON object */
|
|
91
|
+
data: unknown;
|
|
92
|
+
}
|
|
93
|
+
/** API call intercepted during browser rendering */
|
|
94
|
+
export interface InterceptedAPI {
|
|
95
|
+
url: string;
|
|
96
|
+
method: string;
|
|
97
|
+
statusCode: number;
|
|
98
|
+
contentType: string;
|
|
99
|
+
data: unknown;
|
|
100
|
+
}
|
|
101
|
+
/** Complete scrape result */
|
|
102
|
+
export interface ScrapeResult {
|
|
103
|
+
/** Final URL (after redirects) */
|
|
104
|
+
url: string;
|
|
105
|
+
/** HTTP status code */
|
|
106
|
+
statusCode: number;
|
|
107
|
+
/** Page title */
|
|
108
|
+
title: string;
|
|
109
|
+
/** Meta description */
|
|
110
|
+
description?: string;
|
|
111
|
+
/** Content as Markdown */
|
|
112
|
+
markdown?: string;
|
|
113
|
+
/** Content as cleaned HTML */
|
|
114
|
+
html?: string;
|
|
115
|
+
/** Content as plain text */
|
|
116
|
+
text?: string;
|
|
117
|
+
/** Tier used to perform the scrape */
|
|
118
|
+
tier: ScrapeTier;
|
|
119
|
+
/** Total time in ms */
|
|
120
|
+
durationMs: number;
|
|
121
|
+
/** Links found on the page */
|
|
122
|
+
links?: string[];
|
|
123
|
+
/** Extracted SSR data (Next.js, Nuxt, etc.) */
|
|
124
|
+
ssrData?: SSRData;
|
|
125
|
+
/** JSON calls intercepted during rendering (Tier 3 only) */
|
|
126
|
+
interceptedAPIs?: InterceptedAPI[];
|
|
127
|
+
/** Failure reason, if any */
|
|
128
|
+
error?: string;
|
|
129
|
+
}
|
|
130
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/scraper/types.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,SAAS,GAAG,SAAS,CAAC;AAExD,+EAA+E;AAC/E,MAAM,WAAW,aAAa;IAC5B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,QAAQ,EAAE,MAAM,CAAC;IACjB,uBAAuB;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,UAAU,EAAE,OAAO,CAAC;CACrB;AAED,uCAAuC;AACvC,MAAM,WAAW,eAAe;IAC9B,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,SAAS,CAAC,EAAE,UAAU,CAAC;IAEvB,yFAAyF;IACzF,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B,iDAAiD;IACjD,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B,4EAA4E;IAC5E,OAAO,CAAC,EAAE,OAAO,CAAC;IAElB,iDAAiD;IACjD,aAAa,CAAC,EAAE;QACd,4CAA4C;QAC5C,QAAQ,CAAC,EAAE,OAAO,CAAC;QACnB;;;;WAIG;QACH,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,qDAAqD;QACrD,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC;CACH;AAED,oCAAoC;AACpC,MAAM,MAAM,aAAa,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,CAAC;AAEzD,+CAA+C;AAC/C,MAAM,WAAW,aAAa;IAC5B,iEAAiE;IACjE,OAAO,CAAC,EAAE,aAAa,EAAE,CAAC;IAC1B,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,sFAAsF;IACtF,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,0BAA0B;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAC;IAGvB,0EAA0E;IAC1E,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,4DAA4D;IAC5D,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,0DAA0D;AAC1D,MAAM,WAAW,OAAO;IACtB;;;;;;;;;;;OAWG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,WAAW,GAAG,KAAK,GAAG,SAAS,GAAG,UAAU,GAAG,SAAS,CAAC;IACtG,2BAA2B;IAC3B,IAAI,EAAE,OAAO,CAAC;CACf;AAED,kEAAkE;AAClE,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,OAAO,CAAC;CACf;AAED,mCAAmC;AACnC,MAAM,WAAW,YAAY;IAC3B,yCAAyC;IACzC,GAAG,EAAE,MAAM,CAAC;IACZ,uBAAuB;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,uBAAuB;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IAGrB,2BAA2B;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,CAAC;IAGd,qCAAqC;IACrC,IAAI,EAAE,UAAU,CAAC;IACj
B,wBAAwB;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,kCAAkC;IAClC,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IAGjB,gDAAgD;IAChD,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,mEAAmE;IACnE,eAAe,CAAC,EAAE,cAAc,EAAE,CAAC;IAEnC,iCAAiC;IACjC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/scraper/types.ts"],"names":[],"mappings":"AAAA,8EAA8E"}
|