auspex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/LICENSE +21 -0
  2. package/dist/agent/actions.d.ts +5 -0
  3. package/dist/agent/actions.d.ts.map +1 -0
  4. package/dist/agent/actions.js +26 -0
  5. package/dist/agent/actions.js.map +1 -0
  6. package/dist/agent/agent.d.ts +12 -0
  7. package/dist/agent/agent.d.ts.map +1 -0
  8. package/dist/agent/agent.js +147 -0
  9. package/dist/agent/agent.js.map +1 -0
  10. package/dist/agent/loop.d.ts +6 -0
  11. package/dist/agent/loop.d.ts.map +1 -0
  12. package/dist/agent/loop.js +165 -0
  13. package/dist/agent/loop.js.map +1 -0
  14. package/dist/agent/report.d.ts +3 -0
  15. package/dist/agent/report.d.ts.map +1 -0
  16. package/dist/agent/report.js +90 -0
  17. package/dist/agent/report.js.map +1 -0
  18. package/dist/browser/executor.d.ts +5 -0
  19. package/dist/browser/executor.d.ts.map +1 -0
  20. package/dist/browser/executor.js +33 -0
  21. package/dist/browser/executor.js.map +1 -0
  22. package/dist/browser/snapshot.d.ts +6 -0
  23. package/dist/browser/snapshot.d.ts.map +1 -0
  24. package/dist/browser/snapshot.js +145 -0
  25. package/dist/browser/snapshot.js.map +1 -0
  26. package/dist/config/defaults.d.ts +10 -0
  27. package/dist/config/defaults.d.ts.map +1 -0
  28. package/dist/config/defaults.js +10 -0
  29. package/dist/config/defaults.js.map +1 -0
  30. package/dist/config/schema.d.ts +59 -0
  31. package/dist/config/schema.d.ts.map +1 -0
  32. package/dist/config/schema.js +23 -0
  33. package/dist/config/schema.js.map +1 -0
  34. package/dist/index.d.ts +7 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +8 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/llm/client.d.ts +23 -0
  39. package/dist/llm/client.d.ts.map +1 -0
  40. package/dist/llm/client.js +51 -0
  41. package/dist/llm/client.js.map +1 -0
  42. package/dist/llm/prompt.d.ts +3 -0
  43. package/dist/llm/prompt.d.ts.map +1 -0
  44. package/dist/llm/prompt.js +36 -0
  45. package/dist/llm/prompt.js.map +1 -0
  46. package/dist/scraper/extractors/content.d.ts +22 -0
  47. package/dist/scraper/extractors/content.d.ts.map +1 -0
  48. package/dist/scraper/extractors/content.js +237 -0
  49. package/dist/scraper/extractors/content.js.map +1 -0
  50. package/dist/scraper/extractors/ssr.d.ts +17 -0
  51. package/dist/scraper/extractors/ssr.d.ts.map +1 -0
  52. package/dist/scraper/extractors/ssr.js +162 -0
  53. package/dist/scraper/extractors/ssr.js.map +1 -0
  54. package/dist/scraper/extractors/to-markdown.d.ts +5 -0
  55. package/dist/scraper/extractors/to-markdown.d.ts.map +1 -0
  56. package/dist/scraper/extractors/to-markdown.js +103 -0
  57. package/dist/scraper/extractors/to-markdown.js.map +1 -0
  58. package/dist/scraper/index.d.ts +27 -0
  59. package/dist/scraper/index.d.ts.map +1 -0
  60. package/dist/scraper/index.js +178 -0
  61. package/dist/scraper/index.js.map +1 -0
  62. package/dist/scraper/tiers/tier1-http.d.ts +5 -0
  63. package/dist/scraper/tiers/tier1-http.d.ts.map +1 -0
  64. package/dist/scraper/tiers/tier1-http.js +120 -0
  65. package/dist/scraper/tiers/tier1-http.js.map +1 -0
  66. package/dist/scraper/tiers/tier2-stealth.d.ts +5 -0
  67. package/dist/scraper/tiers/tier2-stealth.d.ts.map +1 -0
  68. package/dist/scraper/tiers/tier2-stealth.js +106 -0
  69. package/dist/scraper/tiers/tier2-stealth.js.map +1 -0
  70. package/dist/scraper/tiers/tier3-browser.d.ts +10 -0
  71. package/dist/scraper/tiers/tier3-browser.d.ts.map +1 -0
  72. package/dist/scraper/tiers/tier3-browser.js +504 -0
  73. package/dist/scraper/tiers/tier3-browser.js.map +1 -0
  74. package/dist/scraper/types.d.ts +130 -0
  75. package/dist/scraper/types.d.ts.map +1 -0
  76. package/dist/scraper/types.js +3 -0
  77. package/dist/scraper/types.js.map +1 -0
  78. package/dist/security/action-validator.d.ts +83 -0
  79. package/dist/security/action-validator.d.ts.map +1 -0
  80. package/dist/security/action-validator.js +36 -0
  81. package/dist/security/action-validator.js.map +1 -0
  82. package/dist/security/url-validator.d.ts +9 -0
  83. package/dist/security/url-validator.d.ts.map +1 -0
  84. package/dist/security/url-validator.js +69 -0
  85. package/dist/security/url-validator.js.map +1 -0
  86. package/dist/types.d.ts +95 -0
  87. package/dist/types.d.ts.map +1 -0
  88. package/dist/types.js +2 -0
  89. package/dist/types.js.map +1 -0
  90. package/package.json +54 -0
  91. package/readme.md +760 -0
@@ -0,0 +1,504 @@
1
+ import { chromium } from "playwright";
2
+ import { extractContent } from "../extractors/content.js";
3
+ import { htmlToMarkdown } from "../extractors/to-markdown.js";
4
+ // ─── Tier 3: Playwright Chromium (fallback final) ──────────────────────────
5
+ //
6
+ // Acionado quando Tier 1 (HTTP) e Tier 2 (Stealth HTTP) falham.
7
+ // Casos típicos: SPAs complexas, anti-bot pesado (Cloudflare, Akamai, etc.).
8
+ //
9
+ // Estratégias aplicadas:
10
+ // 1. Stealth scripts injetados antes de qualquer script da página
11
+ // 2. Interceptar chamadas de API JSON (melhor para SPAs — dados diretos)
12
+ // 3. Bloquear recursos desnecessários (fonts, media, analytics)
13
+ // 4. Aguardar networkidle ou seletor específico
14
+ // 5. Extrair DOM completo e converter para Markdown
15
+ // ──────────────────────────────────────────────────────────────────────────
// User-Agent string of a real desktop Chrome on Windows (the most common OS,
// hence the least suspicious). Refresh it every 2-3 major Chrome releases.
const CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36";
// Chromium command-line args meant to reduce detectable automation signals.
// Passed unchanged as the `args` launch option on every browser launch.
// NOTE(review): `--no-sandbox` / `--disable-setuid-sandbox` and
// `--disable-features=IsolateOrigins,site-per-process` presumably switch off
// Chromium's process sandbox and site isolation — confirm that is acceptable
// for a browser that loads arbitrary, untrusted pages.
const STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-infobars",
    "--no-first-run",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--no-zygote",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-background-networking",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-default-apps",
    "--disable-domain-reliability",
    "--disable-extensions",
    "--disable-hang-monitor",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-sync",
    "--metrics-recording-only",
    "--safebrowsing-disable-auto-update",
];
// ─── Anti-detection script (injected before any page JS runs) ────────────────
//
// Source text of an IIFE that is registered as a page init script. It targets
// the checks commonly used by anti-bot systems
// (Cloudflare, DataDome, Akamai, PerimeterX, Shape Security):
//
//  1. navigator.webdriver        → getter returns undefined
//  2. navigator.plugins          → fake PluginArray with three Chrome-like plugins
//  3. Navigator properties       → languages, hardwareConcurrency, deviceMemory,
//                                  maxTouchPoints, vendor, platform
//  4. window.chrome              → app, runtime, loadTimes, csi (only if missing)
//  5. Notification.permission    → 'default'
//  6. Permissions API            → 'prompt' for the 'notifications' query
//  7. Canvas fingerprint         → flips one bit of one character of toDataURL output
//  8. WebGL UNMASKED_VENDOR/RENDERER → fixed Intel GPU strings
//  9. screen.colorDepth/pixelDepth   → 24
// 10. Artifact cleanup           → deletes globals left by other automation tools
//
// The comments inside the template literal are part of the injected string and
// are therefore left untouched.
//
// NOTE(review): step 7 XORs the character code at index `length - 2` with 0x01.
// That flip is deterministic (it does not vary between runs, contrary to the
// inline comment) and it can yield characters outside the base64 alphabet
// (e.g. '=' becomes '<', 'A' becomes '@'), which would make the data URL
// undecodable — confirm whether that is intended.
// ──────────────────────────────────────────────────────────────────────────────
const STEALTH_INIT_SCRIPT = /* language=javascript */ `
(function () {
  // ── 1. Remove a flag mais básica de automação ─────────────────────────
  Object.defineProperty(navigator, 'webdriver', {
    get: () => undefined,
    configurable: true,
  });

  // ── 2. Plugins realistas de um Chrome normal ──────────────────────────
  // navigator.plugins.length === 0 é o maior red-flag de headless.
  const makeMime = (type, suffixes, desc, plugin) => {
    const mt = Object.create(MimeType.prototype);
    Object.defineProperties(mt, {
      type: { value: type, enumerable: true },
      suffixes: { value: suffixes, enumerable: true },
      description: { value: desc, enumerable: true },
      enabledPlugin: { value: plugin, enumerable: true },
    });
    return mt;
  };

  const makePlugin = (name, desc, filename, mimeSpecs) => {
    const p = Object.create(Plugin.prototype);
    Object.defineProperties(p, {
      name: { value: name, enumerable: true },
      description: { value: desc, enumerable: true },
      filename: { value: filename, enumerable: true },
      length: { value: mimeSpecs.length },
    });
    mimeSpecs.forEach((spec, i) => {
      const mt = makeMime(spec.type, spec.suffixes, spec.desc, p);
      Object.defineProperty(p, i, { value: mt, enumerable: true });
      Object.defineProperty(p, spec.type, { value: mt });
    });
    p.item = (i) => p[i] ?? null;
    p.namedItem = (n) => p[n] ?? null;
    return p;
  };

  const pdfViewer = makePlugin(
    'PDF Viewer', 'Portable Document Format', 'internal-pdf-viewer',
    [
      { type: 'application/pdf', suffixes: 'pdf', desc: '' },
      { type: 'text/pdf', suffixes: 'pdf', desc: '' },
    ],
  );
  const chromePDF = makePlugin(
    'Chrome PDF Viewer', '', 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
    [{ type: 'application/pdf', suffixes: 'pdf', desc: '' }],
  );
  const nacl = makePlugin(
    'Native Client', '', 'internal-nacl-plugin',
    [
      { type: 'application/x-nacl', suffixes: '', desc: 'Native Client Executable' },
      { type: 'application/x-pnacl', suffixes: '', desc: 'Portable Native Client Executable' },
    ],
  );

  const pluginList = [pdfViewer, chromePDF, nacl];
  const pa = Object.create(PluginArray.prototype);
  Object.defineProperty(pa, 'length', { value: pluginList.length });
  pluginList.forEach((plug, i) => {
    Object.defineProperty(pa, i, { value: plug, enumerable: true });
    Object.defineProperty(pa, plug.name, { value: plug });
  });
  pa.item = (i) => pluginList[i] ?? null;
  pa.namedItem = (n) => pa[n] ?? null;
  pa.refresh = () => {};

  Object.defineProperty(navigator, 'plugins', { get: () => pa });

  // ── 3. Propriedades de hardware realistas ─────────────────────────────
  Object.defineProperty(navigator, 'languages', { get: () => ['pt-BR', 'pt', 'en-US', 'en'] });
  Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
  Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
  Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });
  Object.defineProperty(navigator, 'vendor', { get: () => 'Google Inc.' });
  Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });

  // ── 4. window.chrome — objeto completo como Chrome real ──────────────
  // Automação headless deixa window.chrome undefined ou com .runtime vazio.
  if (!window.chrome) window.chrome = {};

  if (!window.chrome.app) {
    window.chrome.app = {
      isInstalled: false,
      getDetails: () => null,
      getIsInstalled: () => false,
      InstallState: { DISABLED: 'disabled', INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
      RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' },
    };
  }

  if (!window.chrome.runtime) {
    window.chrome.runtime = {
      id: undefined,
      connect: () => { throw Object.assign(new Error('Could not establish connection.'), { message: 'Could not establish connection. Receiving end does not exist.' }); },
      sendMessage: () => { throw Object.assign(new Error('Could not establish connection.'), { message: 'Could not establish connection. Receiving end does not exist.' }); },
      PlatformOs: { MAC: 'mac', WIN: 'win', ANDROID: 'android', CROS: 'cros', LINUX: 'linux', OPENBSD: 'openbsd' },
      PlatformArch: { ARM: 'arm', ARM64: 'arm64', X86_32: 'x86-32', X86_64: 'x86-64', MIPS: 'mips', MIPS64: 'mips64' },
    };
  }

  if (!window.chrome.loadTimes) {
    window.chrome.loadTimes = () => {
      const now = Date.now() / 1000;
      return {
        requestTime: now - 1.5 - Math.random() * 0.5,
        startLoadTime: now - 1.2 - Math.random() * 0.3,
        commitLoadTime: now - 0.8 - Math.random() * 0.2,
        finishDocumentLoadTime: now - 0.3 - Math.random() * 0.1,
        finishLoadTime: now - 0.1 - Math.random() * 0.05,
        firstPaintTime: now - 0.9 - Math.random() * 0.2,
        firstPaintAfterLoadTime: now - 0.05,
        navigationType: 'Other',
        wasFetchedViaSpdy: true,
        wasNpnNegotiated: true,
        npnNegotiatedProtocol: 'h2',
        wasAlternateProtocolAvailable: false,
        connectionInfo: 'h2',
      };
    };
  }

  if (!window.chrome.csi) {
    window.chrome.csi = () => ({
      startE: Date.now() - 1000,
      onloadT: Date.now(),
      pageT: 500 + Math.random() * 1000,
      tran: 15,
    });
  }

  // ── 5. Notification API — headless retorna 'denied', real retorna 'default' ─
  try {
    if (typeof Notification !== 'undefined') {
      Object.defineProperty(Notification, 'permission', { get: () => 'default' });
    }
  } catch (_) {}

  // ── 6. Permission API — 'notifications' deve retornar 'prompt' ────────
  if (navigator.permissions) {
    const origQuery = navigator.permissions.query.bind(navigator.permissions);
    navigator.permissions.query = (params) => {
      if (params && params.name === 'notifications') {
        return Promise.resolve({ state: 'prompt', onchange: null, addEventListener: () => {}, removeEventListener: () => {}, dispatchEvent: () => true });
      }
      return origQuery(params);
    };
  }

  // ── 7. Canvas fingerprint — ruído sutil no último byte do dataURL ─────
  // Técnica: altera 1 bit → output diferente em cada run → quebra fingerprinting.
  // Impacto visual: imperceptível (altera apenas o encoding base64 do último pixel).
  const _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
  HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
    const data = _origToDataURL.call(this, type, quality);
    if (data.length < 12) return data;
    const idx = data.length - 2;
    return data.slice(0, idx) + String.fromCharCode(data.charCodeAt(idx) ^ 0x01) + data.slice(idx + 1);
  };

  // ── 8. WebGL — GPU Intel realista em vez de llvmpipe/SwiftShader ─────
  // llvmpipe/SwiftShader = fingerprint de VM detectado por todos os anti-bots.
  const WEBGL_VENDOR = 'Google Inc. (Intel)';
  const WEBGL_RENDERER = 'ANGLE (Intel, Intel(R) UHD Graphics 620 Direct3D11 vs_5_0 ps_5_0, D3D11)';

  const patchWebGL = (Ctx) => {
    if (!Ctx) return;
    const orig = Ctx.prototype.getParameter;
    Ctx.prototype.getParameter = function (param) {
      if (param === 37445) return WEBGL_VENDOR; // UNMASKED_VENDOR_WEBGL
      if (param === 37446) return WEBGL_RENDERER; // UNMASKED_RENDERER_WEBGL
      return orig.call(this, param);
    };
  };

  patchWebGL(typeof WebGLRenderingContext !== 'undefined' ? WebGLRenderingContext : null);
  patchWebGL(typeof WebGL2RenderingContext !== 'undefined' ? WebGL2RenderingContext : null);

  // ── 9. Screen depth realista ──────────────────────────────────────────
  try {
    Object.defineProperty(screen, 'colorDepth', { get: () => 24 });
    Object.defineProperty(screen, 'pixelDepth', { get: () => 24 });
  } catch (_) {}

  // ── 10. Remove artefatos de outras ferramentas de automação ──────────
  const automationVars = ['__nightmare', '_phantom', 'callPhantom',
    '__selenium_evaluate', '__webdriver_evaluate', '_Selenium_IDE_Recorder',
    '__webdriver_script_fn', '__lastWatirAlert', '__lastWatirConfirm'];
  automationVars.forEach(v => { try { delete window[v]; } catch (_) {} });

})();
`;
// Resource types aborted by the request router to save bandwidth and time.
// "image" is included because the scraper extracts text/markdown and never
// needs the page rendered visually.
const BLOCKED_RESOURCE_TYPES = new Set(["font", "media", "image"]);
// Analytics / tracking URL fragments to block.
// Entries are matched as plain substrings of the full request URL
// (`url.includes(pattern)`), not as hostnames, so e.g. "segment.com" also
// matches any URL that merely contains that text.
// NOTE(review): "facebook.net/en_US/fbevents.js" looks redundant — that script
// is presumably served from connect.facebook.net, which is listed separately.
const BLOCKED_URL_PATTERNS = [
    "google-analytics.com",
    "googletagmanager.com",
    "facebook.net/en_US/fbevents.js",
    "connect.facebook.net",
    "hotjar.com",
    "fullstory.com",
    "segment.com",
    "mixpanel.com",
    "amplitude.com",
    "sentry.io",
    "clarity.ms",
    "doubleclick.net",
    "adnxs.com",
    "criteo.com",
    "taboola.com",
    "outbrain.com",
];
/**
 * Tier 3 scraper: loads a page in a Playwright-driven Chromium browser and
 * returns its extracted content.
 *
 * A single browser is launched lazily and reused across `scrape()` calls.
 * Every call runs in its own browser context, which is closed when the call
 * ends. Call `close()` to shut the browser down.
 */
export class Tier3Browser {
    /** Shared browser instance, or null until the first launch / after close(). */
    browser = null;
    /** Launch settings read by getBrowser(): `headless`, `channel`, `executablePath`. */
    browserConfig;
    /** In-flight launch promise, shared so concurrent callers start only one browser. */
    launching = null;

    /**
     * @param {object} [browserConfig] Optional launch settings
     *   (`headless`, `channel`, `executablePath`).
     */
    constructor(browserConfig = {}) {
        this.browserConfig = browserConfig;
    }

    /**
     * Tells whether a URL's path ends in a static-asset extension.
     *
     * Only the path is inspected, so `/products.json`, query strings and host
     * names containing ".js" are not mistaken for assets.
     *
     * @param {string} url Absolute or relative URL.
     * @returns {boolean} true for .js/.css/image/font paths.
     */
    static isStaticAssetUrl(url) {
        let pathname;
        try {
            pathname = new URL(url).pathname;
        }
        catch {
            // Not an absolute URL: drop the query string / fragment by hand.
            pathname = String(url).split(/[?#]/, 1)[0];
        }
        return /\.(?:js|css|png|jpe?g|gif|svg|woff2?)$/i.test(pathname);
    }

    // ── Browser lifecycle (reusable singleton) ─────────────────────────────

    /**
     * Returns the connected browser, launching one if necessary.
     * Concurrent callers share the same in-flight launch, so a burst of
     * scrape() calls never starts (and leaks) more than one browser.
     *
     * @returns {Promise<object>} The Playwright browser instance.
     */
    async getBrowser() {
        if (this.browser?.isConnected())
            return this.browser;
        this.launching ??= this.launchBrowser().finally(() => {
            this.launching = null;
        });
        return this.launching;
    }

    /**
     * Launches a browser and stores it in `this.browser`.
     *
     * Executable resolution order:
     *   1. explicit `executablePath`
     *   2. configured `channel`
     *   3. otherwise try the "chrome" channel and, if that launch throws,
     *      launch again without a channel.
     *
     * @returns {Promise<object>} The launched browser.
     */
    async launchBrowser() {
        const launchOptions = {
            headless: this.browserConfig.headless ?? true,
            args: STEALTH_ARGS,
        };
        const { executablePath, channel } = this.browserConfig;
        if (executablePath) {
            launchOptions.executablePath = executablePath;
        }
        else if (channel) {
            launchOptions.channel = channel;
        }
        else {
            try {
                this.browser = await chromium.launch({ ...launchOptions, channel: "chrome" });
                return this.browser;
            }
            catch {
                // The "chrome" channel could not be launched: fall through to
                // the channel-less launch below.
            }
        }
        this.browser = await chromium.launch(launchOptions);
        return this.browser;
    }

    // ── Main scrape ────────────────────────────────────────────────────────

    /**
     * Loads `url` in a fresh browser context and extracts its content.
     *
     * Options read here: `headers`, `interceptAPIs` (default on), `timeout`
     * (default 30_000 ms, navigation is capped at 30 s), `waitForSelector`,
     * `formats` (default ["markdown", "text"]) and `onlyMainContent`
     * (default true).
     *
     * @param {string} url Page to load.
     * @param {object} [options] Per-call options, see above.
     * @returns {Promise<object>} Result with `url`, `statusCode`, `title`,
     *   `description`, `tier: "browser"`, `durationMs`, optional `links` and
     *   `interceptedAPIs`, plus `markdown` / `html` / `text` per `formats`.
     * @throws {Error} When both navigation attempts throw; the last
     *   navigation error is attached as `cause`.
     */
    async scrape(url, options = {}) {
        const startTime = Date.now();
        const browser = await this.getBrowser();
        let context = null;
        try {
            context = await browser.newContext({
                userAgent: CHROME_UA,
                viewport: { width: 1920, height: 1080 },
                locale: "pt-BR",
                timezoneId: "America/Sao_Paulo",
                extraHTTPHeaders: {
                    "accept-language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
                    // Caller-supplied headers win over the default above.
                    ...(options.headers ?? {}),
                },
                javaScriptEnabled: true,
            });
            const page = await context.newPage();

            // Register the stealth script so it runs before any page script.
            await page.addInitScript(STEALTH_INIT_SCRIPT);

            // Abort unneeded resources: blocked resource types always, and
            // scripts whose URL contains an analytics/tracking pattern.
            await page.route("**/*", (route) => {
                const request = route.request();
                const resourceType = request.resourceType();
                if (BLOCKED_RESOURCE_TYPES.has(resourceType)) {
                    return route.abort();
                }
                const requestUrl = request.url();
                if (resourceType === "script" &&
                    BLOCKED_URL_PATTERNS.some((pattern) => requestUrl.includes(pattern))) {
                    return route.abort();
                }
                return route.continue();
            });

            // Collect JSON API responses (the most direct data source on SPAs).
            const interceptedAPIs = [];
            if (options.interceptAPIs !== false) {
                page.on("response", async (response) => {
                    try {
                        const headers = response.headers();
                        const contentType = headers["content-type"] ?? "";
                        if (!contentType.includes("application/json"))
                            return;
                        const apiUrl = response.url();
                        // Skip analytics endpoints and static assets.
                        if (BLOCKED_URL_PATTERNS.some((pattern) => apiUrl.includes(pattern)))
                            return;
                        if (Tier3Browser.isStaticAssetUrl(apiUrl))
                            return;
                        // Skip bodies declared larger than 500 kB. A missing
                        // content-length header counts as 0 and is let through.
                        const contentLength = Number.parseInt(headers["content-length"] ?? "0", 10);
                        if (contentLength > 500_000)
                            return;
                        const data = await response.json().catch(() => null);
                        if (!data)
                            return;
                        interceptedAPIs.push({
                            url: apiUrl,
                            method: response.request().method(),
                            statusCode: response.status(),
                            contentType,
                            data,
                        });
                    }
                    catch {
                        // Body already consumed or unparsable: capture is best-effort.
                    }
                });
            }

            // Auto-dismiss alert/confirm/prompt dialogs so they cannot stall navigation.
            page.on("dialog", (dialog) => dialog.dismiss().catch(() => { }));

            // Navigate, retrying once (after 1.5 s) when goto() throws, e.g. on
            // a timeout or network error. An HTTP error status such as 403/503
            // does not throw and therefore does not trigger the retry.
            const timeout = options.timeout ?? 30_000;
            let statusCode = 200;
            let lastNavError = null;
            for (let attempt = 1; attempt <= 2; attempt++) {
                try {
                    const navResponse = await page.goto(url, {
                        waitUntil: "domcontentloaded",
                        timeout: Math.min(timeout, 30_000),
                    });
                    statusCode = navResponse?.status() ?? 200;
                    lastNavError = null;
                    break;
                }
                catch (navErr) {
                    lastNavError = navErr instanceof Error ? navErr : new Error(String(navErr));
                    if (attempt < 2) {
                        await page.waitForTimeout(1_500).catch(() => { });
                    }
                }
            }
            if (lastNavError) {
                throw new Error(`Tier3 Browser: falha na navegação — ${lastNavError.message}`, { cause: lastNavError });
            }

            // Wait for dynamic content. A networkidle timeout is acceptable:
            // extraction proceeds with whatever has loaded.
            await page
                .waitForLoadState("networkidle", {
                    timeout: Math.min(timeout * 0.5, 15_000),
                })
                .catch(() => { });

            // Optional caller-supplied selector (e.g. '.product-list', '#app').
            // If it never becomes visible we continue anyway.
            if (options.waitForSelector) {
                await page
                    .waitForSelector(options.waitForSelector, {
                        state: "visible",
                        timeout: 10_000,
                    })
                    .catch(() => { });
            }

            // Scroll through the page in roughly six steps to trigger
            // lazy-loaded content, then jump back to the top. The callback
            // below runs inside the page, not in Node.
            await page
                .evaluate(() => {
                    return new Promise((resolve) => {
                        const totalHeight = document.body.scrollHeight;
                        if (totalHeight <= window.innerHeight) {
                            resolve();
                            return;
                        }
                        const step = Math.max(Math.floor(totalHeight / 6), 300);
                        let scrolled = 0;
                        const tick = () => {
                            scrolled += step;
                            window.scrollTo({ top: scrolled, behavior: "smooth" });
                            if (scrolled < totalHeight) {
                                // Randomised 120-249 ms pause between steps gives
                                // lazy loaders time to react.
                                setTimeout(tick, 120 + Math.floor(Math.random() * 130));
                            }
                            else {
                                window.scrollTo({ top: 0, behavior: "instant" });
                                resolve();
                            }
                        };
                        setTimeout(tick, 400);
                    });
                })
                .catch(() => {
                    // Scrolling failed (e.g. no document.body): not fatal.
                });

            // ── Content extraction ────────────────────────────────────────
            const [html, pageTitle] = await Promise.all([
                page.content(),
                page.title(),
            ]);
            const finalUrl = page.url();
            const formats = options.formats ?? ["markdown", "text"];
            const extracted = extractContent(html, options.onlyMainContent ?? true, finalUrl);
            const result = {
                url: finalUrl,
                statusCode,
                title: pageTitle || extracted.title,
                description: extracted.description || undefined,
                tier: "browser",
                durationMs: Date.now() - startTime,
                links: extracted.links.length > 0 ? extracted.links : undefined,
                interceptedAPIs: interceptedAPIs.length > 0 ? interceptedAPIs : undefined,
            };
            if (formats.includes("markdown"))
                result.markdown = htmlToMarkdown(extracted.html);
            if (formats.includes("html"))
                result.html = extracted.html;
            if (formats.includes("text"))
                result.text = extracted.text;
            return result;
        }
        finally {
            // Always release the per-call context, even when scraping failed.
            await context?.close().catch(() => { });
        }
    }

    // ── Shut the browser down ──────────────────────────────────────────────

    /**
     * Closes the shared browser, if any. Errors while closing are ignored.
     * A launch that is still in flight is awaited first so the browser it
     * produces is closed instead of leaked.
     *
     * @returns {Promise<void>}
     */
    async close() {
        await this.launching?.catch(() => { });
        if (this.browser) {
            await this.browser.close().catch(() => { });
            this.browser = null;
        }
    }
}
504
+ //# sourceMappingURL=tier3-browser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tier3-browser.js","sourceRoot":"","sources":["../../../src/scraper/tiers/tier3-browser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAqC,MAAM,YAAY,CAAC;AAOzE,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAE9D,8EAA8E;AAC9E,EAAE;AACF,gEAAgE;AAChE,6EAA6E;AAC7E,EAAE;AACF,yBAAyB;AACzB,oEAAoE;AACpE,2EAA2E;AAC3E,kEAAkE;AAClE,kDAAkD;AAClD,sDAAsD;AACtD,6EAA6E;AAE7E,2EAA2E;AAC3E,gDAAgD;AAChD,MAAM,SAAS,GACb,iHAAiH,CAAC;AAEpH,mDAAmD;AACnD,MAAM,YAAY,GAAG;IACnB,+CAA+C;IAC/C,oDAAoD;IACpD,oBAAoB;IACpB,gBAAgB;IAChB,cAAc;IACd,0BAA0B;IAC1B,yBAAyB;IACzB,iCAAiC;IACjC,aAAa;IACb,eAAe;IACf,yBAAyB;IACzB,iCAAiC;IACjC,0CAA0C;IAC1C,4BAA4B;IAC5B,wBAAwB;IACxB,8BAA8B;IAC9B,sBAAsB;IACtB,wBAAwB;IACxB,0BAA0B;IAC1B,4BAA4B;IAC5B,gBAAgB;IAChB,0BAA0B;IAC1B,oCAAoC;CACrC,CAAC;AAEF,iFAAiF;AACjF,EAAE;AACF,6DAA6D;AAC7D,8DAA8D;AAC9D,EAAE;AACF,8DAA8D;AAC9D,yEAAyE;AACzE,2FAA2F;AAC3F,oFAAoF;AACpF,2EAA2E;AAC3E,iEAAiE;AACjE,yFAAyF;AACzF,2FAA2F;AAC3F,6CAA6C;AAC7C,6FAA6F;AAC7F,iFAAiF;AACjF,MAAM,mBAAmB,GAAG,yBAAyB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAiMrD,CAAC;AAEF,uDAAuD;AACvD,4EAA4E;AAC5E,MAAM,sBAAsB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;AAEnE,+CAA+C;AAC/C,MAAM,oBAAoB,GAAG;IAC3B,sBAAsB;IACtB,sBAAsB;IACtB,gCAAgC;IAChC,sBAAsB;IACtB,YAAY;IACZ,eAAe;IACf,aAAa;IACb,cAAc;IACd,eAAe;IACf,WAAW;IACX,YAAY;IACZ,iBAAiB;IACjB,WAAW;IACX,YAAY;IACZ,aAAa;IACb,cAAc;CACf,CAAC;AAEF,MAAM,OAAO,YAAY;IACf,OAAO,GAAmB,IAAI,CAAC;IACtB,aAAa,CAAgD;IAE9E,YAAY,gBAAkD,EAAE;QAC9D,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;IACrC,CAAC;IAED,0EAA0E;IAElE,KAAK,CAAC,UAAU;QACtB,IAAI,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE;YAAE,OAAO,IAAI,CAAC,OAAO,CAAC;QAErD,MAAM,aAAa,GAA0C;YAC3D,QAAQ,EAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,IAAI,IAAI;YAC7C,IAAI,EAAE,YAAY;SACnB,CAAC;QAEF,yCAAyC;QACzC,uDAAuD;QACvD,sDAAsD;QACtD,oEAAoE;QACpE,IAAI,IAAI,CAAC,aAAa,CA
AC,cAAc,EAAE,CAAC;YACtC,aAAa,CAAC,cAAc,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC;QACnE,CAAC;aAAM,IAAI,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;YACtC,aAAa,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,0EAA0E;YAC1E,IAAI,CAAC;gBACH,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,GAAG,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;gBAC9E,OAAO,IAAI,CAAC,OAAO,CAAC;YACtB,CAAC;YAAC,MAAM,CAAC;gBACP,kEAAkE;YACpE,CAAC;QACH,CAAC;QAED,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;QACpD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED,0EAA0E;IAE1E,KAAK,CAAC,MAAM,CAAC,GAAW,EAAE,UAAyB,EAAE;QACnD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QAExC,IAAI,OAAO,GAA0B,IAAI,CAAC;QAE1C,IAAI,CAAC;YACH,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;gBACjC,SAAS,EAAE,SAAS;gBACpB,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;gBACvC,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,mBAAmB;gBAC/B,gBAAgB,EAAE;oBAChB,iBAAiB,EAAE,qCAAqC;oBACxD,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC;iBAC3B;gBACD,iBAAiB,EAAE,IAAI;gBACvB,4EAA4E;gBAC5E,6DAA6D;aAC9D,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAErC,qEAAqE;YACrE,MAAM,IAAI,CAAC,aAAa,CAAC,mBAAmB,CAAC,CAAC;YAE9C,qEAAqE;YACrE,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;gBACjC,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;gBAC5B,MAAM,IAAI,GAAG,GAAG,CAAC,YAAY,EAAE,CAAC;gBAChC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC;gBAEzB,IAAI,sBAAsB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACrC,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;gBACvB,CAAC;gBACD,IACE,IAAI,KAAK,QAAQ;oBACjB,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EACpD,CAAC;oBACD,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;gBACvB,CAAC;gBAED,OAAO,KAAK,CAAC,QAAQ,EAAE,CAAC;YAC1B,CAAC,CAAC,CAAC;YAEH,qEAAqE;YACrE,MAAM,eAAe,GAAqB,EAAE,CAAC;YAC7C,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK,CAAC;YAExD,IAAI,eAAe,EAAE,CAAC;gBACpB,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;oBACrC,IAAI,CAAC;wBACH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,
CAAC;wBAC7D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,kBAAkB,CAAC;4BAAE,OAAO;wBAEtD,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;wBAC9B,qCAAqC;wBACrC,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;4BAAE,OAAO;wBACjE,IAAI,iCAAiC,CAAC,IAAI,CAAC,MAAM,CAAC;4BAAE,OAAO;wBAE3D,uEAAuE;wBACvE,MAAM,aAAa,GAAG,QAAQ,CAC5B,QAAQ,CAAC,OAAO,EAAE,CAAC,gBAAgB,CAAC,IAAI,GAAG,EAC3C,EAAE,CACH,CAAC;wBACF,IAAI,aAAa,GAAG,OAAO;4BAAE,OAAO;wBAEpC,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC;wBACrD,IAAI,CAAC,IAAI;4BAAE,OAAO;wBAElB,eAAe,CAAC,IAAI,CAAC;4BACnB,GAAG,EAAE,MAAM;4BACX,MAAM,EAAE,QAAQ,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE;4BACnC,UAAU,EAAE,QAAQ,CAAC,MAAM,EAAE;4BAC7B,WAAW;4BACX,IAAI;yBACL,CAAC,CAAC;oBACL,CAAC;oBAAC,MAAM,CAAC;wBACP,mEAAmE;oBACrE,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC;YAED,0EAA0E;YAC1E,IAAI,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC;YAEhE,sEAAsE;YACtE,6EAA6E;YAC7E,sEAAsE;YACtE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,MAAM,CAAC;YAC1C,IAAI,UAAU,GAAG,GAAG,CAAC;YACrB,IAAI,YAAY,GAAiB,IAAI,CAAC;YAEtC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,OAAO,EAAE,EAAE,CAAC;gBAC9C,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;wBACvC,SAAS,EAAE,kBAAkB;wBAC7B,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC;qBACnC,CAAC,CAAC;oBACH,UAAU,GAAG,WAAW,EAAE,MAAM,EAAE,IAAI,GAAG,CAAC;oBAC1C,YAAY,GAAG,IAAI,CAAC;oBACpB,MAAM,CAAC,wBAAwB;gBACjC,CAAC;gBAAC,OAAO,MAAM,EAAE,CAAC;oBAChB,YAAY,GAAG,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;oBAC5E,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;oBACnD,CAAC;gBACH,CAAC;YACH,CAAC;YAED,IAAI,YAAY,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,uCAAuC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC;YACjF,CAAC;YAED,qEAAqE;YACrE,sDAAsD;YACtD,MAAM,IAAI;iBACP,gBAAgB,CAAC,aAAa,EAAE;gBAC/B,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,EAAE,
MAAM,CAAC;aACzC,CAAC;iBACD,KAAK,CAAC,GAAG,EAAE;gBACV,qDAAqD;YACvD,CAAC,CAAC,CAAC;YAEL,8DAA8D;YAC9D,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,MAAM,IAAI;qBACP,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;oBACxC,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,MAAM;iBAChB,CAAC;qBACD,KAAK,CAAC,GAAG,EAAE;oBACV,kDAAkD;gBACpD,CAAC,CAAC,CAAC;YACP,CAAC;YAED,qEAAqE;YACrE,uEAAuE;YACvE,2EAA2E;YAC3E,kEAAkE;YAClE,MAAM,IAAI;iBACP,QAAQ,CAAC,GAAG,EAAE;gBACb,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;oBACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC;oBAC/C,IAAI,WAAW,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;wBACtC,OAAO,EAAE,CAAC;wBACV,OAAO;oBACT,CAAC;oBAED,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;oBACxD,IAAI,QAAQ,GAAG,CAAC,CAAC;oBAEjB,MAAM,IAAI,GAAG,GAAG,EAAE;wBAChB,QAAQ,IAAI,IAAI,CAAC;wBACjB,MAAM,CAAC,QAAQ,CAAC,EAAE,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,CAAC;wBACvD,IAAI,QAAQ,GAAG,WAAW,EAAE,CAAC;4BAC3B,8DAA8D;4BAC9D,uDAAuD;4BACvD,UAAU,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC;wBAC1D,CAAC;6BAAM,CAAC;4BACN,MAAM,CAAC,QAAQ,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;4BACjD,OAAO,EAAE,CAAC;wBACZ,CAAC;oBACH,CAAC,CAAC;oBAEF,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBACxB,CAAC,CAAC,CAAC;YACL,CAAC,CAAC;iBACD,KAAK,CAAC,GAAG,EAAE;gBACV,2DAA2D;YAC7D,CAAC,CAAC,CAAC;YAEL,qEAAqE;YACrE,MAAM,CAAC,IAAI,EAAE,SAAS,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAC1C,IAAI,CAAC,OAAO,EAAE;gBACd,IAAI,CAAC,KAAK,EAAE;aACb,CAAC,CAAC;YAEH,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC5B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;YACxD,MAAM,SAAS,GAAG,cAAc,CAC9B,IAAI,EACJ,OAAO,CAAC,eAAe,IAAI,IAAI,EAC/B,QAAQ,CACT,CAAC;YAEF,MAAM,MAAM,GAAiB;gBAC3B,GAAG,EAAE,QAAQ;gBACb,UAAU;gBACV,KAAK,EAAE,SAAS,IAAI,SAAS,CAAC,KAAK;gBACnC,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS;gBAC/C,IAAI,EAAE,SAAS;gBACf,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;gBAClC,KAAK,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBAC/D,eAAe,EACb,eAA
e,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,SAAS;aAC3D,CAAC;YAEF,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC;gBAAE,MAAM,CAAC,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACnF,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;YAC/D,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;YAE/D,OAAO,MAAM,CAAC;QAChB,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,EAAE,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IAED,0EAA0E;IAE1E,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAC3C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,130 @@
1
+ /**
2
+ * Which tier was used to perform the scrape:
3
+ * - "http" → Tier 1: native fetch + Cheerio (no browser, ~100-500ms)
4
+ * - "stealth" → Tier 2: got-scraping with TLS fingerprint spoofing (~200-800ms)
5
+ * - "browser" → Tier 3: Playwright Chromium (full browser, final fallback)
6
+ */
7
+ export type ScrapeTier = "http" | "stealth" | "browser";
8
+ /** Internal result returned by each tier (not exposed to the end user) */
9
+ export interface TierRawResult {
10
+ /** Full HTML of the page */
11
+ html: string;
12
+ /** Final URL after redirects */
13
+ finalUrl: string;
14
+ /** HTTP status code */
15
+ statusCode: number;
16
+ /**
17
+ * true = content is sufficient (no loading screen / anti-bot).
18
+ * false = the next tier in the cascade should be tried.
19
+ */
20
+ sufficient: boolean;
21
+ }
22
+ /** Global Firecrawl configuration */
23
+ export interface FirecrawlConfig {
24
+ /** Default timeout in ms. Default: 30_000 */
25
+ timeout?: number;
26
+ /**
27
+ * Force a specific tier for every scrape made by this instance.
28
+ * Can be overridden by an option passed in the `scrape()` call.
29
+ */
30
+ forceTier?: ScrapeTier;
31
+ /** Allowed domains (anti-SSRF whitelist). Any URL outside the list is rejected. */
32
+ allowedDomains?: string[];
33
+ /** Blocked domains (anti-SSRF blacklist). */
34
+ blockedDomains?: string[];
35
+ /** Verbose logging showing which tier was used and why. Default: false */
36
+ verbose?: boolean;
37
+ /** Settings for the Chromium browser (Tier 3) */
38
+ browserConfig?: {
39
+ /** Run in headless mode. Default: true */
40
+ headless?: boolean;
41
+ /**
42
+ * Channel of the browser installed on the system.
43
+ * Examples: 'chrome', 'chromium', 'msedge'.
44
+ * If omitted, tries 'chrome' (system) and then playwright-chromium.
45
+ */
46
+ channel?: string;
47
+ /** Explicit path to the browser executable */
48
+ executablePath?: string;
49
+ };
50
+ }
51
+ /** Output formats for the content */
52
+ export type ContentFormat = "markdown" | "html" | "text";
53
+ /** Scraping options supplied by the user */
54
+ export interface ScrapeOptions {
55
+ /** Desired output formats. Default: ['markdown', 'text'] */
56
+ formats?: ContentFormat[];
57
+ /** Timeout in ms. Default: 30_000 */
58
+ timeout?: number;
59
+ /** Only try to extract the main content (removes nav, footer, ads). Default: true */
60
+ onlyMainContent?: boolean;
61
+ /** Extra HTTP headers */
62
+ headers?: Record<string, string>;
63
+ /**
64
+ * Force a specific tier (bypasses the automatic cascade).
65
+ * - "http" → HTTP + Cheerio only
66
+ * - "stealth" → skips HTTP, uses got-scraping directly
67
+ * - "browser" → goes straight to Playwright Chromium
68
+ */
69
+ forceTier?: ScrapeTier;
70
+ /** Wait for this CSS selector to appear and be visible before extracting */
71
+ waitForSelector?: string;
72
+ /** Intercept JSON responses from the APIs called by the SPA */
73
+ interceptAPIs?: boolean;
74
+ }
75
+ /** SSR data embedded in <script> tags by the framework */
76
+ export interface SSRData {
77
+ /**
78
+ * Framework that produced the data:
79
+ * - "next" → Next.js (#__NEXT_DATA__)
80
+ * - "nuxt" → Nuxt 2/3 (window.__NUXT__)
81
+ * - "gatsby" → Gatsby (window.___gatsby)
82
+ * - "remix" → Remix (window.__remixContext)
83
+ * - "sveltekit" → SvelteKit (script[data-sveltekit-fetched] or window.__SVELTEKIT__)
84
+ * - "vue" → Vue SSR (window.__VUE_SSR_CONTEXT__ / window.__VUE_STORE__)
85
+ * - "angular" → Angular Universal (script#ng-state)
86
+ * - "tanstack" → TanStack Router / Start (window.__TSR_DEHYDRATED__)
87
+ * - "generic" → Others (window.__INITIAL_STATE__, __APP_STATE__, etc.)
88
+ */
89
+ type: "next" | "nuxt" | "gatsby" | "remix" | "sveltekit" | "vue" | "angular" | "tanstack" | "generic";
90
+ /** Extracted JSON object */
91
+ data: unknown;
92
+ }
93
+ /** API call intercepted while the browser was rendering the page */
94
+ export interface InterceptedAPI {
95
+ url: string;
96
+ method: string;
97
+ statusCode: number;
98
+ contentType: string;
99
+ data: unknown;
100
+ }
101
+ /** Complete scrape result */
102
+ export interface ScrapeResult {
103
+ /** Final URL (after redirects) */
104
+ url: string;
105
+ /** HTTP status code */
106
+ statusCode: number;
107
+ /** Page title */
108
+ title: string;
109
+ /** Meta description */
110
+ description?: string;
111
+ /** Content as Markdown */
112
+ markdown?: string;
113
+ /** Content as cleaned HTML */
114
+ html?: string;
115
+ /** Content as plain text */
116
+ text?: string;
117
+ /** Tier used to perform the scrape */
118
+ tier: ScrapeTier;
119
+ /** Total time in ms */
120
+ durationMs: number;
121
+ /** Links found on the page */
122
+ links?: string[];
123
+ /** Extracted SSR data (Next.js, Nuxt, etc.) */
124
+ ssrData?: SSRData;
125
+ /** JSON calls intercepted during rendering (Tier 3 only) */
126
+ interceptedAPIs?: InterceptedAPI[];
127
+ /** Failure reason, if any */
128
+ error?: string;
129
+ }
130
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/scraper/types.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,SAAS,GAAG,SAAS,CAAC;AAExD,+EAA+E;AAC/E,MAAM,WAAW,aAAa;IAC5B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,QAAQ,EAAE,MAAM,CAAC;IACjB,uBAAuB;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,UAAU,EAAE,OAAO,CAAC;CACrB;AAED,uCAAuC;AACvC,MAAM,WAAW,eAAe;IAC9B,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,SAAS,CAAC,EAAE,UAAU,CAAC;IAEvB,yFAAyF;IACzF,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B,iDAAiD;IACjD,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B,4EAA4E;IAC5E,OAAO,CAAC,EAAE,OAAO,CAAC;IAElB,iDAAiD;IACjD,aAAa,CAAC,EAAE;QACd,4CAA4C;QAC5C,QAAQ,CAAC,EAAE,OAAO,CAAC;QACnB;;;;WAIG;QACH,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,qDAAqD;QACrD,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC;CACH;AAED,oCAAoC;AACpC,MAAM,MAAM,aAAa,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,CAAC;AAEzD,+CAA+C;AAC/C,MAAM,WAAW,aAAa;IAC5B,iEAAiE;IACjE,OAAO,CAAC,EAAE,aAAa,EAAE,CAAC;IAC1B,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,sFAAsF;IACtF,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,0BAA0B;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAC;IAGvB,0EAA0E;IAC1E,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,4DAA4D;IAC5D,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,0DAA0D;AAC1D,MAAM,WAAW,OAAO;IACtB;;;;;;;;;;;OAWG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,WAAW,GAAG,KAAK,GAAG,SAAS,GAAG,UAAU,GAAG,SAAS,CAAC;IACtG,2BAA2B;IAC3B,IAAI,EAAE,OAAO,CAAC;CACf;AAED,kEAAkE;AAClE,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,OAAO,CAAC;CACf;AAED,mCAAmC;AACnC,MAAM,WAAW,YAAY;IAC3B,yCAAyC;IACzC,GAAG,EAAE,MAAM,CAAC;IACZ,uBAAuB;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,uBAAuB;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IAGrB,2BAA2B;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,CAAC;IAGd,qCAAqC;IACrC,IAAI,EAAE,UAAU,CAAC;IA
CjB,wBAAwB;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,kCAAkC;IAClC,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IAGjB,gDAAgD;IAChD,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,mEAAmE;IACnE,eAAe,CAAC,EAAE,cAAc,EAAE,CAAC;IAEnC,iCAAiC;IACjC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB"}
@@ -0,0 +1,3 @@
1
+ // ─── Firecrawl types ───────────────────────────────────────────────────────
2
+ export {};
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/scraper/types.ts"],"names":[],"mappings":"AAAA,8EAA8E"}