auspex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/agent/actions.d.ts +5 -0
- package/dist/agent/actions.d.ts.map +1 -0
- package/dist/agent/actions.js +26 -0
- package/dist/agent/actions.js.map +1 -0
- package/dist/agent/agent.d.ts +12 -0
- package/dist/agent/agent.d.ts.map +1 -0
- package/dist/agent/agent.js +147 -0
- package/dist/agent/agent.js.map +1 -0
- package/dist/agent/loop.d.ts +6 -0
- package/dist/agent/loop.d.ts.map +1 -0
- package/dist/agent/loop.js +165 -0
- package/dist/agent/loop.js.map +1 -0
- package/dist/agent/report.d.ts +3 -0
- package/dist/agent/report.d.ts.map +1 -0
- package/dist/agent/report.js +90 -0
- package/dist/agent/report.js.map +1 -0
- package/dist/browser/executor.d.ts +5 -0
- package/dist/browser/executor.d.ts.map +1 -0
- package/dist/browser/executor.js +33 -0
- package/dist/browser/executor.js.map +1 -0
- package/dist/browser/snapshot.d.ts +6 -0
- package/dist/browser/snapshot.d.ts.map +1 -0
- package/dist/browser/snapshot.js +145 -0
- package/dist/browser/snapshot.js.map +1 -0
- package/dist/config/defaults.d.ts +10 -0
- package/dist/config/defaults.d.ts.map +1 -0
- package/dist/config/defaults.js +10 -0
- package/dist/config/defaults.js.map +1 -0
- package/dist/config/schema.d.ts +59 -0
- package/dist/config/schema.d.ts.map +1 -0
- package/dist/config/schema.js +23 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/client.d.ts +23 -0
- package/dist/llm/client.d.ts.map +1 -0
- package/dist/llm/client.js +51 -0
- package/dist/llm/client.js.map +1 -0
- package/dist/llm/prompt.d.ts +3 -0
- package/dist/llm/prompt.d.ts.map +1 -0
- package/dist/llm/prompt.js +36 -0
- package/dist/llm/prompt.js.map +1 -0
- package/dist/scraper/extractors/content.d.ts +22 -0
- package/dist/scraper/extractors/content.d.ts.map +1 -0
- package/dist/scraper/extractors/content.js +237 -0
- package/dist/scraper/extractors/content.js.map +1 -0
- package/dist/scraper/extractors/ssr.d.ts +17 -0
- package/dist/scraper/extractors/ssr.d.ts.map +1 -0
- package/dist/scraper/extractors/ssr.js +162 -0
- package/dist/scraper/extractors/ssr.js.map +1 -0
- package/dist/scraper/extractors/to-markdown.d.ts +5 -0
- package/dist/scraper/extractors/to-markdown.d.ts.map +1 -0
- package/dist/scraper/extractors/to-markdown.js +103 -0
- package/dist/scraper/extractors/to-markdown.js.map +1 -0
- package/dist/scraper/index.d.ts +27 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +178 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/scraper/tiers/tier1-http.d.ts +5 -0
- package/dist/scraper/tiers/tier1-http.d.ts.map +1 -0
- package/dist/scraper/tiers/tier1-http.js +120 -0
- package/dist/scraper/tiers/tier1-http.js.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts +5 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.js +106 -0
- package/dist/scraper/tiers/tier2-stealth.js.map +1 -0
- package/dist/scraper/tiers/tier3-browser.d.ts +10 -0
- package/dist/scraper/tiers/tier3-browser.d.ts.map +1 -0
- package/dist/scraper/tiers/tier3-browser.js +504 -0
- package/dist/scraper/tiers/tier3-browser.js.map +1 -0
- package/dist/scraper/types.d.ts +130 -0
- package/dist/scraper/types.d.ts.map +1 -0
- package/dist/scraper/types.js +3 -0
- package/dist/scraper/types.js.map +1 -0
- package/dist/security/action-validator.d.ts +83 -0
- package/dist/security/action-validator.d.ts.map +1 -0
- package/dist/security/action-validator.js +36 -0
- package/dist/security/action-validator.js.map +1 -0
- package/dist/security/url-validator.d.ts +9 -0
- package/dist/security/url-validator.d.ts.map +1 -0
- package/dist/security/url-validator.js +69 -0
- package/dist/security/url-validator.js.map +1 -0
- package/dist/types.d.ts +95 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +54 -0
- package/readme.md +760 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import { validateUrl } from "../security/url-validator.js";
|
|
2
|
+
import { Tier1HTTP } from "./tiers/tier1-http.js";
|
|
3
|
+
import { Tier2Stealth } from "./tiers/tier2-stealth.js";
|
|
4
|
+
import { Tier3Browser } from "./tiers/tier3-browser.js";
|
|
5
|
+
// ─── Firecrawl ─────────────────────────────────────────────────────────────
|
|
6
|
+
//
|
|
7
|
+
// Scraper de alta qualidade com fallback automático em 3 tiers:
|
|
8
|
+
//
|
|
9
|
+
// Tier 1 → HTTP (got-scraping) (~100-500ms, sem browser)
|
|
10
|
+
// ↓ bloqueado ou conteúdo insuficiente (SPA, anti-bot básico)
|
|
11
|
+
// Tier 2 → HTTP Stealth (got-scraping) (~200-800ms, TLS fingerprint)
|
|
12
|
+
// ↓ ainda bloqueado ou SPA sem SSR
|
|
13
|
+
// Tier 3 → Playwright Chromium + stealth (~2-10s, browser completo)
|
|
14
|
+
//
|
|
15
|
+
// Anti-SSRF integrado: todas as URLs são validadas antes do scrape.
|
|
16
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
17
|
+
export class Firecrawl {
|
|
18
|
+
fullConfig;
|
|
19
|
+
tier1;
|
|
20
|
+
tier2;
|
|
21
|
+
tier3;
|
|
22
|
+
config;
|
|
23
|
+
constructor(fullConfig = {}) {
|
|
24
|
+
this.fullConfig = fullConfig;
|
|
25
|
+
this.tier1 = new Tier1HTTP();
|
|
26
|
+
this.tier2 = new Tier2Stealth();
|
|
27
|
+
this.tier3 = new Tier3Browser(fullConfig.browserConfig);
|
|
28
|
+
this.config = {
|
|
29
|
+
timeout: fullConfig.timeout ?? 30_000,
|
|
30
|
+
verbose: fullConfig.verbose ?? false,
|
|
31
|
+
forceTier: fullConfig.forceTier,
|
|
32
|
+
allowedDomains: fullConfig.allowedDomains,
|
|
33
|
+
blockedDomains: fullConfig.blockedDomains,
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
// ── Scrape de uma única URL ────────────────────────────────────────────
|
|
37
|
+
async scrape(url, options = {}) {
|
|
38
|
+
// Validação anti-SSRF antes de qualquer requisição
|
|
39
|
+
const validUrl = await validateUrl(url, {
|
|
40
|
+
allowedDomains: this.config.allowedDomains,
|
|
41
|
+
blockedDomains: this.config.blockedDomains,
|
|
42
|
+
});
|
|
43
|
+
const mergedOptions = {
|
|
44
|
+
timeout: this.config.timeout,
|
|
45
|
+
...options,
|
|
46
|
+
};
|
|
47
|
+
// ── Tier forçado: pula a cascata automática ────────────────────────
|
|
48
|
+
const forced = options.forceTier ?? this.config.forceTier;
|
|
49
|
+
if (forced === "browser") {
|
|
50
|
+
this.log("🌐 Tier 3 (Playwright) forçado");
|
|
51
|
+
return this.tier3.scrape(validUrl, mergedOptions);
|
|
52
|
+
}
|
|
53
|
+
if (forced === "stealth") {
|
|
54
|
+
this.log("🥷 Tier 2 (Stealth HTTP) forçado");
|
|
55
|
+
return this.tier2.scrape(validUrl, mergedOptions);
|
|
56
|
+
}
|
|
57
|
+
if (forced === "http") {
|
|
58
|
+
this.log("🔗 Tier 1 (HTTP) forçado");
|
|
59
|
+
return this.tier1.scrape(validUrl, mergedOptions);
|
|
60
|
+
}
|
|
61
|
+
// ── Modo automático: Tier 1 → Tier 2 → Tier 3 ────────────────────
|
|
62
|
+
// ── Tier 1: HTTP puro (fetch nativo, sem overhead de TLS) ─────────
|
|
63
|
+
let tier1Error = null;
|
|
64
|
+
try {
|
|
65
|
+
const result = await this.tier1.scrape(validUrl, mergedOptions);
|
|
66
|
+
const content = result.markdown ?? result.text ?? "";
|
|
67
|
+
// Menos de 200 chars sem dados SSR = página quase certamente vazia
|
|
68
|
+
// (SPA sem SSR, Cloudflare challenge, bloqueio silencioso, etc.)
|
|
69
|
+
if (content.length < 200 && !result.ssrData) {
|
|
70
|
+
tier1Error = "Conteúdo insuficiente após HTTP — provavelmente SPA ou bloqueio silencioso";
|
|
71
|
+
this.log(`⚠ Tier 1: ${tier1Error}`);
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
this.log(`✓ Tier 1 (HTTP) — ${result.durationMs}ms`);
|
|
75
|
+
return result;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
catch (err) {
|
|
79
|
+
tier1Error = err instanceof Error ? err.message : String(err);
|
|
80
|
+
this.log(`⚠ Tier 1 falhou: ${tier1Error}`);
|
|
81
|
+
}
|
|
82
|
+
// ── Tier 2: HTTP Stealth (got-scraping, TLS fingerprint) ──────────
|
|
83
|
+
let tier2Error = null;
|
|
84
|
+
this.log("🥷 Ativando fallback → Tier 2 (Stealth HTTP)...");
|
|
85
|
+
try {
|
|
86
|
+
const result = await this.tier2.scrape(validUrl, mergedOptions);
|
|
87
|
+
const content = result.markdown ?? result.text ?? "";
|
|
88
|
+
// Mesmo com TLS spoofing pode ser SPA que precisa de browser
|
|
89
|
+
if (content.length < 200 && !result.ssrData) {
|
|
90
|
+
tier2Error = "Conteúdo insuficiente após Stealth — SPA que precisa de browser";
|
|
91
|
+
this.log(`⚠ Tier 2: ${tier2Error}`);
|
|
92
|
+
}
|
|
93
|
+
else {
|
|
94
|
+
this.log(`✓ Tier 2 (Stealth) — ${result.durationMs}ms`);
|
|
95
|
+
return result;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
catch (err) {
|
|
99
|
+
tier2Error = err instanceof Error ? err.message : String(err);
|
|
100
|
+
this.log(`⚠ Tier 2 (Stealth) falhou: ${tier2Error}`);
|
|
101
|
+
}
|
|
102
|
+
// ── Tier 3: Playwright Chromium + stealth (fallback final) ────────
|
|
103
|
+
this.log("🌐 Ativando fallback final → Tier 3 (Playwright)...");
|
|
104
|
+
try {
|
|
105
|
+
const result = await this.tier3.scrape(validUrl, mergedOptions);
|
|
106
|
+
this.log(`✓ Tier 3 (Playwright) — ${result.durationMs}ms`);
|
|
107
|
+
return result;
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
const tier3Error = err instanceof Error ? err.message : String(err);
|
|
111
|
+
this.log(`✗ Tier 3 (Playwright) falhou: ${tier3Error}`);
|
|
112
|
+
// Todos os tiers falharam — retorna resultado com erro consolidado
|
|
113
|
+
return {
|
|
114
|
+
url: validUrl,
|
|
115
|
+
statusCode: 0,
|
|
116
|
+
title: "",
|
|
117
|
+
tier: "browser",
|
|
118
|
+
durationMs: 0,
|
|
119
|
+
error: [
|
|
120
|
+
"Todos os tiers falharam:",
|
|
121
|
+
` Tier 1 (HTTP): ${tier1Error ?? "não tentado"}`,
|
|
122
|
+
` Tier 2 (Stealth): ${tier2Error ?? "não tentado"}`,
|
|
123
|
+
` Tier 3 (Browser): ${tier3Error}`,
|
|
124
|
+
].join("\n"),
|
|
125
|
+
};
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
// ── Scrape em lote com concorrência controlada ─────────────────────────
|
|
129
|
+
/**
|
|
130
|
+
* Scrapia múltiplas URLs em paralelo com concorrência limitada.
|
|
131
|
+
* Erros em URLs individuais não derrubam o lote inteiro.
|
|
132
|
+
*
|
|
133
|
+
* @param urls - Lista de URLs a scrapeiar
|
|
134
|
+
* @param options - Opções aplicadas a todas as URLs
|
|
135
|
+
* @param concurrency - Máximo de scrapes simultâneos. Default: 3
|
|
136
|
+
*/
|
|
137
|
+
async scrapeMany(urls, options = {}, concurrency = 3) {
|
|
138
|
+
const results = [];
|
|
139
|
+
const queue = [...urls];
|
|
140
|
+
while (queue.length > 0) {
|
|
141
|
+
const batch = queue.splice(0, concurrency);
|
|
142
|
+
const settled = await Promise.allSettled(batch.map((u) => this.scrape(u, options)));
|
|
143
|
+
for (const outcome of settled) {
|
|
144
|
+
if (outcome.status === "fulfilled") {
|
|
145
|
+
results.push(outcome.value);
|
|
146
|
+
}
|
|
147
|
+
else {
|
|
148
|
+
results.push({
|
|
149
|
+
url: "unknown",
|
|
150
|
+
statusCode: 0,
|
|
151
|
+
title: "",
|
|
152
|
+
tier: "http",
|
|
153
|
+
durationMs: 0,
|
|
154
|
+
error: outcome.reason instanceof Error
|
|
155
|
+
? outcome.reason.message
|
|
156
|
+
: String(outcome.reason),
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
return results;
|
|
162
|
+
}
|
|
163
|
+
// ── Encerrar recursos ──────────────────────────────────────────────────
|
|
164
|
+
/**
|
|
165
|
+
* Fecha o browser Playwright (Tier 3).
|
|
166
|
+
* Sempre chamar ao terminar para evitar processos Chromium órfãos.
|
|
167
|
+
*/
|
|
168
|
+
async close() {
|
|
169
|
+
await this.tier3.close();
|
|
170
|
+
}
|
|
171
|
+
// ── Helpers ────────────────────────────────────────────────────────────
|
|
172
|
+
log(msg) {
|
|
173
|
+
if (this.config.verbose) {
|
|
174
|
+
console.log(`[Firecrawl] ${msg}`);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAC3D,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AAOxD,8EAA8E;AAC9E,EAAE;AACF,gEAAgE;AAChE,EAAE;AACF,wEAAwE;AACxE,2EAA2E;AAC3E,4EAA4E;AAC5E,gDAAgD;AAChD,yEAAyE;AACzE,EAAE;AACF,oEAAoE;AACpE,6EAA6E;AAE7E,MAAM,OAAO,SAAS;IAYS;IAXZ,KAAK,CAAY;IACjB,KAAK,CAAe;IACpB,KAAK,CAAe;IACpB,MAAM,CAMrB;IAEF,YAA6B,aAA8B,EAAE;QAAhC,eAAU,GAAV,UAAU,CAAsB;QAC3D,IAAI,CAAC,KAAK,GAAG,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,YAAY,EAAE,CAAC;QAChC,IAAI,CAAC,KAAK,GAAG,IAAI,YAAY,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC;QACxD,IAAI,CAAC,MAAM,GAAG;YACZ,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,MAAM;YACrC,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,KAAK;YACpC,SAAS,EAAE,UAAU,CAAC,SAAS;YAC/B,cAAc,EAAE,UAAU,CAAC,cAAc;YACzC,cAAc,EAAE,UAAU,CAAC,cAAc;SAC1C,CAAC;IACJ,CAAC;IAED,0EAA0E;IAE1E,KAAK,CAAC,MAAM,CAAC,GAAW,EAAE,UAAyB,EAAE;QACnD,mDAAmD;QACnD,MAAM,QAAQ,GAAG,MAAM,WAAW,CAAC,GAAG,EAAE;YACtC,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;YAC1C,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;SAC3C,CAAC,CAAC;QAEH,MAAM,aAAa,GAAkB;YACnC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;YAC5B,GAAG,OAAO;SACX,CAAC;QAEF,sEAAsE;QACtE,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QAE1D,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,IAAI,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;YAC3C,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QACpD,CAAC;QAED,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,IAAI,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;YAC7C,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QACpD,CAAC;QAED,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YACtB,IAAI,CAAC,GAAG,CAAC,0BAA0B,CAAC,CAAC;YACrC,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QACpD,CAAC;QAED,oEAAoE;QAEpE,qEAAqE;QACrE,IAAI,UAAU,GAAkB,IAAI,CAAC;QACrC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YAChE,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,IAAI,MAAM,C
AAC,IAAI,IAAI,EAAE,CAAC;YAErD,mEAAmE;YACnE,iEAAiE;YACjE,IAAI,OAAO,CAAC,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;gBAC5C,UAAU,GAAG,4EAA4E,CAAC;gBAC1F,IAAI,CAAC,GAAG,CAAC,cAAc,UAAU,EAAE,CAAC,CAAC;YACvC,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,GAAG,CAAC,qBAAqB,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACrD,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,UAAU,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC9D,IAAI,CAAC,GAAG,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC;QAC9C,CAAC;QAED,qEAAqE;QACrE,IAAI,UAAU,GAAkB,IAAI,CAAC;QACrC,IAAI,CAAC,GAAG,CAAC,iDAAiD,CAAC,CAAC;QAC5D,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YAChE,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YAErD,6DAA6D;YAC7D,IAAI,OAAO,CAAC,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;gBAC5C,UAAU,GAAG,iEAAiE,CAAC;gBAC/E,IAAI,CAAC,GAAG,CAAC,cAAc,UAAU,EAAE,CAAC,CAAC;YACvC,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,GAAG,CAAC,wBAAwB,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACxD,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,UAAU,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC9D,IAAI,CAAC,GAAG,CAAC,+BAA+B,UAAU,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,qEAAqE;QACrE,IAAI,CAAC,GAAG,CAAC,qDAAqD,CAAC,CAAC;QAChE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YAChE,IAAI,CAAC,GAAG,CAAC,2BAA2B,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;YAC3D,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,UAAU,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACpE,IAAI,CAAC,GAAG,CAAC,iCAAiC,UAAU,EAAE,CAAC,CAAC;YAExD,mEAAmE;YACnE,OAAO;gBACL,GAAG,EAAE,QAAQ;gBACb,UAAU,EAAE,CAAC;gBACb,KAAK,EAAE,EAAE;gBACT,IAAI,EAAE,SAAS;gBACf,UAAU,EAAE,CAAC;gBACb,KAAK,EAAE;oBACL,0BAA0B;oBAC1B,uBAAuB,UAAU,IAAI,aAAa,EAAE;oBACpD,uBAAuB,UAAU,IAAI,aAAa,EAAE;oBACpD,uBAAuB,UAAU,EAAE;iBACpC,CAAC,IAAI,CAAC,IAAI,CAAC;aACb,CAAC;QACJ,CAAC;IACH,CAAC;IAED
,0EAA0E;IAE1E;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CACd,IAAc,EACd,UAAyB,EAAE,EAC3B,WAAW,GAAG,CAAC;QAEf,MAAM,OAAO,GAAmB,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;QAExB,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CACtC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAC1C,CAAC;YAEF,KAAK,MAAM,OAAO,IAAI,OAAO,EAAE,CAAC;gBAC9B,IAAI,OAAO,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;oBACnC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;gBAC9B,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,IAAI,CAAC;wBACX,GAAG,EAAE,SAAS;wBACd,UAAU,EAAE,CAAC;wBACb,KAAK,EAAE,EAAE;wBACT,IAAI,EAAE,MAAM;wBACZ,UAAU,EAAE,CAAC;wBACb,KAAK,EACH,OAAO,CAAC,MAAM,YAAY,KAAK;4BAC7B,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO;4BACxB,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC;qBAC7B,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,0EAA0E;IAE1E;;;OAGG;IACH,KAAK,CAAC,KAAK;QACT,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IAC3B,CAAC;IAED,0EAA0E;IAElE,GAAG,CAAC,GAAW;QACrB,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YACxB,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,EAAE,CAAC,CAAC;QACpC,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier1-http.d.ts","sourceRoot":"","sources":["../../../src/scraper/tiers/tier1-http.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAmD/D,qBAAa,SAAS;IACd,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;CAmG9E"}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { gotScraping } from "got-scraping";
|
|
2
|
+
import { extractSSRData, hasEnoughContent } from "../extractors/ssr.js";
|
|
3
|
+
import { extractContent } from "../extractors/content.js";
|
|
4
|
+
import { htmlToMarkdown } from "../extractors/to-markdown.js";
|
|
5
|
+
// ─── Tier 1: got-scraping + TLS Fingerprint + Cheerio ──────────────────────
|
|
6
|
+
//
|
|
7
|
+
// Pipeline:
|
|
8
|
+
// 1. got-scraping com TLS/JA3 fingerprint spoofing que imita Chrome real
|
|
9
|
+
// 2. Detectar dados SSR embutidos (Next.js, Nuxt, Gatsby, Remix)
|
|
10
|
+
// 3. Verificar se o HTML tem conteúdo sem JS
|
|
11
|
+
// 4. Mozilla Readability → Cheerio (fallback) → Markdown
|
|
12
|
+
//
|
|
13
|
+
// Por que got-scraping em vez de fetch() nativo?
|
|
14
|
+
// ✓ TLS fingerprint (JA3/JA4) idêntico ao Chrome → bypassa Cloudflare, Akamai
|
|
15
|
+
// ✓ HTTP/2 com fingerprint consistente (TLS + ALPN + header order)
|
|
16
|
+
// ✓ Header generator integrado: UA, Sec-Ch-Ua*, Sec-Fetch-* coerentes entre si
|
|
17
|
+
// ✓ ~65-70% dos sites funcionam sem browser (~100-800ms)
|
|
18
|
+
//
|
|
19
|
+
// Limitações:
|
|
20
|
+
// ✗ Não executa JavaScript → SPAs sem SSR vão falhar
|
|
21
|
+
// → Orquestrador aciona Tier 2 (Stealth) ou Tier 3 (Playwright) se falhar
|
|
22
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
23
|
+
// Headers added on top of what got-scraping produces by itself
// (per the original author's notes):
//   - Accept-Language → pt-BR first, for local sites
//   - Cache-Control   → ask for a fresh response instead of a CDN-cached one
//   - Pragma          → same request, for older servers
//
// Headers left to got-scraping's generator, so they stay consistent with
// the Chrome fingerprint it presents:
//   - User-Agent, Accept, Accept-Encoding
//   - Sec-Ch-Ua, Sec-Ch-Ua-Mobile, Sec-Ch-Ua-Platform
//   - Sec-Fetch-Dest, Sec-Fetch-Mode, Sec-Fetch-Site, Sec-Fetch-User
//   - Upgrade-Insecure-Requests
const EXTRA_HEADERS = {
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    Pragma: "no-cache",
};
// HTTP status codes treated as an active anti-bot block rather than an
// ordinary server error.
const ANTIBOT_STATUS = new Set([
    403,
    429,
    503,
]);
|
|
40
|
+
/**
 * Tier 1 scraper: one got-scraping request, then SSR-data detection and
 * main-content extraction. It never runs page JavaScript; every failure is
 * reported by throwing, so the caller can decide whether to try another tier.
 */
export class Tier1HTTP {
    /**
     * Fetches `url` and converts the response into a scrape result.
     *
     * @param url - Address to request.
     * @param options - Optional `headers` (merged over the module defaults),
     *   `timeout` (request timeout, default 15000), `formats` (default
     *   `["markdown", "text"]`) and `onlyMainContent` (default true).
     * @returns An object with `url` (final URL reported by the response),
     *   `statusCode`, `title`, `description`, `tier: "http"`, `durationMs`,
     *   `links`, plus `markdown` / `html` / `text` for the requested formats
     *   and `ssrData` when embedded SSR data was found.
     * @throws Error when the request fails, the status is an anti-bot code or
     *   any other status >= 400, the Content-Type is neither text/html nor
     *   text/plain, or the page has too little content and no SSR data.
     */
    async scrape(url, options = {}) {
        const startedAt = Date.now();
        // ── HTTP request ───────────────────────────────────────────────────
        let response;
        try {
            response = await gotScraping({
                url,
                // Caller-supplied headers win over the module defaults.
                headers: { ...EXTRA_HEADERS, ...options.headers },
                // 4xx/5xx are inspected below instead of being thrown by got.
                throwHttpErrors: false,
                timeout: { request: options.timeout ?? 15_000 },
                // Body is delivered as a string.
                responseType: "text",
            });
        }
        catch (err) {
            // Network-level failure (the request itself did not complete).
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`Tier1 HTTP: falha na requisição — ${reason}`);
        }
        // ── Status checks ──────────────────────────────────────────────────
        const { statusCode } = response;
        if (ANTIBOT_STATUS.has(statusCode)) {
            throw new Error(`Tier1 HTTP: status ${statusCode} — bloqueado por anti-bot`);
        }
        if (statusCode >= 400) {
            throw new Error(`Tier1 HTTP: status HTTP ${statusCode}`);
        }
        // ── Content-Type check (header may arrive as string or string[]) ───
        const headerValue = response.headers["content-type"];
        const contentType = Array.isArray(headerValue) ? (headerValue[0] ?? "") : (headerValue ?? "");
        const looksTextual = contentType.includes("text/html") || contentType.includes("text/plain");
        if (!looksTextual) {
            throw new Error(`Tier1 HTTP: Content-Type "${contentType}" inesperado — esperava text/html`);
        }
        const html = response.body;
        // URL after any redirects, as reported by the response.
        const finalUrl = response.url;
        // ── Embedded SSR data and "is there content without JS?" ───────────
        const ssrData = extractSSRData(html);
        if (!(hasEnoughContent(html) || ssrData)) {
            throw new Error("Tier1 HTTP: conteúdo insuficiente — provavelmente SPA sem SSR ou anti-bot");
        }
        // ── Main-content extraction ────────────────────────────────────────
        const formats = options.formats ?? ["markdown", "text"];
        const extracted = extractContent(html, options.onlyMainContent ?? true, finalUrl);
        // Optional fields are spread in last so key order matches:
        // base fields, then markdown, html, text, ssrData.
        return {
            url: finalUrl,
            statusCode,
            title: extracted.title,
            description: extracted.description || undefined,
            tier: "http",
            durationMs: Date.now() - startedAt,
            links: extracted.links.length > 0 ? extracted.links : undefined,
            ...(formats.includes("markdown") ? { markdown: htmlToMarkdown(extracted.html) } : {}),
            ...(formats.includes("html") ? { html: extracted.html } : {}),
            ...(formats.includes("text") ? { text: extracted.text } : {}),
            ...(ssrData ? { ssrData } : {}),
        };
    }
}
|
|
120
|
+
//# sourceMappingURL=tier1-http.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier1-http.js","sourceRoot":"","sources":["../../../src/scraper/tiers/tier1-http.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAW3C,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAE9D,8EAA8E;AAC9E,EAAE;AACF,YAAY;AACZ,0EAA0E;AAC1E,kEAAkE;AAClE,8CAA8C;AAC9C,0DAA0D;AAC1D,EAAE;AACF,iDAAiD;AACjD,gFAAgF;AAChF,qEAAqE;AACrE,iFAAiF;AACjF,2DAA2D;AAC3D,EAAE;AACF,cAAc;AACd,uDAAuD;AACvD,4EAA4E;AAC5E,6EAA6E;AAE7E,iEAAiE;AACjE,6DAA6D;AAC7D,mEAAmE;AACnE,gEAAgE;AAChE,EAAE;AACF,sEAAsE;AACtE,0CAA0C;AAC1C,sDAAsD;AACtD,qEAAqE;AACrE,gCAAgC;AAChC,MAAM,aAAa,GAA2B;IAC5C,iBAAiB,EAAE,qCAAqC;IACxD,eAAe,EAAE,UAAU;IAC3B,QAAQ,EAAE,UAAU;CACrB,CAAC;AAEF,8EAA8E;AAC9E,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAEhD,MAAM,OAAO,SAAS;IACpB,KAAK,CAAC,MAAM,CAAC,GAAW,EAAE,UAAyB,EAAE;QACnD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,uEAAuE;QACvE,4EAA4E;QAC5E,yEAAyE;QACzE,sEAAsE;QACtE,2DAA2D;QAC3D,IAAI,QAAqB,CAAC;QAE1B,IAAI,CAAC;YACH,QAAQ,GAAG,CAAC,MAAM,WAAW,CAAC;gBAC5B,GAAG;gBACH,qEAAqE;gBACrE,iEAAiE;gBACjE,OAAO,EAAE,EAAE,GAAG,aAAa,EAAE,GAAG,OAAO,CAAC,OAAO,EAAE;gBAEjD,6DAA6D;gBAC7D,eAAe,EAAE,KAAK;gBAEtB,kFAAkF;gBAClF,OAAO,EAAE,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,MAAM,EAAE;gBAE/C,qCAAqC;gBACrC,YAAY,EAAE,MAAM;gBACpB,8DAA8D;aAC/D,CAAC,CAA2B,CAAC;QAChC,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,uDAAuD;YACvD,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC7D,MAAM,IAAI,KAAK,CAAC,qCAAqC,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QAED,uEAAuE;QACvE,IAAI,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5C,MAAM,IAAI,KAAK,CACb,sBAAsB,QAAQ,CAAC,UAAU,2BAA2B,CACrE,CAAC;QACJ,CAAC;QAED,IAAI,QAAQ,CAAC,UAAU,IAAI,GAAG,EAAE,CAAC;YAC/B,MAAM,IAAI,KAAK,CAAC,2BAA2B,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpE,CAAC;QAED,wEAAwE;QACxE,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAC/C,MAAM,WAAW,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAA
C,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;QAE5E,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;YAC9E,MAAM,IAAI,KAAK,CACb,6BAA6B,WAAW,mCAAmC,CAC5E,CAAC;QACJ,CAAC;QAED,6DAA6D;QAC7D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAc,CAAC;QACrC,oDAAoD;QACpD,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC;QAE9B,uEAAuE;QACvE,4DAA4D;QAC5D,+DAA+D;QAC/D,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QAErC,uEAAuE;QACvE,gFAAgF;QAChF,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CACb,2EAA2E,CAC5E,CAAC;QACJ,CAAC;QAED,uEAAuE;QACvE,kEAAkE;QAClE,+DAA+D;QAC/D,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;QACxD,MAAM,SAAS,GAAG,cAAc,CAC9B,IAAI,EACJ,OAAO,CAAC,eAAe,IAAI,IAAI,EAC/B,QAAQ,CACT,CAAC;QAEF,MAAM,MAAM,GAAiB;YAC3B,GAAG,EAAE,QAAQ;YACb,UAAU,EAAE,QAAQ,CAAC,UAAU;YAC/B,KAAK,EAAE,SAAS,CAAC,KAAK;YACtB,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS;YAC/C,IAAI,EAAE,MAAM;YACZ,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;YAClC,KAAK,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;SAChE,CAAC;QAEF,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC;YAAE,MAAM,CAAC,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACnF,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAM,MAAM,CAAC,IAAI,GAAO,SAAS,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAM,MAAM,CAAC,IAAI,GAAO,SAAS,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO;YAAuB,MAAM,CAAC,OAAO,GAAI,OAAO,CAAC;QAE5D,OAAO,MAAM,CAAC;IAChB,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier2-stealth.d.ts","sourceRoot":"","sources":["../../../src/scraper/tiers/tier2-stealth.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAgC/D,qBAAa,YAAY;IACjB,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;CAuG9E"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { gotScraping } from "got-scraping";
|
|
2
|
+
import { extractSSRData, hasEnoughContent } from "../extractors/ssr.js";
|
|
3
|
+
import { extractContent } from "../extractors/content.js";
|
|
4
|
+
import { htmlToMarkdown } from "../extractors/to-markdown.js";
|
|
5
|
+
// ─── Tier 2: HTTP Stealth com TLS Fingerprint ──────────────────────────────
|
|
6
|
+
//
|
|
7
|
+
// Pipeline:
|
|
8
|
+
// 1. gotScraping → HTTP com TLS fingerprint spoofing (JA3/JA4 anti-bot)
|
|
9
|
+
// 2. Gera headers realistas de Chrome automaticamente via headerGeneratorOptions
|
|
10
|
+
// 3. Detectar dados SSR embutidos (Next.js, Nuxt, Gatsby, Remix)
|
|
11
|
+
// 4. Verificar se o HTML tem conteúdo sem JS
|
|
12
|
+
// 5. Extrair conteúdo principal + converter para Markdown
|
|
13
|
+
//
|
|
14
|
+
// Ativado quando o Tier 1 (HTTP simples) for bloqueado por anti-bot básico.
|
|
15
|
+
// Resolve a maioria dos casos de TLS/JA3 fingerprinting.
|
|
16
|
+
//
|
|
17
|
+
// Se ainda falhar (SPA sem SSR, anti-bot avançado) → lança Error para o
|
|
18
|
+
// orquestrador acionar o Tier 3 (Playwright Chromium).
|
|
19
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
20
|
+
// HTTP status codes treated as an anti-bot block.
const ANTIBOT_STATUS = new Set([
    403,
    429,
    503,
]);
|
|
22
|
+
/**
 * Tier 2 scraper: a got-scraping GET configured with explicit header-generator
 * options and browser-like headers, followed by SSR-data detection and
 * main-content extraction. Every failure is reported by throwing, so the
 * caller can fall back to another tier.
 */
export class Tier2Stealth {
    /**
     * Fetches `url` and converts the response into a scrape result.
     *
     * @param url - Address to request.
     * @param options - Optional `headers` (override the built-in ones),
     *   `timeout` (request timeout, default 30000), `formats` (default
     *   `["markdown", "text"]`) and `onlyMainContent` (default true).
     * @returns An object with `url`, `statusCode`, `title`, `description`,
     *   `tier: "stealth"`, `durationMs`, `links`, plus `markdown` / `html` /
     *   `text` for the requested formats and `ssrData` when present.
     * @throws Error when the request fails, the status is an anti-bot code or
     *   any other status >= 400, the Content-Type is neither text/html nor
     *   text/plain, or the page has too little content and no SSR data.
     */
    async scrape(url, options = {}) {
        const startedAt = Date.now();
        // ── HTTP request ───────────────────────────────────────────────────
        let response;
        try {
            response = await gotScraping({
                url,
                method: "GET",
                // Constraints for the generated browser headers.
                headerGeneratorOptions: {
                    browsers: [{ name: "chrome", minVersion: 120 }],
                    operatingSystems: ["macos", "windows"],
                    devices: ["desktop"],
                    locales: ["pt-BR", "pt", "en-US"],
                },
                // Explicit navigation-style headers; caller headers come last
                // and therefore take precedence.
                headers: {
                    accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "accept-language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
                    "cache-control": "no-cache",
                    pragma: "no-cache",
                    "sec-fetch-dest": "document",
                    "sec-fetch-mode": "navigate",
                    "sec-fetch-site": "none",
                    "sec-fetch-user": "?1",
                    "upgrade-insecure-requests": "1",
                    ...options.headers,
                },
                timeout: { request: options.timeout ?? 30_000 },
                followRedirect: true,
                maxRedirects: 10,
                retry: { limit: 2, methods: ["GET"] },
                // 4xx/5xx are inspected below instead of being thrown by got.
                throwHttpErrors: false,
                decompress: true,
            });
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`Tier2 Stealth: falha na requisição — ${reason}`);
        }
        const { body: html, statusCode } = response;
        // Fall back to the requested URL when the response reports none.
        const finalUrl = response.url ?? url;
        // ── Block / status checks ──────────────────────────────────────────
        if (ANTIBOT_STATUS.has(statusCode)) {
            throw new Error(`Tier2 Stealth: status ${statusCode} — bloqueado por anti-bot`);
        }
        if (statusCode >= 400) {
            throw new Error(`Tier2 Stealth: status ${statusCode}`);
        }
        const contentType = response.headers["content-type"] ?? "";
        const looksTextual = String(contentType).includes("text/html") ||
            String(contentType).includes("text/plain");
        if (!looksTextual) {
            throw new Error(`Tier2 Stealth: Content-Type inesperado "${contentType}" — esperava text/html`);
        }
        // ── Embedded SSR data and "is there content without JS?" ───────────
        const ssrData = extractSSRData(html);
        if (!(hasEnoughContent(html) || ssrData)) {
            throw new Error("Tier2 Stealth: conteúdo insuficiente — página precisa de JavaScript para renderizar");
        }
        // ── Content extraction ─────────────────────────────────────────────
        const formats = options.formats ?? ["markdown", "text"];
        const extracted = extractContent(html, options.onlyMainContent ?? true, finalUrl);
        // Optional fields are spread in last so key order matches:
        // base fields, then markdown, html, text, ssrData.
        return {
            url: finalUrl,
            statusCode,
            title: extracted.title,
            description: extracted.description || undefined,
            tier: "stealth",
            durationMs: Date.now() - startedAt,
            links: extracted.links.length > 0 ? extracted.links : undefined,
            ...(formats.includes("markdown") ? { markdown: htmlToMarkdown(extracted.html) } : {}),
            ...(formats.includes("html") ? { html: extracted.html } : {}),
            ...(formats.includes("text") ? { text: extracted.text } : {}),
            ...(ssrData ? { ssrData } : {}),
        };
    }
}
|
|
106
|
+
//# sourceMappingURL=tier2-stealth.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier2-stealth.js","sourceRoot":"","sources":["../../../src/scraper/tiers/tier2-stealth.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAE3C,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAU9D,8EAA8E;AAC9E,EAAE;AACF,YAAY;AACZ,yEAAyE;AACzE,kFAAkF;AAClF,kEAAkE;AAClE,8CAA8C;AAC9C,2DAA2D;AAC3D,EAAE;AACF,4EAA4E;AAC5E,yDAAyD;AACzD,EAAE;AACF,wEAAwE;AACxE,uDAAuD;AACvD,6EAA6E;AAE7E,iDAAiD;AACjD,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAEhD,MAAM,OAAO,YAAY;IACvB,KAAK,CAAC,MAAM,CAAC,GAAW,EAAE,UAAyB,EAAE;QACnD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,uEAAuE;QACvE,IAAI,QAAqB,CAAC;QAC1B,IAAI,CAAC;YACH,QAAQ,GAAG,CAAC,MAAM,WAAW,CAAC;gBAC5B,GAAG;gBACH,MAAM,EAAE,KAAK;gBACb,mDAAmD;gBACnD,sBAAsB,EAAE;oBACtB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;oBAC/C,gBAAgB,EAAE,CAAC,OAAO,EAAE,SAAS,CAAC;oBACtC,OAAO,EAAE,CAAC,SAAS,CAAC;oBACpB,OAAO,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC;iBAClC;gBACD,0CAA0C;gBAC1C,OAAO,EAAE;oBACP,MAAM,EACJ,kGAAkG;oBACpG,iBAAiB,EAAE,qCAAqC;oBACxD,eAAe,EAAE,UAAU;oBAC3B,MAAM,EAAE,UAAU;oBAClB,gBAAgB,EAAE,UAAU;oBAC5B,gBAAgB,EAAE,UAAU;oBAC5B,gBAAgB,EAAE,MAAM;oBACxB,gBAAgB,EAAE,IAAI;oBACtB,2BAA2B,EAAE,GAAG;oBAChC,GAAG,OAAO,CAAC,OAAO;iBACnB;gBACD,OAAO,EAAE,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,MAAM,EAAE;gBAC/C,cAAc,EAAE,IAAI;gBACpB,YAAY,EAAE,EAAE;gBAChB,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,KAAK,CAAC,EAAE;gBACrC,eAAe,EAAE,KAAK;gBACtB,UAAU,EAAE,IAAI;aACjB,CAAC,CAA2B,CAAC;QAChC,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CACb,wCAAwC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAC3F,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;QAC3B,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,CAAC;QACvC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;QAErC,uEAAuE;QACvE,IAAI,cAAc,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,yBAAyB,UAAU,2BAA2B
,CAC/D,CAAC;QACJ,CAAC;QAED,IAAI,UAAU,IAAI,GAAG,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,yBAAyB,UAAU,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAC3D,IACE,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC;YAC1C,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAC3C,CAAC;YACD,MAAM,IAAI,KAAK,CACb,2CAA2C,WAAW,wBAAwB,CAC/E,CAAC;QACJ,CAAC;QAED,uEAAuE;QACvE,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QAErC,uEAAuE;QACvE,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CACb,qFAAqF,CACtF,CAAC;QACJ,CAAC;QAED,uEAAuE;QACvE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;QACxD,MAAM,SAAS,GAAG,cAAc,CAC9B,IAAI,EACJ,OAAO,CAAC,eAAe,IAAI,IAAI,EAC/B,QAAQ,CACT,CAAC;QAEF,MAAM,MAAM,GAAiB;YAC3B,GAAG,EAAE,QAAQ;YACb,UAAU;YACV,KAAK,EAAE,SAAS,CAAC,KAAK;YACtB,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS;YAC/C,IAAI,EAAE,SAAS;YACf,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;YAClC,KAAK,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;SAChE,CAAC;QAEF,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC;YAAE,MAAM,CAAC,QAAQ,GAAG,cAAc,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACnF,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;QAC/D,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAM,MAAM,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC;QAC/D,IAAI,OAAO;YAAuB,MAAM,CAAC,OAAO,GAAG,OAAO,CAAC;QAE3D,OAAO,MAAM,CAAC;IAChB,CAAC;CACF"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { ScrapeOptions, ScrapeResult, FirecrawlConfig } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the `Tier3Browser` class.
 *
 * This is declaration output only: it lists the members and their signatures,
 * and contains no method bodies. Private members appear by name without types.
 */
export declare class Tier3Browser {
|
|
3
|
+
/** Private member; its type is not part of this declaration. NOTE(review): presumably the browser handle obtained via `getBrowser` — confirm against the implementation. */
private browser;
|
|
4
|
+
/** Private, read-only member. NOTE(review): presumably stores the `browserConfig` passed to the constructor — confirm against the implementation. */
private readonly browserConfig;
|
|
5
|
+
/**
 * @param browserConfig Optional configuration, typed as the `browserConfig`
 *   property of `FirecrawlConfig`.
 */
constructor(browserConfig?: FirecrawlConfig["browserConfig"]);
|
|
6
|
+
/** Private member; declared here by name only, with no signature. */
private getBrowser;
|
|
7
|
+
/**
 * Scrapes the given URL.
 *
 * @param url Address of the page to scrape.
 * @param options Optional `ScrapeOptions`.
 * @returns A promise resolving to a `ScrapeResult`.
 */
scrape(url: string, options?: ScrapeOptions): Promise<ScrapeResult>;
|
|
8
|
+
/**
 * @returns A promise that resolves with no value.
 * NOTE(review): presumably releases the underlying browser — confirm against the implementation.
 */
close(): Promise<void>;
|
|
9
|
+
}
|
|
10
|
+
//# sourceMappingURL=tier3-browser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tier3-browser.d.ts","sourceRoot":"","sources":["../../../src/scraper/tiers/tier3-browser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,aAAa,EACb,YAAY,EAEZ,eAAe,EAChB,MAAM,aAAa,CAAC;AA4RrB,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAgD;gBAElE,aAAa,GAAE,eAAe,CAAC,eAAe,CAAM;YAMlD,UAAU;IAgClB,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;IAkNvE,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAM7B"}
|