auspex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/agent/actions.d.ts +5 -0
- package/dist/agent/actions.d.ts.map +1 -0
- package/dist/agent/actions.js +26 -0
- package/dist/agent/actions.js.map +1 -0
- package/dist/agent/agent.d.ts +12 -0
- package/dist/agent/agent.d.ts.map +1 -0
- package/dist/agent/agent.js +147 -0
- package/dist/agent/agent.js.map +1 -0
- package/dist/agent/loop.d.ts +6 -0
- package/dist/agent/loop.d.ts.map +1 -0
- package/dist/agent/loop.js +165 -0
- package/dist/agent/loop.js.map +1 -0
- package/dist/agent/report.d.ts +3 -0
- package/dist/agent/report.d.ts.map +1 -0
- package/dist/agent/report.js +90 -0
- package/dist/agent/report.js.map +1 -0
- package/dist/browser/executor.d.ts +5 -0
- package/dist/browser/executor.d.ts.map +1 -0
- package/dist/browser/executor.js +33 -0
- package/dist/browser/executor.js.map +1 -0
- package/dist/browser/snapshot.d.ts +6 -0
- package/dist/browser/snapshot.d.ts.map +1 -0
- package/dist/browser/snapshot.js +145 -0
- package/dist/browser/snapshot.js.map +1 -0
- package/dist/config/defaults.d.ts +10 -0
- package/dist/config/defaults.d.ts.map +1 -0
- package/dist/config/defaults.js +10 -0
- package/dist/config/defaults.js.map +1 -0
- package/dist/config/schema.d.ts +59 -0
- package/dist/config/schema.d.ts.map +1 -0
- package/dist/config/schema.js +23 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/client.d.ts +23 -0
- package/dist/llm/client.d.ts.map +1 -0
- package/dist/llm/client.js +51 -0
- package/dist/llm/client.js.map +1 -0
- package/dist/llm/prompt.d.ts +3 -0
- package/dist/llm/prompt.d.ts.map +1 -0
- package/dist/llm/prompt.js +36 -0
- package/dist/llm/prompt.js.map +1 -0
- package/dist/scraper/extractors/content.d.ts +22 -0
- package/dist/scraper/extractors/content.d.ts.map +1 -0
- package/dist/scraper/extractors/content.js +237 -0
- package/dist/scraper/extractors/content.js.map +1 -0
- package/dist/scraper/extractors/ssr.d.ts +17 -0
- package/dist/scraper/extractors/ssr.d.ts.map +1 -0
- package/dist/scraper/extractors/ssr.js +162 -0
- package/dist/scraper/extractors/ssr.js.map +1 -0
- package/dist/scraper/extractors/to-markdown.d.ts +5 -0
- package/dist/scraper/extractors/to-markdown.d.ts.map +1 -0
- package/dist/scraper/extractors/to-markdown.js +103 -0
- package/dist/scraper/extractors/to-markdown.js.map +1 -0
- package/dist/scraper/index.d.ts +27 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +178 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/scraper/tiers/tier1-http.d.ts +5 -0
- package/dist/scraper/tiers/tier1-http.d.ts.map +1 -0
- package/dist/scraper/tiers/tier1-http.js +120 -0
- package/dist/scraper/tiers/tier1-http.js.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts +5 -0
- package/dist/scraper/tiers/tier2-stealth.d.ts.map +1 -0
- package/dist/scraper/tiers/tier2-stealth.js +106 -0
- package/dist/scraper/tiers/tier2-stealth.js.map +1 -0
- package/dist/scraper/tiers/tier3-browser.d.ts +10 -0
- package/dist/scraper/tiers/tier3-browser.d.ts.map +1 -0
- package/dist/scraper/tiers/tier3-browser.js +504 -0
- package/dist/scraper/tiers/tier3-browser.js.map +1 -0
- package/dist/scraper/types.d.ts +130 -0
- package/dist/scraper/types.d.ts.map +1 -0
- package/dist/scraper/types.js +3 -0
- package/dist/scraper/types.js.map +1 -0
- package/dist/security/action-validator.d.ts +83 -0
- package/dist/security/action-validator.d.ts.map +1 -0
- package/dist/security/action-validator.js +36 -0
- package/dist/security/action-validator.js.map +1 -0
- package/dist/security/url-validator.d.ts +9 -0
- package/dist/security/url-validator.d.ts.map +1 -0
- package/dist/security/url-validator.js +69 -0
- package/dist/security/url-validator.js.map +1 -0
- package/dist/types.d.ts +95 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +54 -0
- package/readme.md +760 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import { JSDOM } from "jsdom";
|
|
3
|
+
import { Readability } from "@mozilla/readability";
|
|
4
|
+
// ─── Seletores de "ruído" a remover (fallback Cheerio) ────────────────────────
|
|
5
|
+
/**
 * CSS selectors for page "noise" that the Cheerio fallback removes from the
 * document before reading its content.
 */
const NOISE_SELECTORS = [
    // Structural / non-readable elements
    "script", "style", "noscript", "iframe", "svg",
    // Navigation and page chrome
    "nav", "header", "footer",
    '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
    ".nav", ".navbar", ".navigation", ".menu",
    ".header", ".footer", ".site-header", ".site-footer",
    // Sidebars
    "aside", ".sidebar", ".side-bar", "#sidebar", '[role="complementary"]',
    // Ads and promotions
    ".ad", ".ads", ".adsbygoogle", ".advertisement", ".promo", ".banner",
    '[id*="google_ads"]', '[class*="sponsored"]',
    // Legal banners
    ".cookie-banner", ".cookie-notice", ".cookie-consent", ".gdpr",
    // Overlays
    ".popup", ".modal", ".overlay", ".backdrop",
    // Social widgets and miscellaneous
    ".social-share", ".share-buttons", ".related-posts",
    ".comments", "#comments", ".comment-section",
    ".newsletter", ".subscribe",
];
|
|
62
|
+
// ─── Seletores de conteúdo principal (fallback Cheerio) ───────────────────────
|
|
63
|
+
/**
 * Candidate selectors for the main content area, tried in this order by the
 * Cheerio fallback.
 */
const MAIN_CONTENT_SELECTORS = [
    // Semantic elements and ARIA role
    "main", "article", '[role="main"]',
    // Common ids
    "#main-content", "#content", "#main",
    // Common class names
    ".main-content", ".content", ".post-content", ".article-content",
    ".entry-content", ".page-content", ".blog-post", ".blog-content",
    ".post-body", ".article-body",
];
|
|
81
|
+
// ─── Extração de links ─────────────────────────────────────────────────────
|
|
82
|
+
function extractLinks($, baseUrl) {
|
|
83
|
+
const links = [];
|
|
84
|
+
const seen = new Set();
|
|
85
|
+
$("a[href]").each((_, el) => {
|
|
86
|
+
const href = $(el).attr("href");
|
|
87
|
+
if (!href)
|
|
88
|
+
return;
|
|
89
|
+
if (href.startsWith("#"))
|
|
90
|
+
return;
|
|
91
|
+
if (href.startsWith("javascript:"))
|
|
92
|
+
return;
|
|
93
|
+
if (href.startsWith("mailto:"))
|
|
94
|
+
return;
|
|
95
|
+
if (href.startsWith("tel:"))
|
|
96
|
+
return;
|
|
97
|
+
// Tenta resolver URL relativa
|
|
98
|
+
let resolved = href;
|
|
99
|
+
if (baseUrl && (href.startsWith("/") || href.startsWith("."))) {
|
|
100
|
+
try {
|
|
101
|
+
resolved = new URL(href, baseUrl).href;
|
|
102
|
+
}
|
|
103
|
+
catch {
|
|
104
|
+
return;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
if (!seen.has(resolved)) {
|
|
108
|
+
seen.add(resolved);
|
|
109
|
+
links.push(resolved);
|
|
110
|
+
}
|
|
111
|
+
});
|
|
112
|
+
return links;
|
|
113
|
+
}
|
|
114
|
+
// ─── Extração de metadados ─────────────────────────────────────────────────
|
|
115
|
+
function extractMeta($) {
|
|
116
|
+
const title = $("title").first().text().trim() ||
|
|
117
|
+
$('meta[property="og:title"]').attr("content")?.trim() ||
|
|
118
|
+
$("h1").first().text().trim() ||
|
|
119
|
+
"";
|
|
120
|
+
const description = $('meta[name="description"]').attr("content")?.trim() ||
|
|
121
|
+
$('meta[property="og:description"]').attr("content")?.trim() ||
|
|
122
|
+
$('meta[name="twitter:description"]').attr("content")?.trim() ||
|
|
123
|
+
"";
|
|
124
|
+
return { title, description };
|
|
125
|
+
}
|
|
126
|
+
// ─── Mozilla Readability (caminho principal) ───────────────────────────────────
|
|
127
|
+
//
|
|
128
|
+
// Mesmo algoritmo que o Firefox usa no Reader Mode e que o Firecrawl real usa.
|
|
129
|
+
// Produz conteúdo semanticamente limpo, muito superior a heurísticas manuais.
|
|
130
|
+
/**
 * Extracts the main article of a page with Mozilla Readability.
 *
 * The HTML is parsed into a JSDOM document (Readability needs a DOM) and the
 * JSDOM window is always closed afterwards so the parsed document does not
 * stay alive after the call.
 *
 * @param html - Full page HTML
 * @param baseUrl - Page URL handed to JSDOM; defaults to `https://example.com`
 * @returns `{ html, text, title }`, or `null` when JSDOM/Readability throws,
 *          no article is found, or the article has fewer than 100 characters
 *          of trimmed text (the caller then falls back to Cheerio)
 */
function extractWithReadability(html, baseUrl) {
    let dom;
    try {
        dom = new JSDOM(html, {
            // A document URL is required so relative links can be resolved.
            url: baseUrl ?? "https://example.com",
        });
        const reader = new Readability(dom.window.document, {
            // Accept short articles: 50 characters instead of the library default.
            charThreshold: 50,
        });
        const article = reader.parse();
        const textLength = article?.textContent?.trim()?.length ?? 0;
        // Reject results that did not produce enough content.
        if (!article || !article.content || textLength < 100) {
            return null;
        }
        return {
            html: article.content,
            text: (article.textContent ?? "").replace(/\s+/g, " ").trim(),
            title: article.title ?? "",
        };
    }
    catch {
        // JSDOM or Readability failed — signal the caller to use the fallback.
        return null;
    }
    finally {
        // Release the JSDOM window; `dom` is undefined if construction threw.
        dom?.window.close();
    }
}
|
|
158
|
+
// ─── Cheerio (fallback) ───────────────────────────────────────────────────────
|
|
159
|
+
/**
 * Heuristic content extraction with Cheerio.
 *
 * Removes every `NOISE_SELECTORS` match from the document (this mutates the
 * document behind `$`), picks a content root, strips `style`, `onclick` and
 * `class` attributes from the root's descendants, and returns the root's
 * inner HTML together with its whitespace-collapsed text.
 *
 * @param $ - Cheerio query function for the loaded document
 * @param onlyMain - When truthy, use the first `MAIN_CONTENT_SELECTORS` match
 *                   whose text is longer than 150 characters; otherwise (or
 *                   when none qualifies) use `<body>`
 * @returns `{ html, text }`
 */
function extractWithCheerio($, onlyMain) {
    const collapse = (value) => value.replace(/\s+/g, " ").trim();
    for (const noise of NOISE_SELECTORS) {
        try {
            $(noise).remove();
        }
        catch {
            // Selector not usable in this document — skip it.
        }
    }
    // <body> is the safe default root.
    let root = $("body");
    if (onlyMain) {
        for (const candidate of MAIN_CONTENT_SELECTORS) {
            const matched = $(candidate);
            if (matched.length === 0)
                continue;
            const first = matched.first();
            if (collapse(first.text()).length > 150) {
                root = first;
                break;
            }
        }
    }
    // Drop presentation and inline-handler attributes from descendants.
    for (const attribute of ["style", "onclick", "class"]) {
        root.find(`[${attribute}]`).removeAttr(attribute);
    }
    return {
        html: root.html() ?? "",
        text: collapse(root.text()),
    };
}
|
|
194
|
+
// ─── Extração principal ────────────────────────────────────────────────────────
|
|
195
|
+
/**
 * Extracts the meaningful content of an HTML page.
 *
 * Two-level strategy:
 *  1. Mozilla Readability — used only when `onlyMain` is truthy.
 *  2. Cheerio with heuristic selectors — used when Readability is skipped or
 *     returns nothing.
 *
 * Title, description and links are read from the untouched document before
 * the Cheerio fallback removes navigation elements.
 *
 * @param html - Full page HTML
 * @param onlyMain - Try to extract only the main content (default `true`)
 * @param baseUrl - Base URL used to resolve links and passed to Readability
 * @returns `{ html, text, title, description, links }`
 */
export function extractContent(html, onlyMain = true, baseUrl) {
    const $ = load(html);
    const meta = extractMeta($);
    const links = extractLinks($, baseUrl);
    // Path 1: Mozilla Readability.
    const readable = onlyMain ? extractWithReadability(html, baseUrl) : null;
    if (readable) {
        return {
            html: readable.html,
            text: readable.text,
            // Prefer Readability's title; fall back to the document's own.
            title: readable.title || meta.title,
            description: meta.description,
            links,
        };
    }
    // Path 2: Cheerio fallback.
    const fallback = extractWithCheerio($, onlyMain);
    return {
        html: fallback.html,
        text: fallback.text,
        title: meta.title,
        description: meta.description,
        links,
    };
}
|
|
237
|
+
//# sourceMappingURL=content.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content.js","sourceRoot":"","sources":["../../../src/scraper/extractors/content.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAmB,MAAM,SAAS,CAAC;AAChD,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,iFAAiF;AAEjF,MAAM,eAAe,GAAG;IACtB,aAAa;IACb,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,YAAY;IACZ,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,qBAAqB;IACrB,iBAAiB;IACjB,sBAAsB;IACtB,MAAM;IACN,SAAS;IACT,aAAa;IACb,OAAO;IACP,SAAS;IACT,SAAS;IACT,cAAc;IACd,cAAc;IACd,UAAU;IACV,OAAO;IACP,UAAU;IACV,WAAW;IACX,UAAU;IACV,wBAAwB;IACxB,uBAAuB;IACvB,KAAK;IACL,MAAM;IACN,cAAc;IACd,gBAAgB;IAChB,QAAQ;IACR,SAAS;IACT,oBAAoB;IACpB,sBAAsB;IACtB,iBAAiB;IACjB,gBAAgB;IAChB,gBAAgB;IAChB,iBAAiB;IACjB,OAAO;IACP,WAAW;IACX,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,WAAW;IACX,gBAAgB;IAChB,eAAe;IACf,gBAAgB;IAChB,gBAAgB;IAChB,WAAW;IACX,WAAW;IACX,kBAAkB;IAClB,aAAa;IACb,YAAY;CACJ,CAAC;AAEX,iFAAiF;AAEjF,MAAM,sBAAsB,GAAG;IAC7B,MAAM;IACN,SAAS;IACT,eAAe;IACf,eAAe;IACf,UAAU;IACV,OAAO;IACP,eAAe;IACf,UAAU;IACV,eAAe;IACf,kBAAkB;IAClB,gBAAgB;IAChB,eAAe;IACf,YAAY;IACZ,eAAe;IACf,YAAY;IACZ,eAAe;CACP,CAAC;AAYX,8EAA8E;AAE9E,SAAS,YAAY,CAAC,CAAa,EAAE,OAAgB;IACnD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC1B,MAAM,IAAI,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO;QAClB,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO;QACjC,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;YAAE,OAAO;QAC3C,IAAI,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO;QACvC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO;QAEpC,8BAA8B;QAC9B,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,OAAO,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC9D,IAAI,CAAC;gBACH,QAAQ,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YACzC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO;YACT,CAAC;QACH,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACxB,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACnB,KA
AK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACvB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC;AAED,8EAA8E;AAE9E,SAAS,WAAW,CAAC,CAAa;IAChC,MAAM,KAAK,GACT,CAAC,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;QAChC,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QACtD,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;QAC7B,EAAE,CAAC;IAEL,MAAM,WAAW,GACf,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QACrD,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QAC5D,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QAC7D,EAAE,CAAC;IAEL,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,CAAC;AAChC,CAAC;AAED,kFAAkF;AAClF,EAAE;AACF,+EAA+E;AAC/E,8EAA8E;AAE9E,SAAS,sBAAsB,CAC7B,IAAY,EACZ,OAAgB;IAEhB,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;YAC1B,wEAAwE;YACxE,GAAG,EAAE,OAAO,IAAI,qBAAqB;SACtC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,6DAA6D;YAC7D,aAAa,EAAE,EAAE;SAClB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,8CAA8C;QAC9C,IACE,CAAC,OAAO;YACR,CAAC,OAAO,CAAC,OAAO;YAChB,CAAC,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,GAAG,EAChD,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,IAAI,EAAE,OAAO,CAAC,OAAO;YACrB,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE;YAC7D,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;SAC3B,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,0DAA0D;QAC1D,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF,SAAS,kBAAkB,CACzB,CAAa,EACb,QAAiB;IAEjB,eAAe;IACf,eAAe,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,EAAE;QACnC,IAAI,CAAC;YACH,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC;QACvB,CAAC;QAAC,MAAM,CAAC;YACP,wCAAwC;QAC1C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,qCAAqC;IACrC,IAAI,SAAS,GAAyB,CAAC,CAAC,MAAM,CAAC,CAAC;IAEhD,IAAI,QAAQ,EAAE,CAAC;QACb,6CAA6C;QAC7C,KAAK,MAAM,QAAQ,IAAI,sBAAsB,EAAE,CAAC;YAC9C,MAAM,EAAE,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;YACvB,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,EAAE,CAAC,KAAK,EAAE,CAAC,IAAI,EA
AE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3D,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;oBACtB,SAAS,GAAG,EAAE,CAAC,KAAK,EAAE,CAAC;oBACvB,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAC9C,SAAS,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;IAClD,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACvC,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAC5B,CAAC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;IAC3C,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAE1D,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;AACrC,CAAC;AAED,kFAAkF;AAElF;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,cAAc,CAC5B,IAAY,EACZ,QAAQ,GAAG,IAAI,EACf,OAAgB;IAEhB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,mEAAmE;IACnE,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;IAEvC,6EAA6E;IAC7E,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,sBAAsB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACvD,IAAI,QAAQ,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,gEAAgE;gBAChE,KAAK,EAAE,QAAQ,CAAC,KAAK,IAAI,KAAK;gBAC9B,WAAW;gBACX,KAAK;aACN,CAAC;QACJ,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,MAAM,OAAO,GAAG,kBAAkB,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IAChD,OAAO;QACL,IAAI,EAAE,OAAO,CAAC,IAAI;QAClB,IAAI,EAAE,OAAO,CAAC,IAAI;QAClB,KAAK;QACL,WAAW;QACX,KAAK;KACN,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { SSRData } from "../types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Tries to extract JSON data embedded in the initial HTML by SSR frameworks.
|
|
4
|
+
* Many Next.js/Nuxt/SvelteKit sites do not need a browser —
|
|
5
|
+
* the data is already in the HTML and can be extracted with Cheerio!
|
|
6
|
+
*/
|
|
7
|
+
export declare function extractSSRData(html: string): SSRData | null;
|
|
8
|
+
/**
|
|
9
|
+
* Checks whether the page has enough content without JavaScript.
|
|
10
|
+
*
|
|
11
|
+
* Returns `false` when:
|
|
12
|
+
* - The visible text is very short (< 200 chars) → SPA has not rendered yet
|
|
13
|
+
* - It detects anti-bot / challenge-page patterns (Cloudflare, DDoS-Guard, etc.)
|
|
14
|
+
* - It detects loading screens (enable-JavaScript text, spinners, etc.)
|
|
15
|
+
*/
|
|
16
|
+
export declare function hasEnoughContent(html: string): boolean;
|
|
17
|
+
//# sourceMappingURL=ssr.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ssr.d.ts","sourceRoot":"","sources":["../../../src/scraper/extractors/ssr.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAoB3C;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CA8F3D;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAwDtD"}
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
// ─── Detectores de dados SSR ───────────────────────────────────────────────
|
|
3
|
+
//
|
|
4
|
+
// Frameworks modernos embutem dados no HTML inicial para hidratação no cliente.
|
|
5
|
+
// Extrair esses dados evita a necessidade de browser em ~60-70% dos sites.
|
|
6
|
+
//
|
|
7
|
+
// Ordem: do mais específico para o mais genérico.
|
|
8
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
9
|
+
/** Tenta parsear JSON com segurança; retorna null em caso de erro */
|
|
10
|
+
function tryParse(raw) {
|
|
11
|
+
if (!raw?.trim())
|
|
12
|
+
return null;
|
|
13
|
+
try {
|
|
14
|
+
return JSON.parse(raw.trim());
|
|
15
|
+
}
|
|
16
|
+
catch {
|
|
17
|
+
return null;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
/**
 * Detectors for state assigned to a `window.*` global inside an inline
 * script, tried in this order (most specific first, generic last).
 * `pattern` captures the payload in group 1; `decode`, when present, is
 * applied to the capture before JSON parsing.
 */
const WINDOW_STATE_DETECTORS = [
    // Nuxt: window.__NUXT__ = {...}
    { type: "nuxt", pattern: /window\.__NUXT__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // Nuxt alternative: window.__nuxt_state__ = '<URI-encoded JSON>'
    { type: "nuxt", pattern: /window\.__nuxt_state__\s*=\s*'([^']+)'/, decode: decodeURIComponent },
    // Gatsby: window.___gatsby / window.___GATSBY
    { type: "gatsby", pattern: /window\.___(?:gatsby|GATSBY)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // Remix: window.__remixContext / window.__remixRouterManifest
    { type: "remix", pattern: /window\.__remix(?:Context|RouterManifest)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // TanStack Router / Start
    { type: "tanstack", pattern: /window\.__(?:TSR_DEHYDRATED|TANSTACK_ROUTER_CONTEXT|TRT_DEHYDRATED)__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // Vue SSR context / store / pinia
    { type: "vue", pattern: /window\.__(?:VUE_SSR_CONTEXT__|VUE_STORE__|pinia)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // Legacy SvelteKit global
    { type: "sveltekit", pattern: /window\.__(?:SVELTEKIT|sveltekit)__?\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
    // Generic serialized stores (__INITIAL_STATE__, __REDUX_STATE__, ...)
    { type: "generic", pattern: /window\.__(?:INITIAL_STATE|APP_STATE|REDUX_STATE|STORE_STATE|DATA|STATE|PROPS)__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/ },
];
/**
 * Tries to extract JSON data that an SSR framework embedded in the initial
 * HTML, so the page data can be read without running a browser.
 *
 * Detection order: Next.js (`#__NEXT_DATA__`), Angular (`script#ng-state`),
 * SvelteKit (`script[data-sveltekit-fetched]`), then the `window.*`
 * assignments listed in `WINDOW_STATE_DETECTORS`. Only payloads that are
 * plain JSON are accepted; anything `tryParse` rejects is skipped and the
 * next detector is tried.
 *
 * @param html - Full page HTML
 * @returns `{ type, data }` for the first detector that yields a truthy
 *          parsed value, or `null` when none does
 */
export function extractSSRData(html) {
    const $ = load(html);
    // Next.js: <script id="__NEXT_DATA__" type="application/json">
    const nextData = tryParse($("#__NEXT_DATA__").text().trim());
    if (nextData)
        return { type: "next", data: nextData };
    // Angular: <script id="ng-state" type="application/json">
    const ngData = tryParse($('script#ng-state[type="application/json"]').text().trim());
    if (ngData)
        return { type: "angular", data: ngData };
    // SvelteKit: <script data-sveltekit-fetched>. A page may contain several
    // of these; `.text()` on the whole set would concatenate them into
    // invalid JSON, so each script is parsed on its own and the first
    // parseable payload wins.
    for (const script of $("script[data-sveltekit-fetched]").toArray()) {
        const fetched = tryParse($(script).text().trim());
        if (fetched)
            return { type: "sveltekit", data: fetched };
    }
    // window.* assignments inside inline scripts.
    for (const { type, pattern, decode } of WINDOW_STATE_DETECTORS) {
        const captured = html.match(pattern)?.[1];
        if (!captured)
            continue;
        let json = captured;
        if (decode) {
            try {
                json = decode(captured);
            }
            catch {
                // Malformed encoding — this detector has nothing usable.
                continue;
            }
        }
        const data = tryParse(json);
        if (data)
            return { type, data };
    }
    return null;
}
|
|
108
|
+
/**
 * Decides whether the static HTML already carries enough readable content
 * to be used without executing JavaScript.
 *
 * Returns `false` when either:
 *  - the body text (after removing script, style, noscript, iframe, svg and
 *    img elements and collapsing whitespace) is shorter than 200 characters;
 *  - that text is shorter than 2,000 characters and matches one of the
 *    anti-bot / challenge / loading-screen phrases below.
 *
 * @param html - Full page HTML
 * @returns `true` when the page looks usable as-is
 */
export function hasEnoughContent(html) {
    const $ = load(html);
    // Drop elements that contribute no readable text.
    $("script, style, noscript, iframe, svg, img").remove();
    const visibleText = $("body").text().replace(/\s+/g, " ").trim();
    // Very little text: treated as an unrendered SPA shell or an empty page.
    if (visibleText.length < 200) {
        return false;
    }
    // Phrases that show up on block / challenge / loading pages.
    const blockedPagePatterns = [
        // Cloudflare
        /just a moment/i,
        /checking your browser/i,
        /ddos protection by cloudflare/i,
        /ray id:/i,
        // DDoS-Guard
        /ddos-guard/i,
        // Imperva / Incapsula
        /incapsula incident id/i,
        /powered by imperva/i,
        // DataDome
        /datadome/i,
        // CAPTCHA challenges
        /complete the security check/i,
        /prove you are human/i,
        /please complete the captcha/i,
        // Loading screens / generic SPA shells
        /please wait/i,
        /enable javascript/i,
        /you need to enable javascript/i,
        /javascript is required/i,
        /javascript is disabled/i,
        /please enable javascript/i,
        // Generic refusals
        /access denied/i,
        /403 forbidden/i,
        /bot detected/i,
    ];
    const haystack = visibleText.toLowerCase();
    let looksBlocked = false;
    for (const pattern of blockedPagePatterns) {
        if (pattern.test(haystack)) {
            looksBlocked = true;
            break;
        }
    }
    // A challenge page is short AND matches a known phrase; a long page that
    // merely mentions one of the phrases is still accepted.
    return !(looksBlocked && visibleText.length < 2_000);
}
|
|
162
|
+
//# sourceMappingURL=ssr.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ssr.js","sourceRoot":"","sources":["../../../src/scraper/extractors/ssr.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAG/B,8EAA8E;AAC9E,EAAE;AACF,gFAAgF;AAChF,2EAA2E;AAC3E,EAAE;AACF,kDAAkD;AAClD,6EAA6E;AAE7E,qEAAqE;AACrE,SAAS,QAAQ,CAAC,GAAW;IAC3B,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE;QAAE,OAAO,IAAI,CAAC;IAC9B,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;IAChC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,yEAAyE;IACzE,MAAM,OAAO,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAClD,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC;IACnC,IAAI,QAAQ;QAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAEtD,yEAAyE;IACzE,MAAM,KAAK,GAAG,CAAC,CAAC,0CAA0C,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC1E,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC/B,IAAI,MAAM;QAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;IAErD,0EAA0E;IAC1E,gFAAgF;IAChF,MAAM,gBAAgB,GAAG,CAAC,CAAC,gCAAgC,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC3E,MAAM,iBAAiB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IACrD,IAAI,iBAAiB;QAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,iBAAiB,EAAE,CAAC;IAE7E,wEAAwE;IACxE,2EAA2E;IAC3E,uEAAuE;IACvE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yDAAyD,CAAC,CAAC;IACxF,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACnB,MAAM,QAAQ,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,IAAI,QAAQ;YAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IACxD,CAAC;IAED,yEAAyE;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,wCAAwC,CAAC,CAAC;IACxE,IAAI,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAClD,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC;YACpC,IAAI,SAAS;gBAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;QAC1D,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;IACZ,CAAC;IAED,yEAAyE;IACzE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAC5B,qEAAqE,CACtE,CAAC;IACF,IAAI,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,
CAAC;QACrB,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5C,IAAI,UAAU;YAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IAC9D,CAAC;IAED,yEAAyE;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAC3B,kFAAkF,CACnF,CAAC;IACF,IAAI,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACpB,MAAM,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,SAAS;YAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAC3D,CAAC;IAED,wEAAwE;IACxE,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAC9B,8GAA8G,CAC/G,CAAC;IACF,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3C,IAAI,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAC1D,CAAC;IAED,wEAAwE;IACxE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CACzB,0FAA0F,CAC3F,CAAC;IACF,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAClB,MAAM,OAAO,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,IAAI,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IACrD,CAAC;IAED,yEAAyE;IACzE,MAAM,iBAAiB,GAAG,IAAI,CAAC,KAAK,CAClC,6EAA6E,CAC9E,CAAC;IACF,IAAI,iBAAiB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,QAAQ,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,UAAU;YAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACjE,CAAC;IAED,4EAA4E;IAC5E,sEAAsE;IACtE,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAC7B,yHAAyH,CAC1H,CAAC;IACF,IAAI,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACtB,MAAM,WAAW,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9C,IAAI,WAAW;YAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC;IACjE,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,kDAAkD;IAClD,CAAC,CAAC,2CAA2C,CAAC,CAAC,MAAM,EAAE,CAAC;IAExD,MAAM,QAAQ,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAE9D,qEAAqE;IACrE,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,KAAK,CAAC;IAExC,wEAAwE;IACxE,+EAA+E;IAC/E,MAAM,eAAe,GAAa;QAChC,0BAA0B;QAC1B,gBAAgB;QAChB,wBAAwB;QACxB,gCAAgC;QAChC,UAAU,EAA8B,yBAAyB;QAEjE,aAAa;QACb
,aAAa;QAEb,sBAAsB;QACtB,wBAAwB;QACxB,qBAAqB;QAErB,WAAW;QACX,WAAW;QAEX,kCAAkC;QAClC,8BAA8B;QAC9B,sBAAsB;QACtB,8BAA8B;QAE9B,uCAAuC;QACvC,cAAc;QACd,oBAAoB;QACpB,gCAAgC;QAChC,yBAAyB;QACzB,yBAAyB;QACzB,2BAA2B;QAE3B,WAAW;QACX,gBAAgB;QAChB,gBAAgB;QAChB,eAAe;KAChB,CAAC;IAEF,MAAM,SAAS,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAEjE,2DAA2D;IAC3D,IAAI,SAAS,IAAI,QAAQ,CAAC,MAAM,GAAG,KAAK;QAAE,OAAO,KAAK,CAAC;IAEvD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"to-markdown.d.ts","sourceRoot":"","sources":["../../../src/scraper/extractors/to-markdown.ts"],"names":[],"mappings":"AA6FA;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAwBnD"}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
// ─── Conversor HTML → Markdown ─────────────────────────────────────────────
|
|
2
|
+
// Usa Turndown (CJS) + plugin GFM para tabelas pipe nativas
|
|
3
|
+
import TurndownService from "turndown";
|
|
4
|
+
import { tables, strikethrough } from "turndown-plugin-gfm";
|
|
5
|
+
let _td = null;
|
|
6
|
+
function getTurndown() {
|
|
7
|
+
if (_td)
|
|
8
|
+
return _td;
|
|
9
|
+
_td = new TurndownService({
|
|
10
|
+
headingStyle: "atx", // # Título em vez de sublinhado
|
|
11
|
+
bulletListMarker: "-",
|
|
12
|
+
codeBlockStyle: "fenced", // ```code``` em vez de indentado
|
|
13
|
+
hr: "---",
|
|
14
|
+
strongDelimiter: "**",
|
|
15
|
+
emDelimiter: "_",
|
|
16
|
+
linkStyle: "inlined",
|
|
17
|
+
});
|
|
18
|
+
// ── Plugin GFM: tabelas pipe e strikethrough ────────────────────────────
|
|
19
|
+
// Converte <table> → | col1 | col2 | em vez de HTML bruto
|
|
20
|
+
_td.use(tables);
|
|
21
|
+
_td.use(strikethrough);
|
|
22
|
+
// ── Regras customizadas ──────────────────────────────────────────────────
|
|
23
|
+
// Remove completamente elementos que não geram conteúdo útil
|
|
24
|
+
// Nota: Turndown.remove() aceita apenas tag names, não CSS selectors
|
|
25
|
+
_td.remove([
|
|
26
|
+
"script",
|
|
27
|
+
"style",
|
|
28
|
+
"noscript",
|
|
29
|
+
"iframe",
|
|
30
|
+
"nav",
|
|
31
|
+
"footer",
|
|
32
|
+
"header",
|
|
33
|
+
"button",
|
|
34
|
+
"form",
|
|
35
|
+
]);
|
|
36
|
+
// figcaption dentro de figure: remove (evita legenda solta no Markdown)
|
|
37
|
+
_td.addRule("removeFigcaption", {
|
|
38
|
+
filter(node) {
|
|
39
|
+
return (node.nodeName === "FIGCAPTION" &&
|
|
40
|
+
node.parentNode?.nodeName === "FIGURE");
|
|
41
|
+
},
|
|
42
|
+
replacement: () => "",
|
|
43
|
+
});
|
|
44
|
+
// Classes de anúncio (.ad, .ads) — Turndown.remove() não aceita CSS selectors
|
|
45
|
+
_td.addRule("removeAds", {
|
|
46
|
+
filter(node) {
|
|
47
|
+
if (node.nodeType !== 1)
|
|
48
|
+
return false;
|
|
49
|
+
const cls = node.getAttribute("class") ?? "";
|
|
50
|
+
return /\bad\b|\bads\b/.test(cls);
|
|
51
|
+
},
|
|
52
|
+
replacement: () => "",
|
|
53
|
+
});
|
|
54
|
+
// Imagens: extrai alt text de forma limpa
|
|
55
|
+
_td.addRule("images", {
|
|
56
|
+
filter: "img",
|
|
57
|
+
replacement(_content, node) {
|
|
58
|
+
const img = node;
|
|
59
|
+
const alt = img.getAttribute("alt")?.trim() ?? "";
|
|
60
|
+
const src = img.getAttribute("src") ?? "";
|
|
61
|
+
if (!src)
|
|
62
|
+
return "";
|
|
63
|
+
return alt ? `` : ``;
|
|
64
|
+
},
|
|
65
|
+
});
|
|
66
|
+
// Links: remove links vazios ou com href #
|
|
67
|
+
_td.addRule("cleanLinks", {
|
|
68
|
+
filter(node) {
|
|
69
|
+
return (node.nodeName === "A" &&
|
|
70
|
+
(!node.getAttribute("href") ||
|
|
71
|
+
node.getAttribute("href") === "#" ||
|
|
72
|
+
node.getAttribute("href")?.startsWith("javascript:") === true));
|
|
73
|
+
},
|
|
74
|
+
replacement(content) {
|
|
75
|
+
return content; // Mantém apenas o texto, sem o link
|
|
76
|
+
},
|
|
77
|
+
});
|
|
78
|
+
return _td;
|
|
79
|
+
}
|
|
80
|
+
/**
 * Converts an HTML string into cleaned-up Markdown.
 *
 * After the Turndown conversion the output is tidied: whitespace-only lines
 * are dropped (truly empty lines are kept), runs of three or more newlines
 * collapse to two, trailing whitespace is stripped from every line, and the
 * whole result is trimmed.
 *
 * @param html - HTML source to convert.
 * @returns The Markdown text, or "" when the input is blank.
 */
export function htmlToMarkdown(html) {
    if (html.trim() === "")
        return "";
    const converted = getTurndown().turndown(html);

    // Drop lines made only of whitespace; genuinely empty lines survive.
    const keptLines = [];
    for (const line of converted.split("\n")) {
        if (line === "" || line.trim().length > 0)
            keptLines.push(line);
    }

    // Collapse runs of 3+ newlines down to exactly two.
    const collapsed = keptLines.join("\n").replace(/\n{3,}/g, "\n\n");

    // Strip trailing whitespace per line, then trim the document as a whole.
    const tidied = collapsed
        .split("\n")
        .map((line) => line.trimEnd())
        .join("\n");
    return tidied.trim();
}
|
|
103
|
+
//# sourceMappingURL=to-markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"to-markdown.js","sourceRoot":"","sources":["../../../src/scraper/extractors/to-markdown.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4DAA4D;AAE5D,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,MAAM,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAE5D,IAAI,GAAG,GAA2B,IAAI,CAAC;AAEvC,SAAS,WAAW;IAClB,IAAI,GAAG;QAAE,OAAO,GAAG,CAAC;IAEpB,GAAG,GAAG,IAAI,eAAe,CAAC;QACxB,YAAY,EAAE,KAAK,EAAQ,gCAAgC;QAC3D,gBAAgB,EAAE,GAAG;QACrB,cAAc,EAAE,QAAQ,EAAG,iCAAiC;QAC5D,EAAE,EAAE,KAAK;QACT,eAAe,EAAE,IAAI;QACrB,WAAW,EAAE,GAAG;QAChB,SAAS,EAAE,SAAS;KACrB,CAAC,CAAC;IAEH,2EAA2E;IAC3E,0DAA0D;IAC1D,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IAEvB,4EAA4E;IAE5E,6DAA6D;IAC7D,qEAAqE;IACrE,GAAG,CAAC,MAAM,CAAC;QACT,QAAQ;QACR,OAAO;QACP,UAAU;QACV,QAAQ;QACR,KAAK;QACL,QAAQ;QACR,QAAQ;QACR,QAAQ;QACR,MAAM;KACP,CAAC,CAAC;IAEH,wEAAwE;IACxE,GAAG,CAAC,OAAO,CAAC,kBAAkB,EAAE;QAC9B,MAAM,CAAC,IAAI;YACT,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,YAAY;gBAC9B,IAAI,CAAC,UAAU,EAAE,QAAQ,KAAK,QAAQ,CACvC,CAAC;QACJ,CAAC;QACD,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,8EAA8E;IAC9E,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE;QACvB,MAAM,CAAC,IAAI;YACT,IAAI,IAAI,CAAC,QAAQ,KAAK,CAAC;gBAAE,OAAO,KAAK,CAAC;YACtC,MAAM,GAAG,GAAI,IAAgB,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC1D,OAAO,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACpC,CAAC;QACD,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,0CAA0C;IAC1C,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE;QACpB,MAAM,EAAE,KAAK;QACb,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,GAAG,GAAG,IAAwB,CAAC;YACrC,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAClD,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,IAAI,CAAC,GAAG;gBAAE,OAAO,EAAE,CAAC;YACpB,OAAO,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,YAAY,GAAG,GAAG,CAAC;QACxD,CAAC;KACF,CAAC,CAAC;IAEH,2CAA2C;IAC3C,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE;QACxB,MAAM,CAAC,IAAI;YACT,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,GAAG;gBACrB,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC;oBACzB,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,KAAK,GAAG;oBACjC,
IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE,UAAU,CAAC,aAAa,CAAC,KAAK,IAAI,CAAC,CACjE,CAAC;QACJ,CAAC;QACD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC,CAAC,oCAAoC;QACtD,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE5B,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,IAAI,QAAQ,GAAG,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAEjC,0EAA0E;IAE1E,6CAA6C;IAC7C,QAAQ,GAAG,QAAQ;SAChB,KAAK,CAAC,IAAI,CAAC;SACX,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,KAAK,EAAE,CAAC;SACvD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,+CAA+C;IAC/C,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,0BAA0B;IAC1B,QAAQ,GAAG,QAAQ;SAChB,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC;SACvB,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;AACzB,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { FirecrawlConfig, ScrapeOptions, ScrapeResult } from "./types.js";
|
|
2
|
+
export declare class Firecrawl {
|
|
3
|
+
private readonly fullConfig;
|
|
4
|
+
private readonly tier1;
|
|
5
|
+
private readonly tier2;
|
|
6
|
+
private readonly tier3;
|
|
7
|
+
private readonly config;
|
|
8
|
+
constructor(fullConfig?: FirecrawlConfig);
|
|
9
|
+
scrape(url: string, options?: ScrapeOptions): Promise<ScrapeResult>;
|
|
10
|
+
/**
|
|
11
|
+
* Scrapes multiple URLs in parallel with limited concurrency.
|
|
12
|
+
* Errors on individual URLs do not bring down the whole batch.
|
|
13
|
+
*
|
|
14
|
+
* @param urls - List of URLs to scrape
|
|
15
|
+
* @param options - Options applied to every URL
|
|
16
|
+
* @param concurrency - Maximum number of simultaneous scrapes. Default: 3
|
|
17
|
+
*/
|
|
18
|
+
scrapeMany(urls: string[], options?: ScrapeOptions, concurrency?: number): Promise<ScrapeResult[]>;
|
|
19
|
+
/**
|
|
20
|
+
* Closes the Playwright browser (Tier 3).
|
|
21
|
+
* Always call this when finished to avoid orphaned Chromium processes.
|
|
22
|
+
*/
|
|
23
|
+
close(): Promise<void>;
|
|
24
|
+
private log;
|
|
25
|
+
}
|
|
26
|
+
export type { ScrapeOptions, ScrapeResult, ScrapeTier, ContentFormat, SSRData, InterceptedAPI, FirecrawlConfig, TierRawResult, } from "./types.js";
|
|
27
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACV,eAAe,EACf,aAAa,EACb,YAAY,EACb,MAAM,YAAY,CAAC;AAepB,qBAAa,SAAS;IAYR,OAAO,CAAC,QAAQ,CAAC,UAAU;IAXvC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAY;IAClC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAe;IACrC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAe;IACrC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAMrB;gBAE2B,UAAU,GAAE,eAAoB;IAevD,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;IAqG7E;;;;;;;OAOG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,aAAkB,EAC3B,WAAW,SAAI,GACd,OAAO,CAAC,YAAY,EAAE,CAAC;IAkC1B;;;OAGG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAM5B,OAAO,CAAC,GAAG;CAKZ;AAGD,YAAY,EACV,aAAa,EACb,YAAY,EACZ,UAAU,EACV,aAAa,EACb,OAAO,EACP,cAAc,EACd,eAAe,EACf,aAAa,GACd,MAAM,YAAY,CAAC"}
|