auspex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/LICENSE +21 -0
  2. package/dist/agent/actions.d.ts +5 -0
  3. package/dist/agent/actions.d.ts.map +1 -0
  4. package/dist/agent/actions.js +26 -0
  5. package/dist/agent/actions.js.map +1 -0
  6. package/dist/agent/agent.d.ts +12 -0
  7. package/dist/agent/agent.d.ts.map +1 -0
  8. package/dist/agent/agent.js +147 -0
  9. package/dist/agent/agent.js.map +1 -0
  10. package/dist/agent/loop.d.ts +6 -0
  11. package/dist/agent/loop.d.ts.map +1 -0
  12. package/dist/agent/loop.js +165 -0
  13. package/dist/agent/loop.js.map +1 -0
  14. package/dist/agent/report.d.ts +3 -0
  15. package/dist/agent/report.d.ts.map +1 -0
  16. package/dist/agent/report.js +90 -0
  17. package/dist/agent/report.js.map +1 -0
  18. package/dist/browser/executor.d.ts +5 -0
  19. package/dist/browser/executor.d.ts.map +1 -0
  20. package/dist/browser/executor.js +33 -0
  21. package/dist/browser/executor.js.map +1 -0
  22. package/dist/browser/snapshot.d.ts +6 -0
  23. package/dist/browser/snapshot.d.ts.map +1 -0
  24. package/dist/browser/snapshot.js +145 -0
  25. package/dist/browser/snapshot.js.map +1 -0
  26. package/dist/config/defaults.d.ts +10 -0
  27. package/dist/config/defaults.d.ts.map +1 -0
  28. package/dist/config/defaults.js +10 -0
  29. package/dist/config/defaults.js.map +1 -0
  30. package/dist/config/schema.d.ts +59 -0
  31. package/dist/config/schema.d.ts.map +1 -0
  32. package/dist/config/schema.js +23 -0
  33. package/dist/config/schema.js.map +1 -0
  34. package/dist/index.d.ts +7 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +8 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/llm/client.d.ts +23 -0
  39. package/dist/llm/client.d.ts.map +1 -0
  40. package/dist/llm/client.js +51 -0
  41. package/dist/llm/client.js.map +1 -0
  42. package/dist/llm/prompt.d.ts +3 -0
  43. package/dist/llm/prompt.d.ts.map +1 -0
  44. package/dist/llm/prompt.js +36 -0
  45. package/dist/llm/prompt.js.map +1 -0
  46. package/dist/scraper/extractors/content.d.ts +22 -0
  47. package/dist/scraper/extractors/content.d.ts.map +1 -0
  48. package/dist/scraper/extractors/content.js +237 -0
  49. package/dist/scraper/extractors/content.js.map +1 -0
  50. package/dist/scraper/extractors/ssr.d.ts +17 -0
  51. package/dist/scraper/extractors/ssr.d.ts.map +1 -0
  52. package/dist/scraper/extractors/ssr.js +162 -0
  53. package/dist/scraper/extractors/ssr.js.map +1 -0
  54. package/dist/scraper/extractors/to-markdown.d.ts +5 -0
  55. package/dist/scraper/extractors/to-markdown.d.ts.map +1 -0
  56. package/dist/scraper/extractors/to-markdown.js +103 -0
  57. package/dist/scraper/extractors/to-markdown.js.map +1 -0
  58. package/dist/scraper/index.d.ts +27 -0
  59. package/dist/scraper/index.d.ts.map +1 -0
  60. package/dist/scraper/index.js +178 -0
  61. package/dist/scraper/index.js.map +1 -0
  62. package/dist/scraper/tiers/tier1-http.d.ts +5 -0
  63. package/dist/scraper/tiers/tier1-http.d.ts.map +1 -0
  64. package/dist/scraper/tiers/tier1-http.js +120 -0
  65. package/dist/scraper/tiers/tier1-http.js.map +1 -0
  66. package/dist/scraper/tiers/tier2-stealth.d.ts +5 -0
  67. package/dist/scraper/tiers/tier2-stealth.d.ts.map +1 -0
  68. package/dist/scraper/tiers/tier2-stealth.js +106 -0
  69. package/dist/scraper/tiers/tier2-stealth.js.map +1 -0
  70. package/dist/scraper/tiers/tier3-browser.d.ts +10 -0
  71. package/dist/scraper/tiers/tier3-browser.d.ts.map +1 -0
  72. package/dist/scraper/tiers/tier3-browser.js +504 -0
  73. package/dist/scraper/tiers/tier3-browser.js.map +1 -0
  74. package/dist/scraper/types.d.ts +130 -0
  75. package/dist/scraper/types.d.ts.map +1 -0
  76. package/dist/scraper/types.js +3 -0
  77. package/dist/scraper/types.js.map +1 -0
  78. package/dist/security/action-validator.d.ts +83 -0
  79. package/dist/security/action-validator.d.ts.map +1 -0
  80. package/dist/security/action-validator.js +36 -0
  81. package/dist/security/action-validator.js.map +1 -0
  82. package/dist/security/url-validator.d.ts +9 -0
  83. package/dist/security/url-validator.d.ts.map +1 -0
  84. package/dist/security/url-validator.js +69 -0
  85. package/dist/security/url-validator.js.map +1 -0
  86. package/dist/types.d.ts +95 -0
  87. package/dist/types.d.ts.map +1 -0
  88. package/dist/types.js +2 -0
  89. package/dist/types.js.map +1 -0
  90. package/package.json +54 -0
  91. package/readme.md +760 -0
@@ -0,0 +1,237 @@
1
+ import { load } from "cheerio";
2
+ import { JSDOM } from "jsdom";
3
+ import { Readability } from "@mozilla/readability";
4
// ─── "Noise" selectors to strip (Cheerio fallback) ───────────────────────────
/**
 * CSS selectors for page chrome that carries no article content.
 * Every match is removed from the document before the Cheerio fallback
 * extracts text. Order is preserved from the original list.
 */
const NOISE_SELECTORS = [
    // Structural
    "script", "style", "noscript", "iframe", "svg",
    // Navigation
    "nav", "header", "footer",
    '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
    ".nav", ".navbar", ".navigation", ".menu",
    ".header", ".footer", ".site-header", ".site-footer",
    // Sidebars
    "aside", ".sidebar", ".side-bar", "#sidebar", '[role="complementary"]',
    // Ads and promotions
    ".ad", ".ads", ".adsbygoogle", ".advertisement", ".promo", ".banner",
    '[id*="google_ads"]', '[class*="sponsored"]',
    // Legal banners
    ".cookie-banner", ".cookie-notice", ".cookie-consent", ".gdpr",
    // Overlays
    ".popup", ".modal", ".overlay", ".backdrop",
    // Social and miscellaneous
    ".social-share", ".share-buttons", ".related-posts",
    ".comments", "#comments", ".comment-section",
    ".newsletter", ".subscribe",
];
62
// ─── Main-content selectors (Cheerio fallback) ───────────────────────────────
/**
 * Candidate selectors for the primary content container, in priority order:
 * semantic elements first, then common ids, then common class names.
 */
const MAIN_CONTENT_SELECTORS = [
    // Semantic elements / ARIA landmark
    "main", "article", '[role="main"]',
    // Conventional ids
    "#main-content", "#content", "#main",
    // Conventional class names
    ".main-content", ".content",
    ".post-content", ".article-content", ".entry-content", ".page-content",
    ".blog-post", ".blog-content",
    ".post-body", ".article-body",
];
81
+ // ─── Extração de links ─────────────────────────────────────────────────────
82
+ function extractLinks($, baseUrl) {
83
+ const links = [];
84
+ const seen = new Set();
85
+ $("a[href]").each((_, el) => {
86
+ const href = $(el).attr("href");
87
+ if (!href)
88
+ return;
89
+ if (href.startsWith("#"))
90
+ return;
91
+ if (href.startsWith("javascript:"))
92
+ return;
93
+ if (href.startsWith("mailto:"))
94
+ return;
95
+ if (href.startsWith("tel:"))
96
+ return;
97
+ // Tenta resolver URL relativa
98
+ let resolved = href;
99
+ if (baseUrl && (href.startsWith("/") || href.startsWith("."))) {
100
+ try {
101
+ resolved = new URL(href, baseUrl).href;
102
+ }
103
+ catch {
104
+ return;
105
+ }
106
+ }
107
+ if (!seen.has(resolved)) {
108
+ seen.add(resolved);
109
+ links.push(resolved);
110
+ }
111
+ });
112
+ return links;
113
+ }
114
+ // ─── Extração de metadados ─────────────────────────────────────────────────
115
+ function extractMeta($) {
116
+ const title = $("title").first().text().trim() ||
117
+ $('meta[property="og:title"]').attr("content")?.trim() ||
118
+ $("h1").first().text().trim() ||
119
+ "";
120
+ const description = $('meta[name="description"]').attr("content")?.trim() ||
121
+ $('meta[property="og:description"]').attr("content")?.trim() ||
122
+ $('meta[name="twitter:description"]').attr("content")?.trim() ||
123
+ "";
124
+ return { title, description };
125
+ }
126
+ // ─── Mozilla Readability (caminho principal) ───────────────────────────────────
127
+ //
128
+ // Mesmo algoritmo que o Firefox usa no Reader Mode e que o Firecrawl real usa.
129
+ // Produz conteúdo semanticamente limpo, muito superior a heurísticas manuais.
130
+ function extractWithReadability(html, baseUrl) {
131
+ try {
132
+ const dom = new JSDOM(html, {
133
+ // URL necessária para Readability resolver links relativos corretamente
134
+ url: baseUrl ?? "https://example.com",
135
+ });
136
+ const reader = new Readability(dom.window.document, {
137
+ // Aceita conteúdo com no mínimo 50 caracteres (padrão é 500)
138
+ charThreshold: 50,
139
+ });
140
+ const article = reader.parse();
141
+ // Rejeita se não produziu conteúdo suficiente
142
+ if (!article ||
143
+ !article.content ||
144
+ (article.textContent?.trim()?.length ?? 0) < 100) {
145
+ return null;
146
+ }
147
+ return {
148
+ html: article.content,
149
+ text: (article.textContent ?? "").replace(/\s+/g, " ").trim(),
150
+ title: article.title ?? "",
151
+ };
152
+ }
153
+ catch {
154
+ // JSDOM ou Readability falharam — aciona fallback Cheerio
155
+ return null;
156
+ }
157
+ }
158
+ // ─── Cheerio (fallback) ───────────────────────────────────────────────────────
159
+ function extractWithCheerio($, onlyMain) {
160
+ // Remove ruído
161
+ NOISE_SELECTORS.forEach((selector) => {
162
+ try {
163
+ $(selector).remove();
164
+ }
165
+ catch {
166
+ // Seletor inválido no contexto — ignora
167
+ }
168
+ });
169
+ // Inicia com body como padrão seguro
170
+ let contentEl = $("body");
171
+ if (onlyMain) {
172
+ // Tenta encontrar área de conteúdo principal
173
+ for (const selector of MAIN_CONTENT_SELECTORS) {
174
+ const el = $(selector);
175
+ if (el.length > 0) {
176
+ const text = el.first().text().replace(/\s+/g, " ").trim();
177
+ if (text.length > 150) {
178
+ contentEl = el.first();
179
+ break;
180
+ }
181
+ }
182
+ }
183
+ }
184
+ // Limpa atributos de rastreamento e estilos inline
185
+ contentEl.find("[style]").removeAttr("style");
186
+ contentEl.find("[onclick]").removeAttr("onclick");
187
+ contentEl.find("[class]").each((_, el) => {
188
+ $(el).removeAttr("class");
189
+ });
190
+ const contentHtml = contentEl.html() ?? "";
191
+ const text = contentEl.text().replace(/\s+/g, " ").trim();
192
+ return { html: contentHtml, text };
193
+ }
194
+ // ─── Extração principal ────────────────────────────────────────────────────────
195
+ /**
196
+ * Extrai o conteúdo significativo de um HTML.
197
+ *
198
+ * Estratégia em dois níveis:
199
+ * 1. Mozilla Readability — mesmo algoritmo do Firefox Reader Mode.
200
+ * Produz conteúdo muito mais limpo e semântico que heurísticas manuais.
201
+ * 2. Cheerio + seletores heurísticos — fallback quando Readability falha
202
+ * (ex: páginas muito simples ou layouts não-convencionais).
203
+ *
204
+ * @param html - HTML completo da página
205
+ * @param onlyMain - Tentar extrair apenas o conteúdo principal
206
+ * @param baseUrl - URL base para resolver links e contextualizar o Readability
207
+ */
208
+ export function extractContent(html, onlyMain = true, baseUrl) {
209
+ const $ = load(html);
210
+ // Extrai metadados e links ANTES de remover elementos de navegação
211
+ const { title, description } = extractMeta($);
212
+ const links = extractLinks($, baseUrl);
213
+ // ── Caminho 1: Mozilla Readability ────────────────────────────────────────
214
+ if (onlyMain) {
215
+ const readable = extractWithReadability(html, baseUrl);
216
+ if (readable) {
217
+ return {
218
+ html: readable.html,
219
+ text: readable.text,
220
+ // Título do Readability é mais preciso (remove sufixos de site)
221
+ title: readable.title || title,
222
+ description,
223
+ links,
224
+ };
225
+ }
226
+ }
227
+ // ── Caminho 2: Cheerio (fallback) ─────────────────────────────────────────
228
+ const cheerio = extractWithCheerio($, onlyMain);
229
+ return {
230
+ html: cheerio.html,
231
+ text: cheerio.text,
232
+ title,
233
+ description,
234
+ links,
235
+ };
236
+ }
237
+ //# sourceMappingURL=content.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content.js","sourceRoot":"","sources":["../../../src/scraper/extractors/content.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAmB,MAAM,SAAS,CAAC;AAChD,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,iFAAiF;AAEjF,MAAM,eAAe,GAAG;IACtB,aAAa;IACb,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,YAAY;IACZ,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,qBAAqB;IACrB,iBAAiB;IACjB,sBAAsB;IACtB,MAAM;IACN,SAAS;IACT,aAAa;IACb,OAAO;IACP,SAAS;IACT,SAAS;IACT,cAAc;IACd,cAAc;IACd,UAAU;IACV,OAAO;IACP,UAAU;IACV,WAAW;IACX,UAAU;IACV,wBAAwB;IACxB,uBAAuB;IACvB,KAAK;IACL,MAAM;IACN,cAAc;IACd,gBAAgB;IAChB,QAAQ;IACR,SAAS;IACT,oBAAoB;IACpB,sBAAsB;IACtB,iBAAiB;IACjB,gBAAgB;IAChB,gBAAgB;IAChB,iBAAiB;IACjB,OAAO;IACP,WAAW;IACX,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,WAAW;IACX,gBAAgB;IAChB,eAAe;IACf,gBAAgB;IAChB,gBAAgB;IAChB,WAAW;IACX,WAAW;IACX,kBAAkB;IAClB,aAAa;IACb,YAAY;CACJ,CAAC;AAEX,iFAAiF;AAEjF,MAAM,sBAAsB,GAAG;IAC7B,MAAM;IACN,SAAS;IACT,eAAe;IACf,eAAe;IACf,UAAU;IACV,OAAO;IACP,eAAe;IACf,UAAU;IACV,eAAe;IACf,kBAAkB;IAClB,gBAAgB;IAChB,eAAe;IACf,YAAY;IACZ,eAAe;IACf,YAAY;IACZ,eAAe;CACP,CAAC;AAYX,8EAA8E;AAE9E,SAAS,YAAY,CAAC,CAAa,EAAE,OAAgB;IACnD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC1B,MAAM,IAAI,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO;QAClB,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO;QACjC,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;YAAE,OAAO;QAC3C,IAAI,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO;QACvC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO;QAEpC,8BAA8B;QAC9B,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,OAAO,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC9D,IAAI,CAAC;gBACH,QAAQ,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YACzC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO;YACT,CAAC;QACH,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACxB,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACnB,
KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACvB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC;AAED,8EAA8E;AAE9E,SAAS,WAAW,CAAC,CAAa;IAChC,MAAM,KAAK,GACT,CAAC,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;QAChC,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QACtD,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;QAC7B,EAAE,CAAC;IAEL,MAAM,WAAW,GACf,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QACrD,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QAC5D,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE;QAC7D,EAAE,CAAC;IAEL,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,CAAC;AAChC,CAAC;AAED,kFAAkF;AAClF,EAAE;AACF,+EAA+E;AAC/E,8EAA8E;AAE9E,SAAS,sBAAsB,CAC7B,IAAY,EACZ,OAAgB;IAEhB,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;YAC1B,wEAAwE;YACxE,GAAG,EAAE,OAAO,IAAI,qBAAqB;SACtC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,6DAA6D;YAC7D,aAAa,EAAE,EAAE;SAClB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,8CAA8C;QAC9C,IACE,CAAC,OAAO;YACR,CAAC,OAAO,CAAC,OAAO;YAChB,CAAC,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,GAAG,EAChD,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,IAAI,EAAE,OAAO,CAAC,OAAO;YACrB,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE;YAC7D,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;SAC3B,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,0DAA0D;QAC1D,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF,SAAS,kBAAkB,CACzB,CAAa,EACb,QAAiB;IAEjB,eAAe;IACf,eAAe,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,EAAE;QACnC,IAAI,CAAC;YACH,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC;QACvB,CAAC;QAAC,MAAM,CAAC;YACP,wCAAwC;QAC1C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,qCAAqC;IACrC,IAAI,SAAS,GAAyB,CAAC,CAAC,MAAM,CAAC,CAAC;IAEhD,IAAI,QAAQ,EAAE,CAAC;QACb,6CAA6C;QAC7C,KAAK,MAAM,QAAQ,IAAI,sBAAsB,EAAE,CAAC;YAC9C,MAAM,EAAE,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;YACvB,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,EAAE,CAAC,KAAK,EAAE,CAAC,IAAI,
EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3D,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;oBACtB,SAAS,GAAG,EAAE,CAAC,KAAK,EAAE,CAAC;oBACvB,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAC9C,SAAS,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;IAClD,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACvC,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAC5B,CAAC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;IAC3C,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAE1D,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;AACrC,CAAC;AAED,kFAAkF;AAElF;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,cAAc,CAC5B,IAAY,EACZ,QAAQ,GAAG,IAAI,EACf,OAAgB;IAEhB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,mEAAmE;IACnE,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;IAEvC,6EAA6E;IAC7E,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,sBAAsB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACvD,IAAI,QAAQ,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,gEAAgE;gBAChE,KAAK,EAAE,QAAQ,CAAC,KAAK,IAAI,KAAK;gBAC9B,WAAW;gBACX,KAAK;aACN,CAAC;QACJ,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,MAAM,OAAO,GAAG,kBAAkB,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IAChD,OAAO;QACL,IAAI,EAAE,OAAO,CAAC,IAAI;QAClB,IAAI,EAAE,OAAO,CAAC,IAAI;QAClB,KAAK;QACL,WAAW;QACX,KAAK;KACN,CAAC;AACJ,CAAC"}
@@ -0,0 +1,17 @@
1
+ import type { SSRData } from "../types.js";
2
+ /**
3
+ * Tries to extract JSON data that SSR frameworks embed in the initial HTML.
4
+ * Many Next.js/Nuxt/SvelteKit sites do not need a browser —
5
+ * the data is already in the HTML and can be extracted with Cheerio!
6
+ */
7
+ export declare function extractSSRData(html: string): SSRData | null;
8
+ /**
9
+ * Checks whether the page has enough content without JavaScript.
10
+ *
11
+ * Returns `false` when:
12
+ * - The visible text is very short (< 200 chars) → SPA has not rendered yet
13
+ * - Anti-bot / challenge-page patterns are detected (Cloudflare, DDoS-Guard, etc.)
14
+ * - Loading screens are detected ("enable JavaScript" text, spinners, etc.)
15
+ */
16
+ export declare function hasEnoughContent(html: string): boolean;
17
+ //# sourceMappingURL=ssr.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ssr.d.ts","sourceRoot":"","sources":["../../../src/scraper/extractors/ssr.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAoB3C;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CA8F3D;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAwDtD"}
@@ -0,0 +1,162 @@
1
+ import { load } from "cheerio";
2
+ // ─── Detectores de dados SSR ───────────────────────────────────────────────
3
+ //
4
+ // Frameworks modernos embutem dados no HTML inicial para hidratação no cliente.
5
+ // Extrair esses dados evita a necessidade de browser em ~60-70% dos sites.
6
+ //
7
+ // Ordem: do mais específico para o mais genérico.
8
+ // ──────────────────────────────────────────────────────────────────────────
9
/**
 * Safely parses a JSON string.
 *
 * @param raw - Candidate JSON text; surrounding whitespace is ignored
 * @returns The parsed value, or `null` when `raw` is missing, blank, or not
 *   valid JSON
 */
function tryParse(raw) {
    const candidate = raw?.trim();
    if (!candidate)
        return null;
    try {
        return JSON.parse(candidate);
    }
    catch {
        return null;
    }
}
20
+ /**
21
+ * Tenta extrair dados JSON embutidos por frameworks SSR no HTML inicial.
22
+ * Muitos sites Next.js/Nuxt/SvelteKit não precisam de browser —
23
+ * os dados já estão no HTML e podem ser extraídos com Cheerio!
24
+ */
25
+ export function extractSSRData(html) {
26
+ const $ = load(html);
27
+ // ── Next.js: <script id="__NEXT_DATA__" type="application/json"> ──────
28
+ const nextRaw = $("#__NEXT_DATA__").text().trim();
29
+ const nextData = tryParse(nextRaw);
30
+ if (nextData)
31
+ return { type: "next", data: nextData };
32
+ // ── Angular Universal: <script id="ng-state" type="application/json"> ─
33
+ const ngRaw = $('script#ng-state[type="application/json"]').text().trim();
34
+ const ngData = tryParse(ngRaw);
35
+ if (ngData)
36
+ return { type: "angular", data: ngData };
37
+ // ── SvelteKit: <script type="application/json" data-sveltekit-fetched> ─
38
+ // SvelteKit 2+ serializa dados de `load()` em tags script com atributo especial
39
+ const svelteFetchedRaw = $('script[data-sveltekit-fetched]').text().trim();
40
+ const svelteFetchedData = tryParse(svelteFetchedRaw);
41
+ if (svelteFetchedData)
42
+ return { type: "sveltekit", data: svelteFetchedData };
43
+ // ── Nuxt 2/3: window.__NUXT__ = ... ──────────────────────────────────
44
+ // Nuxt pode usar JSON ou devalue (formato não-JSON proprietário do Nuxt 3)
45
+ // Tentamos capturar JSON puro; devalue é ignorado (precisa de browser)
46
+ const nuxtMatch = html.match(/window\.__NUXT__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
47
+ if (nuxtMatch?.[1]) {
48
+ const nuxtData = tryParse(nuxtMatch[1]);
49
+ if (nuxtData)
50
+ return { type: "nuxt", data: nuxtData };
51
+ }
52
+ // ── Nuxt 3 alternativo: useNuxtApp / nuxtState ────────────────────────
53
+ const nuxt3Match = html.match(/window\.__nuxt_state__\s*=\s*'([^']+)'/);
54
+ if (nuxt3Match?.[1]) {
55
+ try {
56
+ const decoded = decodeURIComponent(nuxt3Match[1]);
57
+ const nuxt3Data = tryParse(decoded);
58
+ if (nuxt3Data)
59
+ return { type: "nuxt", data: nuxt3Data };
60
+ }
61
+ catch { }
62
+ }
63
+ // ── Gatsby: window.___gatsby ou window.___GATSBY ──────────────────────
64
+ const gatsbyMatch = html.match(/window\.___(?:gatsby|GATSBY)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
65
+ if (gatsbyMatch?.[1]) {
66
+ const gatsbyData = tryParse(gatsbyMatch[1]);
67
+ if (gatsbyData)
68
+ return { type: "gatsby", data: gatsbyData };
69
+ }
70
+ // ── Remix / React Router v7: window.__remixContext ────────────────────
71
+ const remixMatch = html.match(/window\.__remix(?:Context|RouterManifest)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
72
+ if (remixMatch?.[1]) {
73
+ const remixData = tryParse(remixMatch[1]);
74
+ if (remixData)
75
+ return { type: "remix", data: remixData };
76
+ }
77
+ // ── TanStack Router / Start: window.__TSR_DEHYDRATED__ ───────────────
78
+ const tanstackMatch = html.match(/window\.__(?:TSR_DEHYDRATED|TANSTACK_ROUTER_CONTEXT|TRT_DEHYDRATED)__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
79
+ if (tanstackMatch?.[1]) {
80
+ const tsrData = tryParse(tanstackMatch[1]);
81
+ if (tsrData)
82
+ return { type: "tanstack", data: tsrData };
83
+ }
84
+ // ── Vue SSR: window.__VUE_SSR_CONTEXT__ / window.__pinia ─────────────
85
+ const vueMatch = html.match(/window\.__(?:VUE_SSR_CONTEXT__|VUE_STORE__|pinia)\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
86
+ if (vueMatch?.[1]) {
87
+ const vueData = tryParse(vueMatch[1]);
88
+ if (vueData)
89
+ return { type: "vue", data: vueData };
90
+ }
91
+ // ── SvelteKit legado: window.__SVELTEKIT__ ────────────────────────────
92
+ const svelteLegacyMatch = html.match(/window\.__(?:SVELTEKIT|sveltekit)__?\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
93
+ if (svelteLegacyMatch?.[1]) {
94
+ const svelteData = tryParse(svelteLegacyMatch[1]);
95
+ if (svelteData)
96
+ return { type: "sveltekit", data: svelteData };
97
+ }
98
+ // ── Genérico: window.__INITIAL_STATE__ / __APP_STATE__ / __REDUX_STATE__ ─
99
+ // Cobre Redux, MobX, Zustand e qualquer store serializado manualmente
100
+ const genericMatch = html.match(/window\.__(?:INITIAL_STATE|APP_STATE|REDUX_STATE|STORE_STATE|DATA|STATE|PROPS)__\s*=\s*(\{[\s\S]*?\})\s*;?\s*<\/script>/);
101
+ if (genericMatch?.[1]) {
102
+ const genericData = tryParse(genericMatch[1]);
103
+ if (genericData)
104
+ return { type: "generic", data: genericData };
105
+ }
106
+ return null;
107
+ }
108
+ /**
109
+ * Verifica se a página tem conteúdo suficiente sem JavaScript.
110
+ *
111
+ * Retorna `false` quando:
112
+ * - O texto visível é muito curto (< 200 chars) → SPA ainda não renderizou
113
+ * - Detecta padrões de anti-bot / challenge pages (Cloudflare, DDoS-Guard, etc.)
114
+ * - Detecta loading screens (texto de JS habilitado, spinners, etc.)
115
+ */
116
+ export function hasEnoughContent(html) {
117
+ const $ = load(html);
118
+ // Remove elementos que não geram conteúdo legível
119
+ $("script, style, noscript, iframe, svg, img").remove();
120
+ const bodyText = $("body").text().replace(/\s+/g, " ").trim();
121
+ // Heurística básica: texto muito curto = SPA sem SSR ou página vazia
122
+ if (bodyText.length < 200)
123
+ return false;
124
+ // ── Padrões de anti-bot / challenge pages ────────────────────────────
125
+ // Cada serviço tem uma frase característica que aparece quando bloqueia o bot.
126
+ const antiBotPatterns = [
127
+ // Cloudflare (mais comum)
128
+ /just a moment/i,
129
+ /checking your browser/i,
130
+ /ddos protection by cloudflare/i,
131
+ /ray id:/i, // ID único do Cloudflare
132
+ // DDoS-Guard
133
+ /ddos-guard/i,
134
+ // Imperva / Incapsula
135
+ /incapsula incident id/i,
136
+ /powered by imperva/i,
137
+ // DataDome
138
+ /datadome/i,
139
+ // hCaptcha / reCAPTCHA challenges
140
+ /complete the security check/i,
141
+ /prove you are human/i,
142
+ /please complete the captcha/i,
143
+ // Loading screens / SPA shell genérica
144
+ /please wait/i,
145
+ /enable javascript/i,
146
+ /you need to enable javascript/i,
147
+ /javascript is required/i,
148
+ /javascript is disabled/i,
149
+ /please enable javascript/i,
150
+ // Genérico
151
+ /access denied/i,
152
+ /403 forbidden/i,
153
+ /bot detected/i,
154
+ ];
155
+ const lowerText = bodyText.toLowerCase();
156
+ const isAntiBot = antiBotPatterns.some((p) => p.test(lowerText));
157
+ // Challenge pages tem pouco texto e padrões identificáveis
158
+ if (isAntiBot && bodyText.length < 2_000)
159
+ return false;
160
+ return true;
161
+ }
162
+ //# sourceMappingURL=ssr.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ssr.js","sourceRoot":"","sources":["../../../src/scraper/extractors/ssr.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAG/B,8EAA8E;AAC9E,EAAE;AACF,gFAAgF;AAChF,2EAA2E;AAC3E,EAAE;AACF,kDAAkD;AAClD,6EAA6E;AAE7E,qEAAqE;AACrE,SAAS,QAAQ,CAAC,GAAW;IAC3B,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE;QAAE,OAAO,IAAI,CAAC;IAC9B,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;IAChC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,yEAAyE;IACzE,MAAM,OAAO,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAClD,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC;IACnC,IAAI,QAAQ;QAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAEtD,yEAAyE;IACzE,MAAM,KAAK,GAAG,CAAC,CAAC,0CAA0C,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC1E,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC/B,IAAI,MAAM;QAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;IAErD,0EAA0E;IAC1E,gFAAgF;IAChF,MAAM,gBAAgB,GAAG,CAAC,CAAC,gCAAgC,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC3E,MAAM,iBAAiB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IACrD,IAAI,iBAAiB;QAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,iBAAiB,EAAE,CAAC;IAE7E,wEAAwE;IACxE,2EAA2E;IAC3E,uEAAuE;IACvE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yDAAyD,CAAC,CAAC;IACxF,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACnB,MAAM,QAAQ,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,IAAI,QAAQ;YAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IACxD,CAAC;IAED,yEAAyE;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,wCAAwC,CAAC,CAAC;IACxE,IAAI,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAClD,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC;YACpC,IAAI,SAAS;gBAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;QAC1D,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;IACZ,CAAC;IAED,yEAAyE;IACzE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAC5B,qEAAqE,CACtE,CAAC;IACF,IAAI,WAAW,EAAE,CAAC,CAAC,CAAC,EAA
E,CAAC;QACrB,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5C,IAAI,UAAU;YAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IAC9D,CAAC;IAED,yEAAyE;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAC3B,kFAAkF,CACnF,CAAC;IACF,IAAI,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACpB,MAAM,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,SAAS;YAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAC3D,CAAC;IAED,wEAAwE;IACxE,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAC9B,8GAA8G,CAC/G,CAAC;IACF,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3C,IAAI,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAC1D,CAAC;IAED,wEAAwE;IACxE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CACzB,0FAA0F,CAC3F,CAAC;IACF,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAClB,MAAM,OAAO,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,IAAI,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IACrD,CAAC;IAED,yEAAyE;IACzE,MAAM,iBAAiB,GAAG,IAAI,CAAC,KAAK,CAClC,6EAA6E,CAC9E,CAAC;IACF,IAAI,iBAAiB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,QAAQ,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,UAAU;YAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACjE,CAAC;IAED,4EAA4E;IAC5E,sEAAsE;IACtE,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAC7B,yHAAyH,CAC1H,CAAC;IACF,IAAI,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACtB,MAAM,WAAW,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9C,IAAI,WAAW;YAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC;IACjE,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IAErB,kDAAkD;IAClD,CAAC,CAAC,2CAA2C,CAAC,CAAC,MAAM,EAAE,CAAC;IAExD,MAAM,QAAQ,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAE9D,qEAAqE;IACrE,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,KAAK,CAAC;IAExC,wEAAwE;IACxE,+EAA+E;IAC/E,MAAM,eAAe,GAAa;QAChC,0BAA0B;QAC1B,gBAAgB;QAChB,wBAAwB;QACxB,gCAAgC;QAChC,UAAU,EAA8B,yBAAyB;QAEjE,aAAa;QA
Cb,aAAa;QAEb,sBAAsB;QACtB,wBAAwB;QACxB,qBAAqB;QAErB,WAAW;QACX,WAAW;QAEX,kCAAkC;QAClC,8BAA8B;QAC9B,sBAAsB;QACtB,8BAA8B;QAE9B,uCAAuC;QACvC,cAAc;QACd,oBAAoB;QACpB,gCAAgC;QAChC,yBAAyB;QACzB,yBAAyB;QACzB,2BAA2B;QAE3B,WAAW;QACX,gBAAgB;QAChB,gBAAgB;QAChB,eAAe;KAChB,CAAC;IAEF,MAAM,SAAS,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAEjE,2DAA2D;IAC3D,IAAI,SAAS,IAAI,QAAQ,CAAC,MAAM,GAAG,KAAK;QAAE,OAAO,KAAK,CAAC;IAEvD,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Converts HTML into clean Markdown that is readable by humans/LLMs.
3
+ */
4
+ export declare function htmlToMarkdown(html: string): string;
5
+ //# sourceMappingURL=to-markdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"to-markdown.d.ts","sourceRoot":"","sources":["../../../src/scraper/extractors/to-markdown.ts"],"names":[],"mappings":"AA6FA;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAwBnD"}
@@ -0,0 +1,103 @@
1
+ // ─── Conversor HTML → Markdown ─────────────────────────────────────────────
2
+ // Usa Turndown (CJS) + plugin GFM para tabelas pipe nativas
3
+ import TurndownService from "turndown";
4
+ import { tables, strikethrough } from "turndown-plugin-gfm";
5
+ let _td = null; // cached TurndownService instance; stays null until getTurndown() first runs
6
+ function getTurndown() { // builds and configures the shared instance once, then returns the cached one
7
+ if (_td)
8
+ return _td;
9
+ _td = new TurndownService({
10
+ headingStyle: "atx", // "# Title" headings instead of underlined ones
11
+ bulletListMarker: "-",
12
+ codeBlockStyle: "fenced", // ```code``` instead of indented
13
+ hr: "---",
14
+ strongDelimiter: "**",
15
+ emDelimiter: "_",
16
+ linkStyle: "inlined",
17
+ });
18
+ // ── GFM plugin: pipe tables and strikethrough ────────────────────────────
19
+ // Converts <table> → | col1 | col2 | instead of raw HTML
20
+ _td.use(tables);
21
+ _td.use(strikethrough);
22
+ // ── Custom rules ─────────────────────────────────────────────────────────
23
+ // Completely removes elements that produce no useful content
24
+ // Note: Turndown.remove() accepts only tag names, not CSS selectors
25
+ _td.remove([
26
+ "script",
27
+ "style",
28
+ "noscript",
29
+ "iframe",
30
+ "nav",
31
+ "footer",
32
+ "header",
33
+ "button",
34
+ "form",
35
+ ]);
36
+ // figcaption inside figure: removed (avoids a stray caption in the Markdown)
37
+ _td.addRule("removeFigcaption", {
38
+ filter(node) {
39
+ return (node.nodeName === "FIGCAPTION" &&
40
+ node.parentNode?.nodeName === "FIGURE");
41
+ },
42
+ replacement: () => "",
43
+ });
44
+ // Ad classes (.ad, .ads), handled by a rule because remove() takes no CSS selectors. NOTE(review): the \b-based regex also matches hyphenated tokens such as "ad-banner" — confirm intended
45
+ _td.addRule("removeAds", {
46
+ filter(node) {
47
+ if (node.nodeType !== 1)
48
+ return false;
49
+ const cls = node.getAttribute("class") ?? "";
50
+ return /\bad\b|\bads\b/.test(cls);
51
+ },
52
+ replacement: () => "",
53
+ });
54
+ // Images: emits ![alt](src) with the trimmed alt text, falls back to "image" when alt is empty, and drops images that have no src
55
+ _td.addRule("images", {
56
+ filter: "img",
57
+ replacement(_content, node) {
58
+ const img = node;
59
+ const alt = img.getAttribute("alt")?.trim() ?? "";
60
+ const src = img.getAttribute("src") ?? "";
61
+ if (!src)
62
+ return "";
63
+ return alt ? `![${alt}](${src})` : `![image](${src})`;
64
+ },
65
+ });
66
+ // Links: unwraps anchors whose href is missing/empty, exactly "#", or starts with "javascript:"
67
+ _td.addRule("cleanLinks", {
68
+ filter(node) {
69
+ return (node.nodeName === "A" &&
70
+ (!node.getAttribute("href") ||
71
+ node.getAttribute("href") === "#" ||
72
+ node.getAttribute("href")?.startsWith("javascript:") === true));
73
+ },
74
+ replacement(content) {
75
+ return content; // Keeps only the text, without the link
76
+ },
77
+ });
78
+ return _td;
79
+ }
80
+ /**
81
+ * Converts HTML into clean Markdown readable by humans/LLMs; returns "" when html is empty or whitespace-only.
82
+ */
83
+ export function htmlToMarkdown(html) {
84
+ if (!html.trim())
85
+ return "";
86
+ const td = getTurndown();
87
+ let markdown = td.turndown(html);
88
+ // ── Post-conversion cleanup ────────────────────────────────────────────
89
+ // Drops lines that are non-empty but whitespace-only; truly empty lines are kept. NOTE(review): a dropped line leaves its neighbours adjacent — confirm intended
90
+ markdown = markdown
91
+ .split("\n")
92
+ .filter((line) => line.trim().length > 0 || line === "")
93
+ .join("\n");
94
+ // Collapses runs of 3+ newlines into exactly 2 newlines, i.e. at most one blank line in a row
95
+ markdown = markdown.replace(/\n{3,}/g, "\n\n");
96
+ // Strips trailing whitespace from every line
97
+ markdown = markdown
98
+ .split("\n")
99
+ .map((l) => l.trimEnd())
100
+ .join("\n");
101
+ return markdown.trim();
102
+ }
103
+ //# sourceMappingURL=to-markdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"to-markdown.js","sourceRoot":"","sources":["../../../src/scraper/extractors/to-markdown.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4DAA4D;AAE5D,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,MAAM,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAE5D,IAAI,GAAG,GAA2B,IAAI,CAAC;AAEvC,SAAS,WAAW;IAClB,IAAI,GAAG;QAAE,OAAO,GAAG,CAAC;IAEpB,GAAG,GAAG,IAAI,eAAe,CAAC;QACxB,YAAY,EAAE,KAAK,EAAQ,gCAAgC;QAC3D,gBAAgB,EAAE,GAAG;QACrB,cAAc,EAAE,QAAQ,EAAG,iCAAiC;QAC5D,EAAE,EAAE,KAAK;QACT,eAAe,EAAE,IAAI;QACrB,WAAW,EAAE,GAAG;QAChB,SAAS,EAAE,SAAS;KACrB,CAAC,CAAC;IAEH,2EAA2E;IAC3E,0DAA0D;IAC1D,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IAEvB,4EAA4E;IAE5E,6DAA6D;IAC7D,qEAAqE;IACrE,GAAG,CAAC,MAAM,CAAC;QACT,QAAQ;QACR,OAAO;QACP,UAAU;QACV,QAAQ;QACR,KAAK;QACL,QAAQ;QACR,QAAQ;QACR,QAAQ;QACR,MAAM;KACP,CAAC,CAAC;IAEH,wEAAwE;IACxE,GAAG,CAAC,OAAO,CAAC,kBAAkB,EAAE;QAC9B,MAAM,CAAC,IAAI;YACT,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,YAAY;gBAC9B,IAAI,CAAC,UAAU,EAAE,QAAQ,KAAK,QAAQ,CACvC,CAAC;QACJ,CAAC;QACD,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,8EAA8E;IAC9E,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE;QACvB,MAAM,CAAC,IAAI;YACT,IAAI,IAAI,CAAC,QAAQ,KAAK,CAAC;gBAAE,OAAO,KAAK,CAAC;YACtC,MAAM,GAAG,GAAI,IAAgB,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC1D,OAAO,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACpC,CAAC;QACD,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,0CAA0C;IAC1C,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE;QACpB,MAAM,EAAE,KAAK;QACb,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,GAAG,GAAG,IAAwB,CAAC;YACrC,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAClD,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,IAAI,CAAC,GAAG;gBAAE,OAAO,EAAE,CAAC;YACpB,OAAO,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,YAAY,GAAG,GAAG,CAAC;QACxD,CAAC;KACF,CAAC,CAAC;IAEH,2CAA2C;IAC3C,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE;QACxB,MAAM,CAAC,IAAI;YACT,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,GAAG;gBACrB,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC;oBACzB,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,KAAK,GAAG;oBACj
C,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE,UAAU,CAAC,aAAa,CAAC,KAAK,IAAI,CAAC,CACjE,CAAC;QACJ,CAAC;QACD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC,CAAC,oCAAoC;QACtD,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE5B,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,IAAI,QAAQ,GAAG,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAEjC,0EAA0E;IAE1E,6CAA6C;IAC7C,QAAQ,GAAG,QAAQ;SAChB,KAAK,CAAC,IAAI,CAAC;SACX,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,KAAK,EAAE,CAAC;SACvD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,+CAA+C;IAC/C,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,0BAA0B;IAC1B,QAAQ,GAAG,QAAQ;SAChB,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC;SACvB,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;AACzB,CAAC"}
@@ -0,0 +1,27 @@
1
+ import type { FirecrawlConfig, ScrapeOptions, ScrapeResult } from "./types.js";
2
+ export declare class Firecrawl {
3
+ private readonly fullConfig;
4
+ private readonly tier1;
5
+ private readonly tier2;
6
+ private readonly tier3;
7
+ private readonly config;
8
+ constructor(fullConfig?: FirecrawlConfig);
9
+ scrape(url: string, options?: ScrapeOptions): Promise<ScrapeResult>;
10
+ /**
11
+ * Scrapes multiple URLs in parallel with limited concurrency.
12
+ * Errors on individual URLs do not bring down the whole batch.
13
+ *
14
+ * @param urls - List of URLs to scrape
15
+ * @param options - Options applied to every URL
16
+ * @param concurrency - Maximum number of simultaneous scrapes. Default: 3
17
+ */
18
+ scrapeMany(urls: string[], options?: ScrapeOptions, concurrency?: number): Promise<ScrapeResult[]>;
19
+ /**
20
+ * Closes the Playwright browser (Tier 3).
21
+ * Always call this when finished to avoid orphaned Chromium processes.
22
+ */
23
+ close(): Promise<void>;
24
+ private log;
25
+ }
26
+ export type { ScrapeOptions, ScrapeResult, ScrapeTier, ContentFormat, SSRData, InterceptedAPI, FirecrawlConfig, TierRawResult, } from "./types.js";
27
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACV,eAAe,EACf,aAAa,EACb,YAAY,EACb,MAAM,YAAY,CAAC;AAepB,qBAAa,SAAS;IAYR,OAAO,CAAC,QAAQ,CAAC,UAAU;IAXvC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAY;IAClC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAe;IACrC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAe;IACrC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAMrB;gBAE2B,UAAU,GAAE,eAAoB;IAevD,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;IAqG7E;;;;;;;OAOG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,aAAkB,EAC3B,WAAW,SAAI,GACd,OAAO,CAAC,YAAY,EAAE,CAAC;IAkC1B;;;OAGG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAM5B,OAAO,CAAC,GAAG;CAKZ;AAGD,YAAY,EACV,aAAa,EACb,YAAY,EACZ,UAAU,EACV,aAAa,EACb,OAAO,EACP,cAAc,EACd,eAAe,EACf,aAAa,GACd,MAAM,YAAY,CAAC"}