@vakra-dev/reader 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +29 -26
- package/dist/cli/index.js +1356 -1039
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +233 -50
- package/dist/index.js +1591 -1042
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/cli/index.js
CHANGED
|
@@ -17,132 +17,6 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
17
17
|
// src/scraper.ts
|
|
18
18
|
import pLimit from "p-limit";
|
|
19
19
|
|
|
20
|
-
// src/cloudflare/detector.ts
|
|
21
|
-
var CHALLENGE_DOM_SELECTORS = [
|
|
22
|
-
"#challenge-running",
|
|
23
|
-
"#challenge-stage",
|
|
24
|
-
"#challenge-form",
|
|
25
|
-
".cf-browser-verification"
|
|
26
|
-
];
|
|
27
|
-
var CHALLENGE_TEXT_PATTERNS = [
|
|
28
|
-
"verifying you are human",
|
|
29
|
-
"checking if the site connection is secure",
|
|
30
|
-
"this process is automatic. your browser will redirect"
|
|
31
|
-
];
|
|
32
|
-
var BLOCKED_SIGNALS = [
|
|
33
|
-
"you have been blocked",
|
|
34
|
-
"access to this page has been denied",
|
|
35
|
-
"sorry, you have been blocked",
|
|
36
|
-
"access denied",
|
|
37
|
-
"403 forbidden"
|
|
38
|
-
];
|
|
39
|
-
async function detectChallenge(hero) {
|
|
40
|
-
const signals = [];
|
|
41
|
-
let type = "none";
|
|
42
|
-
try {
|
|
43
|
-
if (!hero.document) {
|
|
44
|
-
return {
|
|
45
|
-
isChallenge: false,
|
|
46
|
-
type: "none",
|
|
47
|
-
confidence: 0,
|
|
48
|
-
signals: ["No document available"]
|
|
49
|
-
};
|
|
50
|
-
}
|
|
51
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
52
|
-
const htmlLower = html.toLowerCase();
|
|
53
|
-
for (const selector of CHALLENGE_DOM_SELECTORS) {
|
|
54
|
-
if (htmlLower.includes(selector.toLowerCase())) {
|
|
55
|
-
signals.push(`Challenge element: ${selector}`);
|
|
56
|
-
type = "js_challenge";
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
for (const pattern of CHALLENGE_TEXT_PATTERNS) {
|
|
60
|
-
if (htmlLower.includes(pattern)) {
|
|
61
|
-
signals.push(`Challenge text: "${pattern}"`);
|
|
62
|
-
type = type === "none" ? "js_challenge" : type;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
66
|
-
signals.push('Challenge text: "waiting for...to respond"');
|
|
67
|
-
type = type === "none" ? "js_challenge" : type;
|
|
68
|
-
}
|
|
69
|
-
for (const pattern of BLOCKED_SIGNALS) {
|
|
70
|
-
if (htmlLower.includes(pattern)) {
|
|
71
|
-
signals.push(`Blocked: "${pattern}"`);
|
|
72
|
-
type = "blocked";
|
|
73
|
-
break;
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
const isChallenge = signals.length > 0;
|
|
77
|
-
const confidence = isChallenge ? 100 : 0;
|
|
78
|
-
return {
|
|
79
|
-
isChallenge,
|
|
80
|
-
type: isChallenge ? type : "none",
|
|
81
|
-
confidence,
|
|
82
|
-
signals
|
|
83
|
-
};
|
|
84
|
-
} catch (error) {
|
|
85
|
-
return {
|
|
86
|
-
isChallenge: false,
|
|
87
|
-
type: "none",
|
|
88
|
-
confidence: 0,
|
|
89
|
-
signals: [`Error during detection: ${error.message}`]
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// src/cloudflare/handler.ts
|
|
95
|
-
async function waitForChallengeResolution(hero, options) {
|
|
96
|
-
const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
|
|
97
|
-
const startTime = Date.now();
|
|
98
|
-
const log = (msg) => verbose && console.log(` ${msg}`);
|
|
99
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
100
|
-
const elapsed = Date.now() - startTime;
|
|
101
|
-
try {
|
|
102
|
-
const currentUrl = await hero.url;
|
|
103
|
-
if (currentUrl !== initialUrl) {
|
|
104
|
-
log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
|
|
105
|
-
log(` Waiting for new page to load...`);
|
|
106
|
-
try {
|
|
107
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
108
|
-
log(` DOMContentLoaded`);
|
|
109
|
-
} catch {
|
|
110
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
111
|
-
}
|
|
112
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
113
|
-
});
|
|
114
|
-
log(` Page stabilized`);
|
|
115
|
-
return { resolved: true, method: "url_redirect", waitedMs: elapsed };
|
|
116
|
-
}
|
|
117
|
-
} catch {
|
|
118
|
-
}
|
|
119
|
-
const detection = await detectChallenge(hero);
|
|
120
|
-
if (!detection.isChallenge) {
|
|
121
|
-
log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
|
|
122
|
-
log(` Waiting for page to load...`);
|
|
123
|
-
try {
|
|
124
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
125
|
-
log(` DOMContentLoaded`);
|
|
126
|
-
} catch {
|
|
127
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
128
|
-
}
|
|
129
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
130
|
-
});
|
|
131
|
-
log(` Page stabilized`);
|
|
132
|
-
return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
|
|
133
|
-
}
|
|
134
|
-
log(
|
|
135
|
-
`\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
|
|
136
|
-
);
|
|
137
|
-
await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
|
|
138
|
-
}
|
|
139
|
-
return {
|
|
140
|
-
resolved: false,
|
|
141
|
-
method: "timeout",
|
|
142
|
-
waitedMs: Date.now() - startTime
|
|
143
|
-
};
|
|
144
|
-
}
|
|
145
|
-
|
|
146
20
|
// src/formatters/markdown.ts
|
|
147
21
|
import TurndownService from "turndown";
|
|
148
22
|
var turndownService = new TurndownService({
|
|
@@ -156,84 +30,6 @@ var turndownService = new TurndownService({
|
|
|
156
30
|
linkStyle: "inlined",
|
|
157
31
|
linkReferenceStyle: "full"
|
|
158
32
|
});
|
|
159
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
160
|
-
const sections = [];
|
|
161
|
-
if (includeMetadata) {
|
|
162
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
163
|
-
}
|
|
164
|
-
if (pages.length > 1) {
|
|
165
|
-
sections.push(createMarkdownTOC(pages));
|
|
166
|
-
}
|
|
167
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
168
|
-
return sections.join("\n\n");
|
|
169
|
-
}
|
|
170
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
171
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
172
|
-
const description = website.description || "";
|
|
173
|
-
let header = `# Website Scrape: ${title}
|
|
174
|
-
|
|
175
|
-
`;
|
|
176
|
-
header += `**Base URL:** ${baseUrl}
|
|
177
|
-
`;
|
|
178
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
179
|
-
`;
|
|
180
|
-
header += `**Duration:** ${duration}ms
|
|
181
|
-
`;
|
|
182
|
-
header += `**Total pages:** ${totalPages}
|
|
183
|
-
`;
|
|
184
|
-
if (description) {
|
|
185
|
-
header += `**Description:** ${description}
|
|
186
|
-
`;
|
|
187
|
-
}
|
|
188
|
-
if (website.author) {
|
|
189
|
-
header += `**Author:** ${website.author}
|
|
190
|
-
`;
|
|
191
|
-
}
|
|
192
|
-
if (website.language) {
|
|
193
|
-
header += `**Language:** ${website.language}
|
|
194
|
-
`;
|
|
195
|
-
}
|
|
196
|
-
return header;
|
|
197
|
-
}
|
|
198
|
-
function createMarkdownTOC(pages) {
|
|
199
|
-
let toc = "## Table of Contents\n\n";
|
|
200
|
-
pages.forEach((page, index) => {
|
|
201
|
-
const depth = " ".repeat(page.depth);
|
|
202
|
-
const pageNumber = index + 1;
|
|
203
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
204
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
205
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
206
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
207
|
-
`;
|
|
208
|
-
});
|
|
209
|
-
return toc;
|
|
210
|
-
}
|
|
211
|
-
function createMarkdownPage(page, pageNumber) {
|
|
212
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
213
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
214
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
215
|
-
let pageContent = `---
|
|
216
|
-
|
|
217
|
-
`;
|
|
218
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
219
|
-
|
|
220
|
-
`;
|
|
221
|
-
pageContent += `**URL:** ${page.url}
|
|
222
|
-
`;
|
|
223
|
-
pageContent += `**Title:** ${page.title}
|
|
224
|
-
`;
|
|
225
|
-
pageContent += `**Depth:** ${page.depth}
|
|
226
|
-
`;
|
|
227
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
228
|
-
|
|
229
|
-
`;
|
|
230
|
-
pageContent += `---
|
|
231
|
-
|
|
232
|
-
`;
|
|
233
|
-
const markdown = htmlToMarkdown(page.html);
|
|
234
|
-
pageContent += markdown;
|
|
235
|
-
return pageContent;
|
|
236
|
-
}
|
|
237
33
|
function htmlToMarkdown(html) {
|
|
238
34
|
try {
|
|
239
35
|
return turndownService.turndown(html);
|
|
@@ -242,574 +38,339 @@ function htmlToMarkdown(html) {
|
|
|
242
38
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
243
39
|
}
|
|
244
40
|
}
|
|
245
|
-
function extractDomainFromUrl(url) {
|
|
246
|
-
try {
|
|
247
|
-
return new URL(url).hostname;
|
|
248
|
-
} catch {
|
|
249
|
-
return "Unknown";
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
// src/formatters/html.ts
|
|
254
|
-
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
|
|
255
|
-
const html = `<!DOCTYPE html>
|
|
256
|
-
<html lang="${website.language || "en"}">
|
|
257
|
-
<head>
|
|
258
|
-
<meta charset="${website.charset || "UTF-8"}">
|
|
259
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
260
|
-
<title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
|
|
261
|
-
${generateMetaTags(website)}
|
|
262
|
-
<style>
|
|
263
|
-
${generateCSS()}
|
|
264
|
-
</style>
|
|
265
|
-
</head>
|
|
266
|
-
<body>
|
|
267
|
-
<header class="header">
|
|
268
|
-
<h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
|
|
269
|
-
<div class="meta-info">
|
|
270
|
-
<p><strong>Base URL:</strong> <a href="${escapeHtml(
|
|
271
|
-
baseUrl
|
|
272
|
-
)}" target="_blank">${escapeHtml(baseUrl)}</a></p>
|
|
273
|
-
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
|
|
274
|
-
<p><strong>Duration:</strong> ${duration}ms</p>
|
|
275
|
-
<p><strong>Total pages:</strong> ${pages.length}</p>
|
|
276
|
-
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
|
|
277
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
278
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
279
|
-
</div>
|
|
280
|
-
</header>
|
|
281
|
-
|
|
282
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
283
|
-
|
|
284
|
-
<main class="content">
|
|
285
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
286
|
-
</main>
|
|
287
|
-
|
|
288
|
-
<footer class="footer">
|
|
289
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
290
|
-
</footer>
|
|
291
|
-
|
|
292
|
-
<script>
|
|
293
|
-
${generateJavaScript()}
|
|
294
|
-
</script>
|
|
295
|
-
</body>
|
|
296
|
-
</html>`;
|
|
297
|
-
return html;
|
|
298
|
-
}
|
|
299
|
-
function generateMetaTags(website) {
|
|
300
|
-
const tags = [];
|
|
301
|
-
if (website.description) {
|
|
302
|
-
tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
|
|
303
|
-
}
|
|
304
|
-
if (website.author) {
|
|
305
|
-
tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
|
|
306
|
-
}
|
|
307
|
-
if (website.keywords) {
|
|
308
|
-
tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
|
|
309
|
-
}
|
|
310
|
-
if (website.robots) {
|
|
311
|
-
tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
|
|
312
|
-
}
|
|
313
|
-
if (website.themeColor) {
|
|
314
|
-
tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
|
|
315
|
-
}
|
|
316
|
-
if (website.favicon) {
|
|
317
|
-
tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
|
|
318
|
-
}
|
|
319
|
-
if (website.canonical) {
|
|
320
|
-
tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
|
|
321
|
-
}
|
|
322
|
-
if (website.openGraph) {
|
|
323
|
-
const og = website.openGraph;
|
|
324
|
-
if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
|
|
325
|
-
if (og.description)
|
|
326
|
-
tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
|
|
327
|
-
if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
|
|
328
|
-
if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
|
|
329
|
-
if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
|
|
330
|
-
if (og.siteName)
|
|
331
|
-
tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
|
|
332
|
-
if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
|
|
333
|
-
}
|
|
334
|
-
if (website.twitter) {
|
|
335
|
-
const twitter = website.twitter;
|
|
336
|
-
if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
|
|
337
|
-
if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
|
|
338
|
-
if (twitter.creator)
|
|
339
|
-
tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
|
|
340
|
-
if (twitter.title)
|
|
341
|
-
tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
|
|
342
|
-
if (twitter.description)
|
|
343
|
-
tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
|
|
344
|
-
if (twitter.image)
|
|
345
|
-
tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
|
|
346
|
-
}
|
|
347
|
-
return tags.join("\n ");
|
|
348
|
-
}
|
|
349
|
-
function generateCSS() {
|
|
350
|
-
return `
|
|
351
|
-
* {
|
|
352
|
-
margin: 0;
|
|
353
|
-
padding: 0;
|
|
354
|
-
box-sizing: border-box;
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
body {
|
|
358
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
359
|
-
line-height: 1.6;
|
|
360
|
-
color: #333;
|
|
361
|
-
background-color: #f8f9fa;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
.header {
|
|
365
|
-
background: white;
|
|
366
|
-
padding: 2rem;
|
|
367
|
-
border-bottom: 1px solid #e9ecef;
|
|
368
|
-
margin-bottom: 2rem;
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
.header h1 {
|
|
372
|
-
color: #2c3e50;
|
|
373
|
-
margin-bottom: 1rem;
|
|
374
|
-
font-size: 2rem;
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
.meta-info {
|
|
378
|
-
display: grid;
|
|
379
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
380
|
-
gap: 0.5rem;
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
.meta-info p {
|
|
384
|
-
margin: 0.25rem 0;
|
|
385
|
-
font-size: 0.9rem;
|
|
386
|
-
color: #6c757d;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
.toc {
|
|
390
|
-
background: white;
|
|
391
|
-
padding: 1.5rem;
|
|
392
|
-
margin: 2rem 0;
|
|
393
|
-
border-radius: 8px;
|
|
394
|
-
border: 1px solid #e9ecef;
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
.toc h2 {
|
|
398
|
-
color: #2c3e50;
|
|
399
|
-
margin-bottom: 1rem;
|
|
400
|
-
font-size: 1.25rem;
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
.toc ul {
|
|
404
|
-
list-style: none;
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
.toc li {
|
|
408
|
-
margin: 0.5rem 0;
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
.toc a {
|
|
412
|
-
color: #007bff;
|
|
413
|
-
text-decoration: none;
|
|
414
|
-
transition: color 0.2s;
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
.toc a:hover {
|
|
418
|
-
color: #0056b3;
|
|
419
|
-
text-decoration: underline;
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
.content {
|
|
423
|
-
max-width: 800px;
|
|
424
|
-
margin: 0 auto;
|
|
425
|
-
padding: 0 1rem;
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
.page {
|
|
429
|
-
background: white;
|
|
430
|
-
margin: 2rem 0;
|
|
431
|
-
padding: 2rem;
|
|
432
|
-
border-radius: 8px;
|
|
433
|
-
border: 1px solid #e9ecef;
|
|
434
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
.page-header {
|
|
438
|
-
border-bottom: 2px solid #e9ecef;
|
|
439
|
-
padding-bottom: 1rem;
|
|
440
|
-
margin-bottom: 2rem;
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
.page-header h2 {
|
|
444
|
-
color: #2c3e50;
|
|
445
|
-
margin-bottom: 0.5rem;
|
|
446
|
-
font-size: 1.5rem;
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
.page-meta {
|
|
450
|
-
display: flex;
|
|
451
|
-
flex-wrap: wrap;
|
|
452
|
-
gap: 1rem;
|
|
453
|
-
font-size: 0.9rem;
|
|
454
|
-
color: #6c757d;
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
.page-content {
|
|
458
|
-
line-height: 1.8;
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
462
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
463
|
-
color: #2c3e50;
|
|
464
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
.page-content p {
|
|
468
|
-
margin: 1rem 0;
|
|
469
|
-
}
|
|
470
|
-
|
|
471
|
-
.page-content a {
|
|
472
|
-
color: #007bff;
|
|
473
|
-
text-decoration: none;
|
|
474
|
-
}
|
|
475
|
-
|
|
476
|
-
.page-content a:hover {
|
|
477
|
-
text-decoration: underline;
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
.page-content code {
|
|
481
|
-
background: #f8f9fa;
|
|
482
|
-
padding: 0.2rem 0.4rem;
|
|
483
|
-
border-radius: 4px;
|
|
484
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
485
|
-
font-size: 0.9em;
|
|
486
|
-
}
|
|
487
|
-
|
|
488
|
-
.page-content pre {
|
|
489
|
-
background: #f8f9fa;
|
|
490
|
-
padding: 1rem;
|
|
491
|
-
border-radius: 4px;
|
|
492
|
-
overflow-x: auto;
|
|
493
|
-
margin: 1rem 0;
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
.page-content blockquote {
|
|
497
|
-
border-left: 4px solid #007bff;
|
|
498
|
-
padding-left: 1rem;
|
|
499
|
-
margin: 1rem 0;
|
|
500
|
-
color: #6c757d;
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
.footer {
|
|
504
|
-
text-align: center;
|
|
505
|
-
padding: 2rem;
|
|
506
|
-
margin-top: 3rem;
|
|
507
|
-
border-top: 1px solid #e9ecef;
|
|
508
|
-
color: #6c757d;
|
|
509
|
-
font-size: 0.9rem;
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
@media (max-width: 768px) {
|
|
513
|
-
.header {
|
|
514
|
-
padding: 1rem;
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
.header h1 {
|
|
518
|
-
font-size: 1.5rem;
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
.page {
|
|
522
|
-
padding: 1rem;
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
.page-meta {
|
|
526
|
-
flex-direction: column;
|
|
527
|
-
gap: 0.5rem;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
`.trim();
|
|
531
|
-
}
|
|
532
|
-
function generateTOC(pages) {
|
|
533
|
-
const tocItems = pages.map((page, index) => {
|
|
534
|
-
const pageNumber = index + 1;
|
|
535
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
536
|
-
const id = `page-${pageNumber}`;
|
|
537
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
538
|
-
}).join("\n");
|
|
539
|
-
return `
|
|
540
|
-
<nav class="toc">
|
|
541
|
-
<h2>Table of Contents</h2>
|
|
542
|
-
<ul>
|
|
543
|
-
${tocItems}
|
|
544
|
-
</ul>
|
|
545
|
-
</nav>`;
|
|
546
|
-
}
|
|
547
|
-
function generatePageHTML(page, pageNumber) {
|
|
548
|
-
const id = `page-${pageNumber}`;
|
|
549
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
550
|
-
return `
|
|
551
|
-
<article class="page" id="${id}">
|
|
552
|
-
<div class="page-header">
|
|
553
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
554
|
-
<div class="page-meta">
|
|
555
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
556
|
-
page.url
|
|
557
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
558
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
559
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
560
|
-
</div>
|
|
561
|
-
</div>
|
|
562
|
-
<div class="page-content">
|
|
563
|
-
${page.html}
|
|
564
|
-
</div>
|
|
565
|
-
</article>`;
|
|
566
|
-
}
|
|
567
|
-
function generateJavaScript() {
|
|
568
|
-
return `
|
|
569
|
-
// Smooth scrolling for TOC links
|
|
570
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
571
|
-
anchor.addEventListener('click', function (e) {
|
|
572
|
-
e.preventDefault();
|
|
573
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
574
|
-
if (target) {
|
|
575
|
-
target.scrollIntoView({
|
|
576
|
-
behavior: 'smooth',
|
|
577
|
-
block: 'start'
|
|
578
|
-
});
|
|
579
|
-
}
|
|
580
|
-
});
|
|
581
|
-
});
|
|
582
|
-
|
|
583
|
-
// Highlight current section in TOC
|
|
584
|
-
window.addEventListener('scroll', function() {
|
|
585
|
-
const pages = document.querySelectorAll('.page');
|
|
586
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
587
|
-
|
|
588
|
-
let currentPage = null;
|
|
589
|
-
pages.forEach(page => {
|
|
590
|
-
const rect = page.getBoundingClientRect();
|
|
591
|
-
if (rect.top <= 100) {
|
|
592
|
-
currentPage = page;
|
|
593
|
-
}
|
|
594
|
-
});
|
|
595
|
-
|
|
596
|
-
tocLinks.forEach(link => {
|
|
597
|
-
link.style.fontWeight = 'normal';
|
|
598
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
599
|
-
if (target === currentPage) {
|
|
600
|
-
link.style.fontWeight = 'bold';
|
|
601
|
-
}
|
|
602
|
-
});
|
|
603
|
-
});
|
|
604
|
-
`;
|
|
605
|
-
}
|
|
606
|
-
function escapeHtml(text) {
|
|
607
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#x27;").replace(/\//g, "&#x2F;");
|
|
608
|
-
}
|
|
609
|
-
function extractDomainFromUrl2(url) {
|
|
610
|
-
try {
|
|
611
|
-
return new URL(url).hostname;
|
|
612
|
-
} catch {
|
|
613
|
-
return "Unknown";
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
|
|
617
|
-
// src/formatters/json.ts
|
|
618
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
619
|
-
const jsonResult = {
|
|
620
|
-
metadata: {
|
|
621
|
-
baseUrl,
|
|
622
|
-
totalPages: pages.length,
|
|
623
|
-
scrapedAt,
|
|
624
|
-
duration,
|
|
625
|
-
website
|
|
626
|
-
},
|
|
627
|
-
pages: pages.map((page, index) => ({
|
|
628
|
-
index: index + 1,
|
|
629
|
-
url: page.url,
|
|
630
|
-
title: page.title,
|
|
631
|
-
markdown: page.markdown,
|
|
632
|
-
html: page.html,
|
|
633
|
-
fetchedAt: page.fetchedAt,
|
|
634
|
-
depth: page.depth,
|
|
635
|
-
wordCount: countWords(page.markdown),
|
|
636
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
637
|
-
}))
|
|
638
|
-
};
|
|
639
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
640
|
-
}
|
|
641
|
-
function countWords(markdown) {
|
|
642
|
-
const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
643
|
-
return plainText.split(/\s+/).filter((word) => word.length > 0).length;
|
|
644
|
-
}
|
|
645
|
-
function estimateReadingTime(markdown) {
|
|
646
|
-
const wordCount = countWords(markdown);
|
|
647
|
-
return Math.ceil(wordCount / 200);
|
|
648
|
-
}
|
|
649
|
-
|
|
650
|
-
// src/formatters/text.ts
|
|
651
|
-
import { parseHTML } from "linkedom";
|
|
652
|
-
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
653
|
-
const sections = [];
|
|
654
|
-
if (includeMetadata) {
|
|
655
|
-
sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
656
|
-
}
|
|
657
|
-
sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
|
|
658
|
-
return sections.join("\n\n");
|
|
659
|
-
}
|
|
660
|
-
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
661
|
-
const title = website.title || extractDomainFromUrl3(baseUrl);
|
|
662
|
-
const lines = [];
|
|
663
|
-
lines.push(`=== ${title} ===`);
|
|
664
|
-
lines.push("");
|
|
665
|
-
lines.push(`URL: ${baseUrl}`);
|
|
666
|
-
lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
|
|
667
|
-
lines.push(`Duration: ${duration}ms`);
|
|
668
|
-
lines.push(`Pages: ${totalPages}`);
|
|
669
|
-
if (website.description) {
|
|
670
|
-
lines.push(`Description: ${website.description}`);
|
|
671
|
-
}
|
|
672
|
-
if (website.author) {
|
|
673
|
-
lines.push(`Author: ${website.author}`);
|
|
674
|
-
}
|
|
675
|
-
if (website.language) {
|
|
676
|
-
lines.push(`Language: ${website.language}`);
|
|
677
|
-
}
|
|
678
|
-
return lines.join("\n");
|
|
679
|
-
}
|
|
680
|
-
function createTextPage(page, pageNumber, showSeparator) {
|
|
681
|
-
const lines = [];
|
|
682
|
-
if (showSeparator) {
|
|
683
|
-
lines.push("\u2500".repeat(60));
|
|
684
|
-
lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
|
|
685
|
-
lines.push(`URL: ${page.url}`);
|
|
686
|
-
lines.push("\u2500".repeat(60));
|
|
687
|
-
}
|
|
688
|
-
const plainText = htmlToPlainText(page.html);
|
|
689
|
-
lines.push(plainText);
|
|
690
|
-
return lines.join("\n");
|
|
691
|
-
}
|
|
692
|
-
function htmlToPlainText(html) {
|
|
693
|
-
const { document } = parseHTML(html);
|
|
694
|
-
const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
|
|
695
|
-
elementsToRemove.forEach((tag) => {
|
|
696
|
-
document.querySelectorAll(tag).forEach((el) => el.remove());
|
|
697
|
-
});
|
|
698
|
-
let text = document.body?.textContent || document.documentElement?.textContent || "";
|
|
699
|
-
text = text.replace(/[ \t]+/g, " ");
|
|
700
|
-
text = text.replace(/\n[ \t]+/g, "\n");
|
|
701
|
-
text = text.replace(/[ \t]+\n/g, "\n");
|
|
702
|
-
text = text.replace(/\n{3,}/g, "\n\n");
|
|
703
|
-
text = text.trim();
|
|
704
|
-
return text;
|
|
705
|
-
}
|
|
706
|
-
function extractDomainFromUrl3(url) {
|
|
707
|
-
try {
|
|
708
|
-
return new URL(url).hostname;
|
|
709
|
-
} catch {
|
|
710
|
-
return "Unknown";
|
|
711
|
-
}
|
|
712
|
-
}
|
|
713
41
|
|
|
714
42
|
// src/utils/content-cleaner.ts
|
|
715
|
-
import { parseHTML
|
|
43
|
+
import { parseHTML } from "linkedom";
|
|
716
44
|
var ALWAYS_REMOVE_SELECTORS = [
|
|
717
|
-
// Navigation and menus
|
|
718
|
-
"nav",
|
|
719
|
-
"header nav",
|
|
720
|
-
"footer nav",
|
|
721
|
-
".nav",
|
|
722
|
-
".navigation",
|
|
723
|
-
".menu",
|
|
724
|
-
".navbar",
|
|
725
|
-
".sidebar",
|
|
726
|
-
".aside",
|
|
727
|
-
// Header and footer elements
|
|
728
|
-
"header",
|
|
729
|
-
"footer",
|
|
730
|
-
".site-header",
|
|
731
|
-
".page-header",
|
|
732
|
-
".site-footer",
|
|
733
|
-
".page-footer",
|
|
734
|
-
// Social media and sharing
|
|
735
|
-
".social",
|
|
736
|
-
".share",
|
|
737
|
-
".sharing",
|
|
738
|
-
".twitter",
|
|
739
|
-
".facebook",
|
|
740
|
-
".linkedin",
|
|
741
|
-
".instagram",
|
|
742
|
-
// Comments and discussions
|
|
743
|
-
".comments",
|
|
744
|
-
".comment",
|
|
745
|
-
".discussion",
|
|
746
|
-
".disqus",
|
|
747
|
-
// Forms and interactive elements
|
|
748
|
-
"form",
|
|
749
|
-
"input",
|
|
750
|
-
"button:not([type='submit'])",
|
|
751
|
-
"select",
|
|
752
|
-
"textarea",
|
|
753
45
|
// Scripts and styles
|
|
754
46
|
"script",
|
|
755
47
|
"style",
|
|
756
48
|
"noscript",
|
|
49
|
+
"link[rel='stylesheet']",
|
|
757
50
|
// Hidden elements
|
|
758
51
|
"[hidden]",
|
|
52
|
+
"[aria-hidden='true']",
|
|
759
53
|
"[style*='display: none']",
|
|
760
54
|
"[style*='display:none']",
|
|
761
|
-
|
|
762
|
-
"
|
|
763
|
-
|
|
764
|
-
"
|
|
55
|
+
"[style*='visibility: hidden']",
|
|
56
|
+
"[style*='visibility:hidden']",
|
|
57
|
+
// SVG icons and decorative elements
|
|
58
|
+
"svg[aria-hidden='true']",
|
|
59
|
+
"svg.icon",
|
|
60
|
+
"svg[class*='icon']",
|
|
61
|
+
// Template and metadata
|
|
62
|
+
"template",
|
|
63
|
+
"meta",
|
|
64
|
+
// Embeds that don't convert to text
|
|
65
|
+
"iframe",
|
|
66
|
+
"canvas",
|
|
67
|
+
"object",
|
|
68
|
+
"embed",
|
|
69
|
+
// Forms (usually not main content)
|
|
70
|
+
"form",
|
|
71
|
+
"input",
|
|
72
|
+
"select",
|
|
73
|
+
"textarea",
|
|
74
|
+
"button"
|
|
75
|
+
];
|
|
76
|
+
var OVERLAY_SELECTORS = [
|
|
77
|
+
"[class*='modal']",
|
|
78
|
+
"[class*='popup']",
|
|
79
|
+
"[class*='overlay']",
|
|
80
|
+
"[class*='dialog']",
|
|
81
|
+
"[role='dialog']",
|
|
82
|
+
"[role='alertdialog']",
|
|
83
|
+
"[class*='cookie']",
|
|
84
|
+
"[class*='consent']",
|
|
85
|
+
"[class*='gdpr']",
|
|
86
|
+
"[class*='privacy-banner']",
|
|
87
|
+
"[class*='notification-bar']",
|
|
88
|
+
"[id*='cookie']",
|
|
89
|
+
"[id*='consent']",
|
|
90
|
+
"[id*='gdpr']",
|
|
91
|
+
// Fixed/sticky positioned elements
|
|
92
|
+
"[style*='position: fixed']",
|
|
93
|
+
"[style*='position:fixed']",
|
|
94
|
+
"[style*='position: sticky']",
|
|
95
|
+
"[style*='position:sticky']"
|
|
96
|
+
];
|
|
97
|
+
var NAVIGATION_SELECTORS = [
|
|
98
|
+
// Semantic elements
|
|
99
|
+
"header",
|
|
100
|
+
"footer",
|
|
101
|
+
"nav",
|
|
102
|
+
"aside",
|
|
103
|
+
// Header variations
|
|
104
|
+
".header",
|
|
105
|
+
".top",
|
|
106
|
+
".navbar",
|
|
107
|
+
"#header",
|
|
108
|
+
// Footer variations
|
|
109
|
+
".footer",
|
|
110
|
+
".bottom",
|
|
111
|
+
"#footer",
|
|
112
|
+
// Sidebars
|
|
113
|
+
".sidebar",
|
|
114
|
+
".side",
|
|
115
|
+
".aside",
|
|
116
|
+
"#sidebar",
|
|
117
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
765
118
|
".modal",
|
|
119
|
+
".popup",
|
|
120
|
+
"#modal",
|
|
766
121
|
".overlay",
|
|
767
|
-
|
|
122
|
+
// Ads
|
|
123
|
+
".ad",
|
|
124
|
+
".ads",
|
|
125
|
+
".advert",
|
|
126
|
+
"#ad",
|
|
127
|
+
// Language selectors
|
|
128
|
+
".lang-selector",
|
|
129
|
+
".language",
|
|
130
|
+
"#language-selector",
|
|
131
|
+
// Social
|
|
132
|
+
".social",
|
|
133
|
+
".social-media",
|
|
134
|
+
".social-links",
|
|
135
|
+
"#social",
|
|
136
|
+
// Navigation/menus
|
|
137
|
+
".menu",
|
|
138
|
+
".navigation",
|
|
139
|
+
"#nav",
|
|
768
140
|
// Breadcrumbs
|
|
769
|
-
".breadcrumb",
|
|
770
141
|
".breadcrumbs",
|
|
771
|
-
"
|
|
142
|
+
"#breadcrumbs",
|
|
143
|
+
// Share buttons
|
|
144
|
+
".share",
|
|
145
|
+
"#share",
|
|
146
|
+
// Widgets
|
|
147
|
+
".widget",
|
|
148
|
+
"#widget",
|
|
149
|
+
// Cookie notices (backup)
|
|
150
|
+
".cookie",
|
|
151
|
+
"#cookie"
|
|
152
|
+
];
|
|
153
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
154
|
+
// IDs
|
|
155
|
+
"#main",
|
|
156
|
+
"#content",
|
|
157
|
+
"#main-content",
|
|
158
|
+
"#article",
|
|
159
|
+
"#post",
|
|
160
|
+
"#page-content",
|
|
161
|
+
// Semantic elements
|
|
162
|
+
"main",
|
|
163
|
+
"article",
|
|
164
|
+
"[role='main']",
|
|
165
|
+
// Classes
|
|
166
|
+
".main-content",
|
|
167
|
+
".content",
|
|
168
|
+
".post-content",
|
|
169
|
+
".article-content",
|
|
170
|
+
".entry-content",
|
|
171
|
+
".page-content",
|
|
172
|
+
".article-body",
|
|
173
|
+
".post-body",
|
|
174
|
+
".story-content",
|
|
175
|
+
".blog-content"
|
|
772
176
|
];
|
|
773
177
|
var AD_SELECTORS = [
|
|
774
|
-
//
|
|
775
|
-
".
|
|
776
|
-
".ads",
|
|
777
|
-
".advertisement",
|
|
778
|
-
".promotion",
|
|
779
|
-
".sponsored",
|
|
780
|
-
"[class*='ad-']",
|
|
781
|
-
"[id*='ad-']",
|
|
782
|
-
"[class*='advert']",
|
|
783
|
-
"[id*='advert']",
|
|
784
|
-
"[class*='banner']",
|
|
785
|
-
"[id*='banner']",
|
|
178
|
+
// Google ads
|
|
179
|
+
"ins.adsbygoogle",
|
|
786
180
|
".google-ad",
|
|
787
181
|
".adsense",
|
|
182
|
+
// Generic ad containers
|
|
788
183
|
"[data-ad]",
|
|
789
184
|
"[data-ads]",
|
|
790
|
-
"
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
"
|
|
794
|
-
"
|
|
795
|
-
"
|
|
185
|
+
"[data-ad-slot]",
|
|
186
|
+
"[data-ad-client]",
|
|
187
|
+
// Common ad class patterns
|
|
188
|
+
".ad-container",
|
|
189
|
+
".ad-wrapper",
|
|
190
|
+
".advertisement",
|
|
191
|
+
".sponsored-content",
|
|
192
|
+
// Tracking pixels
|
|
193
|
+
"img[width='1'][height='1']",
|
|
194
|
+
"img[src*='pixel']",
|
|
195
|
+
"img[src*='tracking']",
|
|
196
|
+
"img[src*='analytics']"
|
|
796
197
|
];
|
|
797
|
-
function
|
|
798
|
-
const
|
|
799
|
-
const
|
|
800
|
-
|
|
198
|
+
/**
 * Ratio of anchor text to total text inside an element.
 *
 * @param element DOM element exposing `textContent` and `querySelectorAll`.
 * @returns {number} Trimmed length of all `<a>` text divided by the trimmed
 *   length of the element's own text. An element with no text at all is
 *   reported as 1 (treated as fully "link").
 */
function getLinkDensity(element) {
  const totalChars = (element.textContent || "").trim().length;
  if (totalChars === 0) {
    return 1;
  }

  let anchorChars = 0;
  element.querySelectorAll("a").forEach((anchor) => {
    const anchorText = anchor.textContent || "";
    anchorChars += anchorText.trim().length;
  });

  return anchorChars / totalChars;
}
|
|
208
|
+
/**
 * Heuristic "how much does this look like main content" score for an element.
 *
 * Positive contributions: text length (1 point per 100 chars, capped at 50),
 * paragraphs (3 each), headings (2 each), images (1 each) and a +25 bonus when
 * the class/id mentions article|content|post|body|main|entry.
 * Negative contributions: anchors (0.5 each), list items (0.2 each), high link
 * density (-15 above 0.3, -30 above 0.5) and a -25 penalty when the class/id
 * looks like boilerplate (comment, sidebar, footer, nav, menu, header, widget,
 * or an advertising token).
 *
 * @param element DOM element exposing `textContent`, `className`, `id` and
 *   `querySelectorAll`.
 * @returns {number} The score; may be negative.
 */
function getContentScore(element) {
  let score = 0;

  const textLength = (element.textContent || "").trim().length;
  score += Math.min(textLength / 100, 50);

  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length * 1;
  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) score -= 30;
  else if (linkDensity > 0.3) score -= 15;

  const classAndId = `${element.className || ""} ${element.id || ""}`;
  if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;

  // A bare /ad/ substring test also hits "readable", "download", "shadow",
  // "loading", ... and would penalise genuine content containers. Ad keywords
  // are therefore matched only as whole tokens, where tokens are split on
  // non-alphanumerics and camelCase boundaries ("ad-container", "top_ads",
  // "sidebarAd", "advertisement").
  const hasBoilerplateKeyword = /comment|sidebar|footer|nav|menu|header|widget/i.test(classAndId);
  const hasAdToken = classAndId
    .split(/[^A-Za-z0-9]+|(?<=[a-z0-9])(?=[A-Z])/)
    .some((token) => /^(?:ads?|adsense|adsbygoogle|advert\w*)$/i.test(token));
  if (hasBoilerplateKeyword || hasAdToken) score -= 25;

  return score;
}
|
|
226
|
+
/**
 * Decide whether an element looks like a navigation block rather than content.
 *
 * An element is considered navigation when more than half of its text is link
 * text, or when it holds more than five list items and the number of anchors
 * exceeds 80% of the number of list items.
 *
 * @param element DOM element exposing `querySelectorAll`.
 * @returns {boolean}
 */
function looksLikeNavigation(element) {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }
  const itemCount = element.querySelectorAll("li").length;
  const anchorCount = element.querySelectorAll("a").length;
  return itemCount > 5 && anchorCount > itemCount * 0.8;
}
|
|
234
|
+
/**
 * Remove every element matching any of the given selectors from the document.
 *
 * @param document Parsed document exposing `querySelectorAll`.
 * @param selectors Iterable of CSS selector strings.
 */
function removeElements(document, selectors) {
  for (const cssSelector of selectors) {
    try {
      const matches = document.querySelectorAll(cssSelector);
      matches.forEach((node) => {
        node.remove();
      });
    } catch {
      // Best effort: a selector that throws (e.g. invalid syntax) is skipped.
    }
  }
}
|
|
242
|
+
/**
 * Remove elements matching `selectorsToRemove`, except those that are
 * themselves protected or that wrap a protected descendant.
 *
 * @param document Parsed document exposing `querySelectorAll`.
 * @param selectorsToRemove Iterable of CSS selectors whose matches should go.
 * @param protectedSelectors Array of CSS selectors that must be preserved.
 */
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
  // True when `probe` succeeds for at least one protected selector; a selector
  // that makes the probe throw simply counts as "no match".
  const anyProtected = (probe) =>
    protectedSelectors.some((guard) => {
      try {
        return probe(guard);
      } catch {
        return false;
      }
    });

  for (const selector of selectorsToRemove) {
    try {
      document.querySelectorAll(selector).forEach((candidate) => {
        const matchesGuard = anyProtected((guard) => candidate.matches(guard));
        if (matchesGuard) return;

        const wrapsGuard = anyProtected((guard) => candidate.querySelector(guard) !== null);
        if (wrapsGuard) return;

        candidate.remove();
      });
    } catch {
      // Best effort: skip selectors the DOM implementation rejects.
    }
  }
}
|
|
268
|
+
/**
 * Locate the element most likely to hold the page's main content.
 *
 * Strategy, in order:
 *   1. `<main>`, then `[role="main"]` (must be valid content with link density < 0.4);
 *   2. a lone `<article>` (must be valid content);
 *   3. the first well-known content id/class that is valid with link density < 0.4;
 *   4. the highest-scoring `div`/`section`/`article` with at least 200 chars of
 *      text, provided its score exceeds 20.
 *
 * "Valid content" means at least 100 characters of trimmed text and not
 * navigation-like.
 *
 * @param document Parsed document exposing `querySelector`/`querySelectorAll`.
 * @returns The chosen element, or null when nothing qualifies.
 */
function findMainContent(document) {
  const isValidContent = (el) => {
    if (!el) return false;
    const text = el.textContent || "";
    return text.trim().length >= 100 && !looksLikeNavigation(el);
  };
  const isLowLinkContent = (el) => isValidContent(el) && getLinkDensity(el) < 0.4;

  const main = document.querySelector("main");
  if (isLowLinkContent(main)) {
    return main;
  }

  const roleMain = document.querySelector('[role="main"]');
  if (isLowLinkContent(roleMain)) {
    return roleMain;
  }

  const articles = document.querySelectorAll("article");
  if (articles.length === 1 && isValidContent(articles[0])) {
    return articles[0];
  }

  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content"
  ];
  for (const selector of contentSelectors) {
    try {
      const match = document.querySelector(selector);
      if (isLowLinkContent(match)) {
        return match;
      }
    } catch {
      // Ignore selectors the DOM implementation cannot evaluate.
    }
  }

  // Fallback: keep the first container with the highest positive score.
  let best = null;
  document.querySelectorAll("div, section, article").forEach((container) => {
    const text = container.textContent || "";
    if (text.trim().length < 200) return;
    const score = getContentScore(container);
    if (score > 0 && (best === null || score > best.score)) {
      best = { el: container, score };
    }
  });

  return best !== null && best.score > 20 ? best.el : null;
}
|
|
328
|
+
function cleanHtml(html, baseUrl, options = {}) {
|
|
329
|
+
const {
|
|
330
|
+
removeAds = true,
|
|
331
|
+
removeBase64Images = true,
|
|
332
|
+
onlyMainContent = true,
|
|
333
|
+
includeTags,
|
|
334
|
+
excludeTags
|
|
335
|
+
} = options;
|
|
336
|
+
const { document } = parseHTML(html);
|
|
337
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
338
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
806
339
|
if (removeAds) {
|
|
807
|
-
|
|
340
|
+
removeElements(document, AD_SELECTORS);
|
|
341
|
+
}
|
|
342
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
343
|
+
removeElements(document, excludeTags);
|
|
344
|
+
}
|
|
345
|
+
if (onlyMainContent) {
|
|
346
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
347
|
+
const mainContent = findMainContent(document);
|
|
348
|
+
if (mainContent) {
|
|
349
|
+
const body = document.body;
|
|
350
|
+
if (body) {
|
|
351
|
+
const clone = mainContent.cloneNode(true);
|
|
352
|
+
body.innerHTML = "";
|
|
353
|
+
body.appendChild(clone);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
if (includeTags && includeTags.length > 0) {
|
|
358
|
+
const matchedElements = [];
|
|
359
|
+
for (const selector of includeTags) {
|
|
808
360
|
try {
|
|
809
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
361
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
362
|
+
matchedElements.push(el.cloneNode(true));
|
|
363
|
+
});
|
|
810
364
|
} catch {
|
|
811
365
|
}
|
|
812
366
|
}
|
|
367
|
+
if (matchedElements.length > 0) {
|
|
368
|
+
const body = document.body;
|
|
369
|
+
if (body) {
|
|
370
|
+
body.innerHTML = "";
|
|
371
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
372
|
+
}
|
|
373
|
+
}
|
|
813
374
|
}
|
|
814
375
|
if (removeBase64Images) {
|
|
815
376
|
removeBase64ImagesFromDocument(document);
|
|
@@ -834,7 +395,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
834
395
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
835
396
|
const style = el.getAttribute("style");
|
|
836
397
|
if (style) {
|
|
837
|
-
const cleanedStyle = style.replace(
|
|
398
|
+
const cleanedStyle = style.replace(
|
|
399
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
400
|
+
""
|
|
401
|
+
);
|
|
838
402
|
if (cleanedStyle.trim()) {
|
|
839
403
|
el.setAttribute("style", cleanedStyle);
|
|
840
404
|
} else {
|
|
@@ -871,7 +435,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
871
435
|
}
|
|
872
436
|
|
|
873
437
|
// src/utils/metadata-extractor.ts
|
|
874
|
-
import { parseHTML as
|
|
438
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
875
439
|
|
|
876
440
|
// src/utils/url-helpers.ts
|
|
877
441
|
import { URL as URL2 } from "url";
|
|
@@ -944,8 +508,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
944
508
|
function getUrlKey(url) {
|
|
945
509
|
try {
|
|
946
510
|
const parsedUrl = new URL2(url);
|
|
511
|
+
parsedUrl.hash = "";
|
|
947
512
|
parsedUrl.search = "";
|
|
948
|
-
|
|
513
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
514
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
515
|
+
}
|
|
516
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
517
|
+
parsedUrl.port = "";
|
|
518
|
+
}
|
|
519
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
520
|
+
for (const indexFile of indexFiles) {
|
|
521
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
522
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
523
|
+
break;
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
527
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
528
|
+
normalized = normalized.slice(0, -1);
|
|
529
|
+
}
|
|
530
|
+
return normalized;
|
|
949
531
|
} catch {
|
|
950
532
|
return url.toLowerCase();
|
|
951
533
|
}
|
|
@@ -1012,7 +594,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1012
594
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1013
595
|
}
|
|
1014
596
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1015
|
-
const { document } =
|
|
597
|
+
const { document } = parseHTML2(html);
|
|
1016
598
|
const metadata = {
|
|
1017
599
|
title: null,
|
|
1018
600
|
description: null,
|
|
@@ -1228,96 +810,934 @@ function parseRobotsTxt(content, userAgent = "*") {
|
|
|
1228
810
|
}
|
|
1229
811
|
}
|
|
1230
812
|
}
|
|
1231
|
-
return rules;
|
|
1232
|
-
}
|
|
1233
|
-
function isPathAllowed(path, rules) {
|
|
1234
|
-
const normalizedPath = path.startsWith("/") ? path : "/" + path;
|
|
1235
|
-
for (const allowedPath of rules.allowedPaths) {
|
|
1236
|
-
if (pathMatches(normalizedPath, allowedPath)) {
|
|
1237
|
-
return true;
|
|
1238
|
-
}
|
|
813
|
+
return rules;
|
|
814
|
+
}
|
|
815
|
+
/**
 * Decide whether a path may be fetched under a set of parsed robots.txt rules.
 *
 * Follows the RFC 9309 precedence rule: among all Allow and Disallow patterns
 * that match the path, the most specific one (longest pattern) wins, and an
 * Allow wins a tie. A path that matches no Disallow pattern is always allowed.
 *
 * @param {string} path URL path (with or without leading "/"), optionally
 *   including the query string.
 * @param rules Object with `allowedPaths` and `disallowedPaths` pattern arrays.
 * @returns {boolean} true when the path may be crawled.
 */
function isPathAllowed(path, rules) {
  const normalizedPath = path.startsWith("/") ? path : `/${path}`;

  // Length of the longest pattern in `patterns` that matches, or -1 for none.
  const longestMatch = (patterns) => {
    let longest = -1;
    for (const pattern of patterns) {
      if (pattern.length > longest && pathMatches(normalizedPath, pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };

  const disallowLength = longestMatch(rules.disallowedPaths);
  if (disallowLength < 0) {
    return true;
  }
  // Previously any matching Allow overrode every Disallow, so "Allow: /"
  // silently cancelled more specific Disallow rules.
  return longestMatch(rules.allowedPaths) >= disallowLength;
}
|
|
829
|
+
/**
 * Test a URL path against a single robots.txt path pattern.
 *
 * Supported syntax: `*` matches any sequence of characters and a trailing `$`
 * anchors the pattern to the end of the path. Every pattern is anchored to the
 * start of the path; all other characters are matched literally.
 *
 * @param {string} path Path (and optional query) beginning with "/".
 * @param {string} pattern robots.txt Allow/Disallow value.
 * @returns {boolean} true when the pattern matches; an empty pattern never matches.
 */
function pathMatches(path, pattern) {
  if (!pattern) {
    return false;
  }

  const anchoredAtEnd = pattern.endsWith("$");
  const body = anchoredAtEnd ? pattern.slice(0, -1) : pattern;
  const escapedBody = body.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");

  // The "^" anchor must be present in both cases. Previously it was dropped
  // for "$"-terminated patterns, so "/foo$" also matched "/bar/foo".
  const regexSource = `^${escapedBody}${anchoredAtEnd ? "$" : ""}`;

  try {
    return new RegExp(regexSource).test(path);
  } catch {
    // Fall back to a plain prefix comparison if the expression cannot be built.
    return path.startsWith(pattern);
  }
}
|
|
846
|
+
/**
 * Download and parse `/robots.txt` for the site that `baseUrl` belongs to.
 *
 * The request identifies itself as "ReaderEngine/1.0" and the response body is
 * parsed for the "ReaderEngine" user agent.
 *
 * @param {string} baseUrl Any absolute URL on the target site.
 * @returns Parsed rules from parseRobotsTxt(), or null when the URL is invalid,
 *   the request fails, or the response status is not OK.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl).toString();
    const response = await fetch(robotsUrl, {
      headers: { "User-Agent": "ReaderEngine/1.0" }
    });
    return response.ok ? parseRobotsTxt(await response.text(), "ReaderEngine") : null;
  } catch {
    // Any failure is treated as "no robots.txt available".
    return null;
  }
}
|
|
863
|
+
/**
 * Check an absolute URL against parsed robots.txt rules.
 *
 * @param {string} url Absolute URL to test.
 * @param rules Parsed rules, or a falsy value when no robots.txt is available.
 * @returns {boolean} true when crawling is permitted. Missing rules, an
 *   unparsable URL, or any error during evaluation all count as "allowed".
 */
function isUrlAllowed(url, rules) {
  if (!rules) {
    return true;
  }
  try {
    const { pathname, search } = new URL(url);
    return isPathAllowed(pathname + search, rules);
  } catch {
    return true;
  }
}
|
|
874
|
+
|
|
875
|
+
// src/types.ts
|
|
876
|
+
/**
 * Default scrape options.
 */
var DEFAULT_OPTIONS = {
  urls: [],
  formats: ["markdown"],
  timeoutMs: 30000,
  includePatterns: [],
  excludePatterns: [],

  // Content cleaning defaults
  removeAds: true,
  removeBase64Images: true,
  onlyMainContent: true,
  includeTags: [],
  excludeTags: [],
  // NOTE(review): TLS verification is skipped by default; confirm this is intended.
  skipTLSVerification: true,

  // Batch defaults
  batchConcurrency: 1,
  batchTimeoutMs: 300000,
  maxRetries: 2,
  // Default no-op progress callback
  onProgress: () => {
  },

  // Hero-specific defaults
  verbose: false,
  showChrome: false
};
|
|
900
|
+
|
|
901
|
+
// src/engines/types.ts
|
|
902
|
+
/**
 * Static per-engine settings, keyed by engine name.
 *
 * `maxTimeout` is used by the engines as a hard per-request limit in
 * milliseconds; `timeout` is presumably a softer budget in the same unit
 * (TODO confirm against the orchestrator). `features` lists what each engine
 * is declared to support.
 */
var ENGINE_CONFIGS = {
  http: {
    name: "http",
    timeout: 3000,
    maxTimeout: 10000,
    quality: 100,
    features: {
      javascript: false,
      cloudflare: false,
      tlsFingerprint: false,
      waitFor: false,
      screenshots: false
    }
  },
  tlsclient: {
    name: "tlsclient",
    timeout: 5000,
    maxTimeout: 15000,
    quality: 80,
    features: {
      javascript: false,
      cloudflare: false,
      tlsFingerprint: true,
      waitFor: false,
      screenshots: false
    }
  },
  hero: {
    name: "hero",
    timeout: 30000,
    maxTimeout: 60000,
    quality: 50,
    features: {
      javascript: true,
      cloudflare: true,
      tlsFingerprint: true,
      waitFor: true,
      screenshots: true
    }
  }
};

/** Default engine order: http, then tlsclient, then hero. */
var DEFAULT_ENGINE_ORDER = ["http", "tlsclient", "hero"];
|
|
944
|
+
|
|
945
|
+
// src/engines/errors.ts
|
|
946
|
+
/**
 * Base class for failures raised by a single scraping engine.
 * The message is prefixed with the engine name; `retryable` defaults to true.
 */
var EngineError = class extends Error {
  engine;
  retryable;
  constructor(engine, message, options) {
    super(`[${engine}] ${message}`);
    this.name = "EngineError";
    this.engine = engine;
    this.retryable = options?.retryable ?? true;
    this.cause = options?.cause;
    // captureStackTrace is not available on every runtime.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};

/** The response looked like a challenge page. Retryable. */
var ChallengeDetectedError = class extends EngineError {
  challengeType;
  constructor(engine, challengeType) {
    const label = challengeType || "unknown";
    super(engine, `Challenge detected: ${label}`, { retryable: true });
    this.name = "ChallengeDetectedError";
    this.challengeType = label;
  }
};

/** The response contained less text than the given threshold. Retryable. */
var InsufficientContentError = class extends EngineError {
  contentLength;
  threshold;
  constructor(engine, contentLength, threshold = 100) {
    super(
      engine,
      `Insufficient content: ${contentLength} chars (threshold: ${threshold})`,
      { retryable: true }
    );
    this.name = "InsufficientContentError";
    this.contentLength = contentLength;
    this.threshold = threshold;
  }
};

/** HTTP-level failure. Retryable only for 5xx-and-above and 429 responses. */
var HttpError = class extends EngineError {
  statusCode;
  constructor(engine, statusCode, statusText) {
    const isRetryable = statusCode === 429 || statusCode >= 500;
    const suffix = statusText ? `: ${statusText}` : "";
    super(engine, `HTTP ${statusCode}${suffix}`, { retryable: isRetryable });
    this.name = "HttpError";
    this.statusCode = statusCode;
  }
};

/** The engine did not finish within `timeoutMs`. Retryable. */
var EngineTimeoutError = class extends EngineError {
  timeoutMs;
  constructor(engine, timeoutMs) {
    super(engine, `Timeout after ${timeoutMs}ms`, { retryable: true });
    this.name = "EngineTimeoutError";
    this.timeoutMs = timeoutMs;
  }
};

/** The engine cannot be used at all. Not retryable. */
var EngineUnavailableError = class extends EngineError {
  constructor(engine, reason) {
    super(engine, reason || "Engine not available", { retryable: false });
    this.name = "EngineUnavailableError";
  }
};

/**
 * Aggregate failure listing every attempted engine.
 * `errors` is a Map-like object (only `.get(engineName)` is used here).
 */
var AllEnginesFailedError = class extends Error {
  attemptedEngines;
  errors;
  constructor(attemptedEngines, errors) {
    const parts = attemptedEngines.map((engineName) => {
      const reason = errors.get(engineName)?.message || "unknown";
      return `${engineName}: ${reason}`;
    });
    super(`All engines failed: ${parts.join("; ")}`);
    this.name = "AllEnginesFailedError";
    this.attemptedEngines = attemptedEngines;
    this.errors = errors;
  }
};
|
|
1012
|
+
|
|
1013
|
+
// src/engines/http/index.ts
|
|
1014
|
+
/** Browser-like request headers sent by the http engine; caller headers override them. */
var DEFAULT_HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Accept-Encoding": "gzip, deflate, br",
  "Cache-Control": "no-cache",
  Pragma: "no-cache",
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "none",
  "Sec-Fetch-User": "?1",
  "Upgrade-Insecure-Requests": "1"
};

/** Substrings (compared case-insensitively) that mark a response as a challenge page. */
var CHALLENGE_PATTERNS = [
  // Cloudflare
  "cf-browser-verification",
  "cf_chl_opt",
  "challenge-platform",
  "cf-spinner",
  "Just a moment",
  "Checking your browser",
  "checking if the site connection is secure",
  "Enable JavaScript and cookies",
  "Attention Required",
  "_cf_chl_tk",
  "Verifying you are human",
  "cf-turnstile",
  "/cdn-cgi/challenge-platform/",
  // Generic bot detection
  "Please Wait...",
  "DDoS protection by",
  "Access denied",
  "bot detection",
  "are you a robot",
  "complete the security check"
];

/** Substrings indicating the page is served through Cloudflare. */
var CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm", "cf-ray"];

/** Minimum number of visible-text characters for a response to count as content. */
var MIN_CONTENT_LENGTH = 100;

/**
 * Plain `fetch`-based engine: one GET request, no JavaScript execution.
 */
var HttpEngine = class {
  config = ENGINE_CONFIGS.http;

  /**
   * Fetch `meta.url` and return its HTML.
   *
   * The request, including reading the response body, is aborted after
   * `config.maxTimeout` ms or when `meta.abortSignal` fires (also when it was
   * already aborted before the call). The timer and the abort listener are
   * always released before the method settles.
   *
   * @param meta `{ url, options, logger?, abortSignal? }`; `options.headers`
   *   is merged over DEFAULT_HEADERS.
   * @returns `{ html, url, statusCode, contentType, headers, engine, duration }`,
   *   where `duration` is the time in ms until the response headers arrived.
   * @throws {HttpError} status >= 400.
   * @throws {ChallengeDetectedError} a challenge pattern was found in the HTML.
   * @throws {InsufficientContentError} fewer than MIN_CONTENT_LENGTH text chars.
   * @throws {EngineTimeoutError} the request was aborted (timeout or signal).
   * @throws {EngineError} any other failure, with the original error as `cause`.
   */
  async scrape(meta) {
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    let timeoutId;
    let listening = false;
    try {
      timeoutId = setTimeout(abortRequest, this.config.maxTimeout);
      if (abortSignal) {
        if (abortSignal.aborted) {
          // An "abort" listener never fires for a signal that is already aborted.
          abortRequest();
        } else {
          abortSignal.addEventListener("abort", abortRequest, { once: true });
          listening = true;
        }
      }
      logger4?.debug(`[http] Fetching ${url}`);
      const response = await fetch(url, {
        method: "GET",
        headers: {
          ...DEFAULT_HEADERS,
          ...options.headers || {}
        },
        redirect: "follow",
        signal: controller.signal
      });
      const duration = Date.now() - startTime;
      // The timeout stays armed while the body is read so a stalled download
      // cannot hang the engine.
      const html = await response.text();
      logger4?.debug(`[http] Got response: ${response.status} (${html.length} chars) in ${duration}ms`);
      if (response.status >= 400) {
        throw new HttpError("http", response.status, response.statusText);
      }
      const challengeType = this.detectChallenge(html);
      if (challengeType) {
        logger4?.debug(`[http] Challenge detected: ${challengeType}`);
        throw new ChallengeDetectedError("http", challengeType);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH) {
        logger4?.debug(`[http] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("http", textContent.length, MIN_CONTENT_LENGTH);
      }
      return {
        html,
        url: response.url,
        statusCode: response.status,
        contentType: response.headers.get("content-type") || void 0,
        headers: this.headersToRecord(response.headers),
        engine: "http",
        duration
      };
    } catch (error) {
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "AbortError") {
          throw new EngineTimeoutError("http", this.config.maxTimeout);
        }
        throw new EngineError("http", error.message, { cause: error });
      }
      throw new EngineError("http", String(error));
    } finally {
      // Release resources on every path, not only after a successful fetch.
      clearTimeout(timeoutId);
      if (listening) {
        abortSignal.removeEventListener("abort", abortRequest);
      }
    }
  }

  /**
   * Detect challenge patterns in HTML
   * @returns "cloudflare" when the first matching pattern contains "cf" or the
   *   page shows Cloudflare infrastructure markers, "bot-detection" for any
   *   other match, or null if no challenge detected
   */
  detectChallenge(html) {
    const htmlLower = html.toLowerCase();
    const hasCloudflare = CLOUDFLARE_INFRA_PATTERNS.some((p) => htmlLower.includes(p.toLowerCase()));
    for (const pattern of CHALLENGE_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        if (hasCloudflare || pattern.includes("cf")) {
          return "cloudflare";
        }
        return "bot-detection";
      }
    }
    return null;
  }

  /**
   * Convert Headers to Record<string, string>
   */
  headersToRecord(headers) {
    const record = {};
    headers.forEach((value, key) => {
      record[key] = value;
    });
    return record;
  }

  /**
   * Extract visible text from HTML (rough extraction): drops <script> and
   * <style> blocks, replaces remaining tags with spaces and collapses whitespace.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }

  /** The http engine has no optional dependencies and is always available. */
  isAvailable() {
    return true;
  }
};
|
|
1149
|
+
var httpEngine = new HttpEngine();
|
|
1150
|
+
|
|
1151
|
+
// src/engines/tlsclient/index.ts
|
|
1152
|
+
import { gotScraping } from "got-scraping";
|
|
1153
|
+
/**
 * Substrings (compared case-insensitively) indicating the page needs a real browser.
 * NOTE(review): "noscript" matches any page that merely contains a <noscript>
 * tag; confirm that this broad match is intended.
 */
var JS_REQUIRED_PATTERNS = [
  // Cloudflare JS challenge
  "cf-browser-verification",
  "challenge-platform",
  "_cf_chl_tk",
  "/cdn-cgi/challenge-platform/",
  // Generic JS requirements
  "Enable JavaScript",
  "JavaScript is required",
  "Please enable JavaScript",
  "requires JavaScript",
  "noscript"
];

/** Substrings (compared case-insensitively) indicating the request was blocked. */
var BLOCKED_PATTERNS = [
  "Access denied",
  "Sorry, you have been blocked",
  "bot detected",
  "suspicious activity",
  "too many requests"
];

/** Minimum number of visible-text characters for a response to count as content. */
var MIN_CONTENT_LENGTH2 = 100;

/**
 * Engine built on got-scraping (browser-like headers/TLS, no JavaScript execution).
 */
var TlsClientEngine = class {
  config = ENGINE_CONFIGS.tlsclient;
  available = true;

  constructor() {
    try {
      if (!gotScraping) {
        this.available = false;
      }
    } catch {
      this.available = false;
    }
  }

  /**
   * Fetch `meta.url` through got-scraping and return its HTML.
   *
   * The request is limited to `config.maxTimeout` ms via got's own `timeout`
   * option. `meta.abortSignal` is honoured: if it is already aborted no request
   * is started, and if it fires mid-flight the pending request is cancelled.
   *
   * @param meta `{ url, options, logger?, abortSignal? }`.
   * @returns `{ html, url, statusCode, contentType, headers, engine, duration }`.
   * @throws {EngineUnavailableError} got-scraping is not usable.
   * @throws {HttpError} status >= 400.
   * @throws {ChallengeDetectedError} JS-required or blocked patterns were found.
   * @throws {InsufficientContentError} fewer than MIN_CONTENT_LENGTH2 text chars.
   * @throws {EngineTimeoutError} timeout, or the request was aborted/cancelled.
   * @throws {EngineError} any other failure, with the original error as `cause`.
   */
  async scrape(meta) {
    if (!this.available) {
      throw new EngineUnavailableError("tlsclient", "got-scraping not available");
    }
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    let cancelRequest = null;
    try {
      if (abortSignal?.aborted) {
        // Reported like any other abort so callers see one error type.
        throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
      }
      logger4?.debug(`[tlsclient] Fetching ${url}`);
      const request = gotScraping({
        url,
        timeout: {
          request: this.config.maxTimeout
        },
        headers: options.headers,
        followRedirect: true
        // got-scraping handles browser fingerprinting automatically
        // It uses header generators and proper TLS settings
      });
      // Previously an AbortController was created but never connected to the
      // request, so the caller's signal had no effect. got's promise exposes
      // cancel(); use it when present.
      if (abortSignal && typeof request?.cancel === "function") {
        cancelRequest = () => request.cancel();
        abortSignal.addEventListener("abort", cancelRequest, { once: true });
      }
      const response = await request;
      const duration = Date.now() - startTime;
      const html = response.body;
      logger4?.debug(`[tlsclient] Got response: ${response.statusCode} (${html.length} chars) in ${duration}ms`);
      if (response.statusCode >= 400) {
        throw new HttpError("tlsclient", response.statusCode, response.statusMessage);
      }
      const challengeType = this.detectJsRequired(html);
      if (challengeType) {
        logger4?.debug(`[tlsclient] JS required: ${challengeType}`);
        throw new ChallengeDetectedError("tlsclient", challengeType);
      }
      const blockedReason = this.detectBlocked(html);
      if (blockedReason) {
        logger4?.debug(`[tlsclient] Blocked: ${blockedReason}`);
        throw new ChallengeDetectedError("tlsclient", `blocked: ${blockedReason}`);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH2) {
        logger4?.debug(`[tlsclient] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("tlsclient", textContent.length, MIN_CONTENT_LENGTH2);
      }
      return {
        html,
        url: response.url,
        statusCode: response.statusCode,
        contentType: response.headers["content-type"],
        headers: response.headers,
        engine: "tlsclient",
        duration
      };
    } catch (error) {
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError || error instanceof EngineUnavailableError || error instanceof EngineTimeoutError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "TimeoutError" || error.message.includes("timeout")) {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        if (error.name === "AbortError" || error.name === "CancelError") {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        throw new EngineError("tlsclient", error.message, { cause: error });
      }
      throw new EngineError("tlsclient", String(error));
    } finally {
      if (cancelRequest) {
        abortSignal.removeEventListener("abort", cancelRequest);
      }
    }
  }

  /**
   * Detect patterns that require JS execution
   * @returns "cloudflare-js" when the first matching pattern contains "cf" or
   *   "cloudflare", "js-required" for any other match, otherwise null
   */
  detectJsRequired(html) {
    const htmlLower = html.toLowerCase();
    for (const pattern of JS_REQUIRED_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        if (pattern.includes("cf") || pattern.includes("cloudflare")) {
          return "cloudflare-js";
        }
        return "js-required";
      }
    }
    return null;
  }

  /**
   * Detect blocked/denied patterns
   * @returns the first matching entry of BLOCKED_PATTERNS, otherwise null
   */
  detectBlocked(html) {
    const htmlLower = html.toLowerCase();
    for (const pattern of BLOCKED_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        return pattern;
      }
    }
    return null;
  }

  /**
   * Extract visible text from HTML: drops <script> and <style> blocks,
   * replaces remaining tags with spaces and collapses whitespace.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }

  /** Whether got-scraping was usable when the engine was constructed. */
  isAvailable() {
    return this.available;
  }
};
|
|
1293
|
+
var tlsClientEngine = new TlsClientEngine();
|
|
1294
|
+
|
|
1295
|
+
// src/cloudflare/detector.ts
|
|
1296
|
+
/** DOM selectors whose presence marks a Cloudflare challenge page. */
var CLOUDFLARE_CHALLENGE_SELECTORS = [
  "#challenge-running",
  "#challenge-stage",
  "#challenge-form",
  ".cf-browser-verification",
  "#cf-wrapper",
  "#cf-hcaptcha-container",
  "#turnstile-wrapper"
];

/** Lower-case text fragments found on Cloudflare challenge pages. */
var CLOUDFLARE_TEXT_PATTERNS = [
  "checking if the site connection is secure",
  "this process is automatic. your browser will redirect",
  "ray id:",
  "performance & security by cloudflare"
];

/** Lower-case markers showing that the page is served through Cloudflare. */
var CLOUDFLARE_INFRA_PATTERNS2 = [
  "/cdn-cgi/",
  "cloudflare",
  "__cf_bm",
  "cf-ray"
];

/** Lower-case fragments that must ALL be present for a Cloudflare block page. */
var CLOUDFLARE_BLOCKED_PATTERNS = [
  "sorry, you have been blocked",
  "ray id:"
];

/**
 * Inspect the current page of a Hero session for a Cloudflare challenge.
 *
 * A challenge is reported only when the HTML shows Cloudflare infrastructure
 * AND at least one challenge indicator (challenge element, challenge text, the
 * "waiting for ... to respond" text, or the block-page fragments).
 *
 * @param hero Object exposing `document.documentElement.outerHTML` (awaitable)
 *   and `document.querySelector(selector)` (awaitable).
 * @returns {Promise<{isChallenge: boolean, type: string, confidence: number, signals: string[]}>}
 *   `type` is "js_challenge", "blocked" or "none"; `confidence` is 100 for a
 *   challenge and 0 otherwise. Errors are reported through `signals`, never thrown.
 */
async function detectChallenge(hero) {
  const verdict = (isChallenge, type, signals) => ({
    isChallenge,
    type,
    confidence: isChallenge ? 100 : 0,
    signals
  });

  try {
    if (!hero.document) {
      return verdict(false, "none", ["No document available"]);
    }

    const html = await hero.document.documentElement.outerHTML;
    const htmlLower = html.toLowerCase();

    // Without Cloudflare infrastructure nothing else is checked.
    const infraPattern = CLOUDFLARE_INFRA_PATTERNS2.find((marker) => htmlLower.includes(marker));
    if (infraPattern === undefined) {
      return verdict(false, "none", ["No Cloudflare infrastructure detected"]);
    }

    const signals = [`Cloudflare infra: "${infraPattern}"`];
    let type = "none";
    let hasChallengeIndicator = false;

    for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
      try {
        const element = await hero.document.querySelector(selector);
        if (element) {
          hasChallengeIndicator = true;
          signals.push(`Challenge element: ${selector}`);
          type = "js_challenge";
        }
      } catch {
        // A selector that cannot be evaluated is not a signal.
      }
    }

    for (const phrase of CLOUDFLARE_TEXT_PATTERNS) {
      if (!htmlLower.includes(phrase)) continue;
      hasChallengeIndicator = true;
      signals.push(`Challenge text: "${phrase}"`);
      if (type === "none") type = "js_challenge";
    }

    if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
      hasChallengeIndicator = true;
      signals.push('Challenge text: "waiting for...to respond"');
      if (type === "none") type = "js_challenge";
    }

    // A block page overrides any earlier classification.
    if (CLOUDFLARE_BLOCKED_PATTERNS.every((fragment) => htmlLower.includes(fragment))) {
      hasChallengeIndicator = true;
      signals.push("Cloudflare block page detected");
      type = "blocked";
    }

    return verdict(hasChallengeIndicator, hasChallengeIndicator ? type : "none", signals);
  } catch (error) {
    return verdict(false, "none", [`Error during detection: ${error.message}`]);
  }
}
|
|
1398
|
+
|
|
1399
|
+
// src/cloudflare/handler.ts
|
|
1400
|
+
/**
 * Poll the page in `hero` until a previously detected challenge goes away or
 * `maxWaitMs` elapses.
 *
 * A challenge counts as resolved when either the page URL differs from
 * `initialUrl` ("url_redirect") or detectChallenge no longer reports a
 * challenge ("signals_cleared"). After either event the function waits for
 * DOMContentLoaded (up to 30s) and paint stability before returning.
 *
 * @param hero - Browser handle exposing `url`, `waitForLoad` and
 *   `waitForPaintingStable`.
 * @param {object} [options]
 * @param {number} [options.maxWaitMs=45000] - Overall polling budget.
 * @param {number} [options.pollIntervalMs=500] - Delay between polls.
 * @param {boolean} [options.verbose=false] - Print progress via console.log.
 * @param {string} [options.initialUrl] - URL at the time the challenge was
 *   detected; when omitted the URL-change check is skipped.
 * @returns {Promise<{resolved: boolean, method: string, waitedMs: number}>}
 *   `waitedMs` is the total time spent in this function, including the
 *   post-resolution load wait.
 */
async function waitForChallengeResolution(hero, options = {}) {
  const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
  const startTime = Date.now();
  const waited = () => Date.now() - startTime;
  const log = (msg) => verbose && console.log(` ${msg}`);

  // Let the freshly shown page settle; both waits are best-effort.
  const settle = async () => {
    try {
      await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
      log(` DOMContentLoaded`);
    } catch {
      log(` DOMContentLoaded timeout, continuing...`);
    }
    await hero.waitForPaintingStable().catch(() => {
    });
    log(` Page stabilized`);
  };

  while (waited() < maxWaitMs) {
    const elapsed = waited();
    try {
      const currentUrl = await hero.url;
      // Without a baseline URL every URL would look "changed", so only compare
      // when the caller supplied one.
      if (initialUrl != null && currentUrl !== initialUrl) {
        log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
        log(` Waiting for new page to load...`);
        await settle();
        // Measure after settling so the load wait is included.
        return { resolved: true, method: "url_redirect", waitedMs: waited() };
      }
    } catch {
      // Best-effort: a failed URL read is ignored; the signal check below decides.
    }
    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
      log(` Waiting for page to load...`);
      await settle();
      return { resolved: true, method: "signals_cleared", waitedMs: waited() };
    }
    log(
      `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
    );
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return {
    resolved: false,
    method: "timeout",
    waitedMs: waited()
  };
}
|
|
1450
|
+
|
|
1451
|
+
// src/engines/hero/index.ts
|
|
1452
|
+
// Minimum number of visible-text characters a page must yield to count as a
// successful scrape.
var MIN_CONTENT_LENGTH3 = 100;

/**
 * Scrape engine backed by a pooled Hero browser.
 *
 * `scrape` navigates to the URL, waits for the page to load, handles a
 * Cloudflare challenge if detectChallenge reports one, waits for the final
 * page, and returns the page HTML.
 */
var HeroEngine = class {
  config = ENGINE_CONFIGS.hero;

  /**
   * Scrape one URL with a browser taken from `meta.options.pool`.
   *
   * @param meta - `{ url, options, logger, abortSignal }`; `options.pool` must
   *   provide `withBrowser(callback)`.
   * @returns {Promise<{html: string, url: string, statusCode: number, engine: string, duration: number}>}
   * @throws EngineUnavailableError when no pool is supplied.
   * @throws EngineTimeoutError when aborted or when the browser reports a timeout.
   * @throws ChallengeDetectedError when the page is blocked or a challenge does not resolve.
   * @throws InsufficientContentError when the visible text is shorter than MIN_CONTENT_LENGTH3.
   * @throws EngineError for every other failure.
   */
  async scrape(meta) {
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const pool = options.pool;
    if (!pool) {
      throw new EngineUnavailableError("hero", "Browser pool not available");
    }
    if (abortSignal?.aborted) {
      throw new EngineTimeoutError("hero", 0);
    }
    // Resolved once so the error path reports the timeout that was really used.
    const timeoutMs = options.timeoutMs || this.config.maxTimeout;
    // Read the signal directly at each checkpoint: this also sees an abort that
    // happened while waiting for a browser, and leaves no listener behind.
    const throwIfAborted = () => {
      if (abortSignal?.aborted) {
        throw new EngineTimeoutError("hero", Date.now() - startTime);
      }
    };
    logger4?.debug(`[hero] Starting browser scrape of ${url}`);
    try {
      return await pool.withBrowser(async (hero) => {
        await hero.goto(url, { timeoutMs });
        throwIfAborted();
        try {
          await hero.waitForLoad("DomContentLoaded", { timeoutMs });
        } catch {
          // Best-effort: continue with whatever has loaded so far.
        }
        await hero.waitForPaintingStable();
        throwIfAborted();

        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          logger4?.debug(`[hero] Challenge detected: ${detection.type}`);
          if (detection.type === "blocked") {
            throw new ChallengeDetectedError("hero", "blocked");
          }
          const resolution = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: options.verbose,
            initialUrl
          });
          if (!resolution.resolved) {
            throw new ChallengeDetectedError("hero", `unresolved: ${detection.type}`);
          }
          logger4?.debug(`[hero] Challenge resolved via ${resolution.method} in ${resolution.waitedMs}ms`);
        }
        throwIfAborted();

        await this.waitForFinalPage(hero, url, logger4);
        throwIfAborted();

        if (options.waitForSelector) {
          try {
            await hero.waitForElement(hero.document.querySelector(options.waitForSelector), {
              timeoutMs
            });
          } catch {
            logger4?.debug(`[hero] Selector not found: ${options.waitForSelector}`);
          }
        }

        const html = await hero.document.documentElement.outerHTML;
        const finalUrl = await hero.url;
        const textContent = this.extractText(html);
        if (textContent.length < MIN_CONTENT_LENGTH3) {
          logger4?.debug(`[hero] Insufficient content: ${textContent.length} chars`);
          throw new InsufficientContentError("hero", textContent.length, MIN_CONTENT_LENGTH3);
        }
        const duration = Date.now() - startTime;
        logger4?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);
        return {
          html,
          url: finalUrl,
          statusCode: 200,
          // Hero doesn't expose status code directly
          engine: "hero",
          duration
        };
      });
    } catch (error) {
      // Engine errors raised above already carry the right classification.
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError || error instanceof EngineUnavailableError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "TimeoutError" || error.message.includes("timeout")) {
          throw new EngineTimeoutError("hero", timeoutMs);
        }
        if (error.message.includes("Navigation") || error.message.includes("ERR_")) {
          throw new EngineError("hero", `Navigation failed: ${error.message}`, { cause: error });
        }
        throw new EngineError("hero", error.message, { cause: error });
      }
      throw new EngineError("hero", String(error));
    }
  }

  /**
   * Wait for the final page to load after any Cloudflare redirects.
   *
   * Waits for AllContentLoaded (up to 15s). If the URL then differs from
   * `originalUrl` (ignoring trailing slashes) or contains "__cf_chl", polls
   * the URL every 500ms until it is unchanged on two consecutive polls or the
   * 15s budget is spent, then waits for AllContentLoaded again (up to 10s).
   * Always finishes with a paint-stability wait and a fixed 2s pause.
   */
  async waitForFinalPage(hero, originalUrl, logger4) {
    const maxWaitMs = 15e3;
    const startTime = Date.now();
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    const stripTrailingSlashes = (value) => value.replace(/\/+$/, "");
    try {
      await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
    } catch {
      // Best-effort: proceed even if the load event never arrived.
    }
    let currentUrl = await hero.url;
    const urlChanged = stripTrailingSlashes(currentUrl) !== stripTrailingSlashes(originalUrl);
    if (urlChanged || currentUrl.includes("__cf_chl")) {
      logger4?.debug(`[hero] Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
      let lastUrl = currentUrl;
      let stableCount = 0;
      while (Date.now() - startTime < maxWaitMs) {
        await pause(500);
        try {
          currentUrl = await hero.url;
          if (currentUrl === lastUrl) {
            stableCount++;
            if (stableCount >= 2) {
              break;
            }
          } else {
            stableCount = 0;
            lastUrl = currentUrl;
            logger4?.debug(`[hero] URL changed to: ${currentUrl}`);
          }
        } catch {
          // Best-effort: a failed URL read just means another poll.
        }
      }
      try {
        await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
      } catch {
        // Best-effort, as above.
      }
    }
    await hero.waitForPaintingStable();
    await pause(2e3);
  }

  /**
   * Extract visible text from HTML: drops <script> and <style> elements,
   * replaces remaining tags with spaces and collapses whitespace.
   */
  extractText(html) {
    return html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }

  /** This engine always reports itself as available. */
  isAvailable() {
    return true;
  }
};
|
|
1608
|
+
// Single shared HeroEngine instance, registered below under the name "hero".
var heroEngine = new HeroEngine();

// src/engines/orchestrator.ts
// Lookup table from engine name to the engine instance that handles it.
var ENGINE_REGISTRY = { http: httpEngine, tlsclient: tlsClientEngine, hero: heroEngine };
|
|
1616
|
+
/**
 * Runs a scrape through an ordered cascade of engines, falling back to the
 * next engine when the current one fails with a retryable error.
 */
var EngineOrchestrator = class {
  options;
  engines;
  engineOrder;

  /**
   * @param {object} [options] - `engines`, `skipEngines`, `forceEngine`,
   *   `logger` and `verbose` are read.
   */
  constructor(options = {}) {
    this.options = options;
    this.engineOrder = this.resolveEngineOrder();
    // Names without a registry entry (e.g. a mistyped forced engine) are
    // dropped instead of crashing on `undefined.isAvailable()`; scrape() then
    // reports AllEnginesFailedError if nothing usable is left.
    this.engines = this.engineOrder
      .map((name) => ENGINE_REGISTRY[name])
      .filter((engine) => typeof engine?.isAvailable === "function" && engine.isAvailable());
  }

  /**
   * Resolve the engine order based on options: `forceEngine` wins outright,
   * otherwise `engines` (or the default order) minus `skipEngines`.
   */
  resolveEngineOrder() {
    const { forceEngine, engines, skipEngines } = this.options;
    if (forceEngine) {
      return [forceEngine];
    }
    const order = engines || [...DEFAULT_ENGINE_ORDER];
    return skipEngines ? order.filter((name) => !skipEngines.includes(name)) : order;
  }

  /**
   * Get available engines
   *
   * @returns {string[]} Names of the engines that will be tried, in order.
   */
  getAvailableEngines() {
    return this.engines.map((engine) => engine.config.name);
  }

  /**
   * Scrape a URL using the engine cascade
   *
   * Each engine gets its own AbortController, aborted after the engine's
   * `config.maxTimeout` or when the caller's `meta.abortSignal` aborts.
   *
   * @param meta - Engine metadata (url, options, logger, abortSignal)
   * @returns Scrape result with engine metadata
   * @throws AllEnginesFailedError if all engines fail
   */
  async scrape(meta) {
    const attemptedEngines = [];
    const engineErrors = /* @__PURE__ */ new Map();
    const logger4 = meta.logger || this.options.logger;
    const verbose = this.options.verbose || meta.options.verbose;
    if (this.engines.length === 0) {
      throw new AllEnginesFailedError([], engineErrors);
    }
    const log = (msg) => {
      if (verbose) {
        logger4?.info(msg);
      } else {
        logger4?.debug(msg);
      }
    };
    log(`[orchestrator] Starting scrape of ${meta.url} with engines: ${this.engineOrder.join(" \u2192 ")}`);
    for (const engine of this.engines) {
      const engineName = engine.config.name;
      attemptedEngines.push(engineName);
      try {
        log(`[orchestrator] Trying ${engineName} engine...`);
        const controller = new AbortController();
        const abortEngine = () => controller.abort();
        const timeoutId = setTimeout(abortEngine, engine.config.maxTimeout);
        const callerSignal = meta.abortSignal;
        if (callerSignal?.aborted) {
          // An already-aborted signal never fires "abort" again; propagate now.
          abortEngine();
        } else {
          callerSignal?.addEventListener("abort", abortEngine, { once: true });
        }
        try {
          const result = await engine.scrape({
            ...meta,
            abortSignal: controller.signal
          });
          log(`[orchestrator] \u2713 ${engineName} succeeded in ${result.duration}ms`);
          return {
            ...result,
            attemptedEngines,
            engineErrors
          };
        } finally {
          clearTimeout(timeoutId);
          // Avoid piling up one listener per attempted engine on the caller's signal.
          callerSignal?.removeEventListener("abort", abortEngine);
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        engineErrors.set(engineName, err);
        if (error instanceof ChallengeDetectedError) {
          log(`[orchestrator] ${engineName} detected challenge: ${error.challengeType}`);
        } else if (error instanceof InsufficientContentError) {
          log(`[orchestrator] ${engineName} insufficient content: ${error.contentLength} chars`);
        } else if (error instanceof HttpError) {
          log(`[orchestrator] ${engineName} HTTP error: ${error.statusCode}`);
        } else if (error instanceof EngineTimeoutError) {
          log(`[orchestrator] ${engineName} timed out after ${error.timeoutMs}ms`);
        } else if (error instanceof EngineUnavailableError) {
          log(`[orchestrator] ${engineName} unavailable: ${err.message}`);
        } else {
          log(`[orchestrator] ${engineName} failed: ${err.message}`);
        }
        if (!this.shouldRetry(error)) {
          log(`[orchestrator] Non-retryable error, stopping cascade`);
          break;
        }
        log(`[orchestrator] Falling back to next engine...`);
      }
    }
    log(`[orchestrator] All engines failed for ${meta.url}`);
    throw new AllEnginesFailedError(attemptedEngines, engineErrors);
  }

  /**
   * Determine if we should retry with next engine
   *
   * Challenge, insufficient-content, timeout and unavailable errors always
   * retry; HTTP errors retry on 403, 429 and 5xx; other EngineErrors follow
   * their own `retryable` flag; anything else retries.
   */
  shouldRetry(error) {
    if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError) {
      return true;
    }
    if (error instanceof HttpError) {
      return error.statusCode === 403 || error.statusCode === 429 || error.statusCode >= 500;
    }
    if (error instanceof EngineUnavailableError) {
      return true;
    }
    if (error instanceof EngineError) {
      return error.retryable;
    }
    return true;
  }
};
|
|
1316
1737
|
|
|
1317
1738
|
// src/scraper.ts
|
|
1318
1739
|
var Scraper = class {
|
|
1319
1740
|
options;
|
|
1320
|
-
pool;
|
|
1321
1741
|
logger = createLogger("scraper");
|
|
1322
1742
|
robotsCache = /* @__PURE__ */ new Map();
|
|
1323
1743
|
constructor(options) {
|
|
@@ -1325,10 +1745,6 @@ var Scraper = class {
|
|
|
1325
1745
|
...DEFAULT_OPTIONS,
|
|
1326
1746
|
...options
|
|
1327
1747
|
};
|
|
1328
|
-
if (!options.pool) {
|
|
1329
|
-
throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
|
|
1330
|
-
}
|
|
1331
|
-
this.pool = options.pool;
|
|
1332
1748
|
}
|
|
1333
1749
|
/**
|
|
1334
1750
|
* Get robots.txt rules for a URL, cached per domain
|
|
@@ -1396,52 +1812,7 @@ var Scraper = class {
|
|
|
1396
1812
|
return { result: null, error: lastError };
|
|
1397
1813
|
}
|
|
1398
1814
|
/**
|
|
1399
|
-
*
|
|
1400
|
-
* Cloudflare often does silent redirects even when bypassed, we need to ensure
|
|
1401
|
-
* we're on the actual content page before scraping.
|
|
1402
|
-
*/
|
|
1403
|
-
async waitForFinalPage(hero, originalUrl, verbose) {
|
|
1404
|
-
const maxWaitMs = 15e3;
|
|
1405
|
-
const startTime = Date.now();
|
|
1406
|
-
const log = (msg) => verbose && this.logger.info(msg);
|
|
1407
|
-
try {
|
|
1408
|
-
await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
|
|
1409
|
-
} catch {
|
|
1410
|
-
}
|
|
1411
|
-
let currentUrl = await hero.url;
|
|
1412
|
-
const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
|
|
1413
|
-
const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
|
|
1414
|
-
if (urlChanged || currentUrl.includes("__cf_chl")) {
|
|
1415
|
-
log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
|
|
1416
|
-
let lastUrl = currentUrl;
|
|
1417
|
-
let stableCount = 0;
|
|
1418
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
1419
|
-
await new Promise((resolve) => setTimeout(resolve, 500));
|
|
1420
|
-
try {
|
|
1421
|
-
currentUrl = await hero.url;
|
|
1422
|
-
if (currentUrl === lastUrl) {
|
|
1423
|
-
stableCount++;
|
|
1424
|
-
if (stableCount >= 2) {
|
|
1425
|
-
break;
|
|
1426
|
-
}
|
|
1427
|
-
} else {
|
|
1428
|
-
stableCount = 0;
|
|
1429
|
-
lastUrl = currentUrl;
|
|
1430
|
-
log(`URL changed to: ${currentUrl}`);
|
|
1431
|
-
}
|
|
1432
|
-
} catch {
|
|
1433
|
-
}
|
|
1434
|
-
}
|
|
1435
|
-
try {
|
|
1436
|
-
await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
|
|
1437
|
-
} catch {
|
|
1438
|
-
}
|
|
1439
|
-
}
|
|
1440
|
-
await hero.waitForPaintingStable();
|
|
1441
|
-
await new Promise((resolve) => setTimeout(resolve, 2e3));
|
|
1442
|
-
}
|
|
1443
|
-
/**
|
|
1444
|
-
* Scrape a single URL
|
|
1815
|
+
* Scrape a single URL using the engine orchestrator
|
|
1445
1816
|
*/
|
|
1446
1817
|
async scrapeSingleUrl(url, index) {
|
|
1447
1818
|
const startTime = Date.now();
|
|
@@ -1450,133 +1821,84 @@ var Scraper = class {
|
|
|
1450
1821
|
throw new Error(`URL blocked by robots.txt: ${url}`);
|
|
1451
1822
|
}
|
|
1452
1823
|
try {
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
if (this.options.waitForSelector) {
|
|
1487
|
-
try {
|
|
1488
|
-
await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
|
|
1489
|
-
timeoutMs: this.options.timeoutMs
|
|
1490
|
-
});
|
|
1491
|
-
} catch (error) {
|
|
1492
|
-
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1493
|
-
}
|
|
1494
|
-
}
|
|
1495
|
-
const pageTitle = await hero.document.title;
|
|
1496
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
1497
|
-
const cleanedHtml = cleanContent(html, url, {
|
|
1498
|
-
removeAds: this.options.removeAds,
|
|
1499
|
-
removeBase64Images: this.options.removeBase64Images
|
|
1824
|
+
const orchestrator = new EngineOrchestrator({
|
|
1825
|
+
engines: this.options.engines,
|
|
1826
|
+
skipEngines: this.options.skipEngines,
|
|
1827
|
+
forceEngine: this.options.forceEngine,
|
|
1828
|
+
logger: this.logger,
|
|
1829
|
+
verbose: this.options.verbose
|
|
1830
|
+
});
|
|
1831
|
+
const engineResult = await orchestrator.scrape({
|
|
1832
|
+
url,
|
|
1833
|
+
options: this.options,
|
|
1834
|
+
logger: this.logger
|
|
1835
|
+
});
|
|
1836
|
+
if (this.options.verbose) {
|
|
1837
|
+
this.logger.info(
|
|
1838
|
+
`[scraper] ${url} scraped with ${engineResult.engine} engine in ${engineResult.duration}ms (attempted: ${engineResult.attemptedEngines.join(" \u2192 ")})`
|
|
1839
|
+
);
|
|
1840
|
+
}
|
|
1841
|
+
const cleanedHtml = cleanContent(engineResult.html, engineResult.url, {
|
|
1842
|
+
removeAds: this.options.removeAds,
|
|
1843
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1844
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1845
|
+
includeTags: this.options.includeTags,
|
|
1846
|
+
excludeTags: this.options.excludeTags
|
|
1847
|
+
});
|
|
1848
|
+
const websiteMetadata = extractMetadata(cleanedHtml, engineResult.url);
|
|
1849
|
+
const duration = Date.now() - startTime;
|
|
1850
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1851
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1852
|
+
if (this.options.onProgress) {
|
|
1853
|
+
this.options.onProgress({
|
|
1854
|
+
completed: index + 1,
|
|
1855
|
+
total: this.options.urls.length,
|
|
1856
|
+
currentUrl: url
|
|
1500
1857
|
});
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
const
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
// Will be set by formatter
|
|
1509
|
-
html: cleanedHtml,
|
|
1510
|
-
fetchedAt: scrapedAt,
|
|
1511
|
-
depth: 0,
|
|
1512
|
-
hadChallenge,
|
|
1513
|
-
challengeType,
|
|
1514
|
-
waitTimeMs
|
|
1515
|
-
};
|
|
1516
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1517
|
-
[page],
|
|
1518
|
-
url,
|
|
1519
|
-
scrapedAt,
|
|
1520
|
-
duration,
|
|
1521
|
-
websiteMetadata,
|
|
1522
|
-
this.options.includeMetadata
|
|
1523
|
-
) : void 0;
|
|
1524
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1525
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1526
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1527
|
-
[page],
|
|
1528
|
-
url,
|
|
1529
|
-
scrapedAt,
|
|
1530
|
-
duration,
|
|
1531
|
-
websiteMetadata,
|
|
1532
|
-
this.options.includeMetadata
|
|
1533
|
-
) : void 0;
|
|
1534
|
-
if (this.options.onProgress) {
|
|
1535
|
-
this.options.onProgress({
|
|
1536
|
-
completed: index + 1,
|
|
1537
|
-
total: this.options.urls.length,
|
|
1538
|
-
currentUrl: url
|
|
1539
|
-
});
|
|
1540
|
-
}
|
|
1541
|
-
let proxyMetadata;
|
|
1542
|
-
if (this.options.proxy) {
|
|
1543
|
-
const proxy = this.options.proxy;
|
|
1544
|
-
if (proxy.url) {
|
|
1545
|
-
try {
|
|
1546
|
-
const proxyUrl = new URL(proxy.url);
|
|
1547
|
-
proxyMetadata = {
|
|
1548
|
-
host: proxyUrl.hostname,
|
|
1549
|
-
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1550
|
-
country: proxy.country
|
|
1551
|
-
};
|
|
1552
|
-
} catch {
|
|
1553
|
-
}
|
|
1554
|
-
} else if (proxy.host && proxy.port) {
|
|
1858
|
+
}
|
|
1859
|
+
let proxyMetadata;
|
|
1860
|
+
if (this.options.proxy) {
|
|
1861
|
+
const proxy = this.options.proxy;
|
|
1862
|
+
if (proxy.url) {
|
|
1863
|
+
try {
|
|
1864
|
+
const proxyUrl = new URL(proxy.url);
|
|
1555
1865
|
proxyMetadata = {
|
|
1556
|
-
host:
|
|
1557
|
-
port:
|
|
1866
|
+
host: proxyUrl.hostname,
|
|
1867
|
+
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1558
1868
|
country: proxy.country
|
|
1559
1869
|
};
|
|
1870
|
+
} catch {
|
|
1560
1871
|
}
|
|
1872
|
+
} else if (proxy.host && proxy.port) {
|
|
1873
|
+
proxyMetadata = {
|
|
1874
|
+
host: proxy.host,
|
|
1875
|
+
port: proxy.port,
|
|
1876
|
+
country: proxy.country
|
|
1877
|
+
};
|
|
1561
1878
|
}
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
return result;
|
|
1577
|
-
});
|
|
1879
|
+
}
|
|
1880
|
+
const result = {
|
|
1881
|
+
markdown,
|
|
1882
|
+
html: htmlOutput,
|
|
1883
|
+
metadata: {
|
|
1884
|
+
baseUrl: url,
|
|
1885
|
+
totalPages: 1,
|
|
1886
|
+
scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1887
|
+
duration,
|
|
1888
|
+
website: websiteMetadata,
|
|
1889
|
+
proxy: proxyMetadata
|
|
1890
|
+
}
|
|
1891
|
+
};
|
|
1892
|
+
return result;
|
|
1578
1893
|
} catch (error) {
|
|
1579
|
-
|
|
1894
|
+
if (error instanceof AllEnginesFailedError) {
|
|
1895
|
+
const engineSummary = error.attemptedEngines.map((e) => `${e}: ${error.errors.get(e)?.message || "unknown"}`).join("; ");
|
|
1896
|
+
this.logger.error(`Failed to scrape ${url}: All engines failed - ${engineSummary}`);
|
|
1897
|
+
} else if (error instanceof Error) {
|
|
1898
|
+
this.logger.error(`Failed to scrape ${url}: ${error.message}`);
|
|
1899
|
+
} else {
|
|
1900
|
+
this.logger.error(`Failed to scrape ${url}: ${String(error)}`);
|
|
1901
|
+
}
|
|
1580
1902
|
if (this.options.onProgress) {
|
|
1581
1903
|
this.options.onProgress({
|
|
1582
1904
|
completed: index + 1,
|
|
@@ -1618,7 +1940,7 @@ async function scrape(options) {
|
|
|
1618
1940
|
}
|
|
1619
1941
|
|
|
1620
1942
|
// src/crawler.ts
|
|
1621
|
-
import { parseHTML as
|
|
1943
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1622
1944
|
|
|
1623
1945
|
// src/utils/rate-limiter.ts
|
|
1624
1946
|
import pLimit2 from "p-limit";
|
|
@@ -1767,12 +2089,26 @@ var Crawler = class {
|
|
|
1767
2089
|
*/
|
|
1768
2090
|
extractLinks(html, baseUrl, depth) {
|
|
1769
2091
|
const links = [];
|
|
1770
|
-
const { document } =
|
|
2092
|
+
const { document } = parseHTML3(html);
|
|
1771
2093
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1772
|
-
const
|
|
2094
|
+
const rawHref = anchor.getAttribute("href");
|
|
2095
|
+
if (!rawHref) return;
|
|
2096
|
+
const href = rawHref.trim();
|
|
1773
2097
|
if (!href) return;
|
|
1774
|
-
|
|
2098
|
+
if (href.startsWith("#")) return;
|
|
2099
|
+
const lowerHref = href.toLowerCase();
|
|
2100
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
2101
|
+
return;
|
|
2102
|
+
}
|
|
2103
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
1775
2104
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
2105
|
+
try {
|
|
2106
|
+
const parsed = new URL(resolved);
|
|
2107
|
+
parsed.hash = "";
|
|
2108
|
+
resolved = parsed.toString();
|
|
2109
|
+
} catch {
|
|
2110
|
+
return;
|
|
2111
|
+
}
|
|
1776
2112
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
1777
2113
|
if (!isContentUrl(resolved)) return;
|
|
1778
2114
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -2883,9 +3219,9 @@ program.command("status").description("Check daemon status").option("-p, --port
|
|
|
2883
3219
|
});
|
|
2884
3220
|
program.command("scrape <urls...>").description("Scrape one or more URLs").option(
|
|
2885
3221
|
"-f, --format <formats>",
|
|
2886
|
-
"
|
|
3222
|
+
"Content formats to include (comma-separated: markdown,html)",
|
|
2887
3223
|
"markdown"
|
|
2888
|
-
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--
|
|
3224
|
+
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").option("--engine <name>", "Force a specific engine (http, tlsclient, hero)").option("--skip-engine <names>", "Skip specific engines (comma-separated: http,tlsclient,hero)").action(async (urls, options) => {
|
|
2889
3225
|
const port = parseInt(options.port, 10);
|
|
2890
3226
|
const useStandalone = options.standalone || false;
|
|
2891
3227
|
let useDaemon = false;
|
|
@@ -2902,7 +3238,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2902
3238
|
}) : null;
|
|
2903
3239
|
try {
|
|
2904
3240
|
const formats = options.format.split(",").map((f) => f.trim());
|
|
2905
|
-
const validFormats = ["markdown", "html"
|
|
3241
|
+
const validFormats = ["markdown", "html"];
|
|
2906
3242
|
for (const format of formats) {
|
|
2907
3243
|
if (!validFormats.includes(format)) {
|
|
2908
3244
|
console.error(
|
|
@@ -2915,6 +3251,9 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2915
3251
|
console.error(`Scraping ${urls.length} URL(s)...`);
|
|
2916
3252
|
console.error(`Formats: ${formats.join(", ")}`);
|
|
2917
3253
|
}
|
|
3254
|
+
const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
|
|
3255
|
+
const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
|
|
3256
|
+
const skipEngines = options.skipEngine ? options.skipEngine.split(",").map((s) => s.trim()) : void 0;
|
|
2918
3257
|
const scrapeOptions = {
|
|
2919
3258
|
urls,
|
|
2920
3259
|
formats,
|
|
@@ -2923,33 +3262,29 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2923
3262
|
batchTimeoutMs: parseInt(options.batchTimeout, 10),
|
|
2924
3263
|
proxy: options.proxy ? { url: options.proxy } : void 0,
|
|
2925
3264
|
userAgent: options.userAgent,
|
|
2926
|
-
includeMetadata: options.metadata !== false,
|
|
2927
3265
|
verbose: options.verbose || false,
|
|
2928
3266
|
showChrome: options.showChrome || false,
|
|
3267
|
+
// Content cleaning options
|
|
3268
|
+
onlyMainContent: options.mainContent !== false,
|
|
3269
|
+
// --no-main-content sets this to false
|
|
3270
|
+
includeTags,
|
|
3271
|
+
excludeTags,
|
|
3272
|
+
// Engine options
|
|
3273
|
+
forceEngine: options.engine,
|
|
3274
|
+
skipEngines,
|
|
2929
3275
|
onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
|
|
2930
3276
|
console.error(`[${completed}/${total}] ${currentUrl}`);
|
|
2931
3277
|
} : void 0
|
|
2932
3278
|
};
|
|
2933
3279
|
const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
|
|
2934
|
-
|
|
2935
|
-
for (const site of result.data) {
|
|
2936
|
-
if (formats.includes("markdown") && site.markdown) {
|
|
2937
|
-
output += site.markdown + "\n\n";
|
|
2938
|
-
} else if (formats.includes("text") && site.text) {
|
|
2939
|
-
output += site.text + "\n\n";
|
|
2940
|
-
} else if (formats.includes("html") && site.html) {
|
|
2941
|
-
output += site.html + "\n\n";
|
|
2942
|
-
} else if (formats.includes("json") && site.json) {
|
|
2943
|
-
output += site.json + "\n\n";
|
|
2944
|
-
}
|
|
2945
|
-
}
|
|
3280
|
+
const output = JSON.stringify(result, null, 2);
|
|
2946
3281
|
if (options.output) {
|
|
2947
|
-
writeFileSync(options.output, output
|
|
3282
|
+
writeFileSync(options.output, output);
|
|
2948
3283
|
if (options.verbose) {
|
|
2949
3284
|
console.error(`Output written to ${options.output}`);
|
|
2950
3285
|
}
|
|
2951
3286
|
} else {
|
|
2952
|
-
console.log(output
|
|
3287
|
+
console.log(output);
|
|
2953
3288
|
}
|
|
2954
3289
|
if (options.verbose) {
|
|
2955
3290
|
console.error(`
|
|
@@ -2972,7 +3307,7 @@ Summary:`);
|
|
|
2972
3307
|
}
|
|
2973
3308
|
}
|
|
2974
3309
|
});
|
|
2975
|
-
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "
|
|
3310
|
+
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Content formats when scraping (comma-separated: markdown,html)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
|
|
2976
3311
|
const port = parseInt(options.port, 10);
|
|
2977
3312
|
const useStandalone = options.standalone || false;
|
|
2978
3313
|
let useDaemon = false;
|
|
@@ -3008,38 +3343,20 @@ program.command("crawl <url>").description("Crawl a website to discover and opti
|
|
|
3008
3343
|
verbose: options.verbose || false,
|
|
3009
3344
|
showChrome: options.showChrome || false
|
|
3010
3345
|
};
|
|
3011
|
-
const
|
|
3012
|
-
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
} else if (formats.includes("text") && site.text) {
|
|
3019
|
-
output += site.text + "\n\n";
|
|
3020
|
-
} else if (formats.includes("html") && site.html) {
|
|
3021
|
-
output += site.html + "\n\n";
|
|
3022
|
-
} else if (formats.includes("json") && site.json) {
|
|
3023
|
-
output += site.json + "\n\n";
|
|
3024
|
-
}
|
|
3025
|
-
}
|
|
3026
|
-
} else {
|
|
3027
|
-
output = JSON.stringify(
|
|
3028
|
-
{
|
|
3029
|
-
urls: result.urls,
|
|
3030
|
-
metadata: result.metadata
|
|
3031
|
-
},
|
|
3032
|
-
null,
|
|
3033
|
-
2
|
|
3034
|
-
);
|
|
3035
|
-
}
|
|
3346
|
+
const formats = options.format.split(",").map((f) => f.trim());
|
|
3347
|
+
const crawlOptionsWithFormats = {
|
|
3348
|
+
...crawlOptions,
|
|
3349
|
+
formats
|
|
3350
|
+
};
|
|
3351
|
+
const result = useDaemon ? await daemonClient.crawl(crawlOptionsWithFormats) : await standaloneClient.crawl(crawlOptionsWithFormats);
|
|
3352
|
+
const output = JSON.stringify(result, null, 2);
|
|
3036
3353
|
if (options.output) {
|
|
3037
|
-
writeFileSync(options.output, output
|
|
3354
|
+
writeFileSync(options.output, output);
|
|
3038
3355
|
if (options.verbose) {
|
|
3039
3356
|
console.error(`Output written to ${options.output}`);
|
|
3040
3357
|
}
|
|
3041
3358
|
} else {
|
|
3042
|
-
console.log(output
|
|
3359
|
+
console.log(output);
|
|
3043
3360
|
}
|
|
3044
3361
|
if (options.verbose) {
|
|
3045
3362
|
console.error(`
|