@vakra-dev/reader 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -26
- package/dist/cli/index.js +429 -733
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +646 -714
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -14,27 +14,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
14
14
|
import pLimit from "p-limit";
|
|
15
15
|
|
|
16
16
|
// src/cloudflare/detector.ts
|
|
17
|
-
var
|
|
17
|
+
var CLOUDFLARE_CHALLENGE_SELECTORS = [
|
|
18
18
|
"#challenge-running",
|
|
19
19
|
"#challenge-stage",
|
|
20
20
|
"#challenge-form",
|
|
21
|
-
".cf-browser-verification"
|
|
21
|
+
".cf-browser-verification",
|
|
22
|
+
"#cf-wrapper",
|
|
23
|
+
"#cf-hcaptcha-container",
|
|
24
|
+
"#turnstile-wrapper"
|
|
22
25
|
];
|
|
23
|
-
var
|
|
24
|
-
"verifying you are human",
|
|
26
|
+
var CLOUDFLARE_TEXT_PATTERNS = [
|
|
25
27
|
"checking if the site connection is secure",
|
|
26
|
-
"this process is automatic. your browser will redirect"
|
|
28
|
+
"this process is automatic. your browser will redirect",
|
|
29
|
+
"ray id:",
|
|
30
|
+
"performance & security by cloudflare"
|
|
27
31
|
];
|
|
28
|
-
var
|
|
29
|
-
"
|
|
30
|
-
"
|
|
32
|
+
var CLOUDFLARE_INFRA_PATTERNS = [
|
|
33
|
+
"/cdn-cgi/",
|
|
34
|
+
"cloudflare",
|
|
35
|
+
"__cf_bm",
|
|
36
|
+
"cf-ray"
|
|
37
|
+
];
|
|
38
|
+
var CLOUDFLARE_BLOCKED_PATTERNS = [
|
|
31
39
|
"sorry, you have been blocked",
|
|
32
|
-
"
|
|
33
|
-
"403 forbidden"
|
|
40
|
+
"ray id:"
|
|
34
41
|
];
|
|
35
42
|
async function detectChallenge(hero) {
|
|
36
43
|
const signals = [];
|
|
37
44
|
let type = "none";
|
|
45
|
+
let hasCloudflareInfra = false;
|
|
46
|
+
let hasChallengeIndicator = false;
|
|
38
47
|
try {
|
|
39
48
|
if (!hero.document) {
|
|
40
49
|
return {
|
|
@@ -46,30 +55,51 @@ async function detectChallenge(hero) {
|
|
|
46
55
|
}
|
|
47
56
|
const html = await hero.document.documentElement.outerHTML;
|
|
48
57
|
const htmlLower = html.toLowerCase();
|
|
49
|
-
for (const
|
|
50
|
-
if (htmlLower.includes(
|
|
51
|
-
|
|
52
|
-
|
|
58
|
+
for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
|
|
59
|
+
if (htmlLower.includes(pattern)) {
|
|
60
|
+
hasCloudflareInfra = true;
|
|
61
|
+
signals.push(`Cloudflare infra: "${pattern}"`);
|
|
62
|
+
break;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
if (!hasCloudflareInfra) {
|
|
66
|
+
return {
|
|
67
|
+
isChallenge: false,
|
|
68
|
+
type: "none",
|
|
69
|
+
confidence: 0,
|
|
70
|
+
signals: ["No Cloudflare infrastructure detected"]
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
|
|
74
|
+
try {
|
|
75
|
+
const element = await hero.document.querySelector(selector);
|
|
76
|
+
if (element) {
|
|
77
|
+
hasChallengeIndicator = true;
|
|
78
|
+
signals.push(`Challenge element: ${selector}`);
|
|
79
|
+
type = "js_challenge";
|
|
80
|
+
}
|
|
81
|
+
} catch {
|
|
53
82
|
}
|
|
54
83
|
}
|
|
55
|
-
for (const pattern of
|
|
84
|
+
for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
|
|
56
85
|
if (htmlLower.includes(pattern)) {
|
|
86
|
+
hasChallengeIndicator = true;
|
|
57
87
|
signals.push(`Challenge text: "${pattern}"`);
|
|
58
88
|
type = type === "none" ? "js_challenge" : type;
|
|
59
89
|
}
|
|
60
90
|
}
|
|
61
91
|
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
92
|
+
hasChallengeIndicator = true;
|
|
62
93
|
signals.push('Challenge text: "waiting for...to respond"');
|
|
63
94
|
type = type === "none" ? "js_challenge" : type;
|
|
64
95
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
}
|
|
96
|
+
const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
|
|
97
|
+
if (hasBlocked) {
|
|
98
|
+
hasChallengeIndicator = true;
|
|
99
|
+
signals.push("Cloudflare block page detected");
|
|
100
|
+
type = "blocked";
|
|
71
101
|
}
|
|
72
|
-
const isChallenge =
|
|
102
|
+
const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
|
|
73
103
|
const confidence = isChallenge ? 100 : 0;
|
|
74
104
|
return {
|
|
75
105
|
isChallenge,
|
|
@@ -186,84 +216,6 @@ var turndownService = new TurndownService({
|
|
|
186
216
|
linkStyle: "inlined",
|
|
187
217
|
linkReferenceStyle: "full"
|
|
188
218
|
});
|
|
189
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
190
|
-
const sections = [];
|
|
191
|
-
if (includeMetadata) {
|
|
192
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
193
|
-
}
|
|
194
|
-
if (pages.length > 1) {
|
|
195
|
-
sections.push(createMarkdownTOC(pages));
|
|
196
|
-
}
|
|
197
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
198
|
-
return sections.join("\n\n");
|
|
199
|
-
}
|
|
200
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
201
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
202
|
-
const description = website.description || "";
|
|
203
|
-
let header = `# Website Scrape: ${title}
|
|
204
|
-
|
|
205
|
-
`;
|
|
206
|
-
header += `**Base URL:** ${baseUrl}
|
|
207
|
-
`;
|
|
208
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
209
|
-
`;
|
|
210
|
-
header += `**Duration:** ${duration}ms
|
|
211
|
-
`;
|
|
212
|
-
header += `**Total pages:** ${totalPages}
|
|
213
|
-
`;
|
|
214
|
-
if (description) {
|
|
215
|
-
header += `**Description:** ${description}
|
|
216
|
-
`;
|
|
217
|
-
}
|
|
218
|
-
if (website.author) {
|
|
219
|
-
header += `**Author:** ${website.author}
|
|
220
|
-
`;
|
|
221
|
-
}
|
|
222
|
-
if (website.language) {
|
|
223
|
-
header += `**Language:** ${website.language}
|
|
224
|
-
`;
|
|
225
|
-
}
|
|
226
|
-
return header;
|
|
227
|
-
}
|
|
228
|
-
function createMarkdownTOC(pages) {
|
|
229
|
-
let toc = "## Table of Contents\n\n";
|
|
230
|
-
pages.forEach((page, index) => {
|
|
231
|
-
const depth = " ".repeat(page.depth);
|
|
232
|
-
const pageNumber = index + 1;
|
|
233
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
234
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
235
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
236
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
237
|
-
`;
|
|
238
|
-
});
|
|
239
|
-
return toc;
|
|
240
|
-
}
|
|
241
|
-
function createMarkdownPage(page, pageNumber) {
|
|
242
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
243
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
244
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
245
|
-
let pageContent = `---
|
|
246
|
-
|
|
247
|
-
`;
|
|
248
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
249
|
-
|
|
250
|
-
`;
|
|
251
|
-
pageContent += `**URL:** ${page.url}
|
|
252
|
-
`;
|
|
253
|
-
pageContent += `**Title:** ${page.title}
|
|
254
|
-
`;
|
|
255
|
-
pageContent += `**Depth:** ${page.depth}
|
|
256
|
-
`;
|
|
257
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
258
|
-
|
|
259
|
-
`;
|
|
260
|
-
pageContent += `---
|
|
261
|
-
|
|
262
|
-
`;
|
|
263
|
-
const markdown = htmlToMarkdown(page.html);
|
|
264
|
-
pageContent += markdown;
|
|
265
|
-
return pageContent;
|
|
266
|
-
}
|
|
267
219
|
function htmlToMarkdown(html) {
|
|
268
220
|
try {
|
|
269
221
|
return turndownService.turndown(html);
|
|
@@ -272,596 +224,340 @@ function htmlToMarkdown(html) {
|
|
|
272
224
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
273
225
|
}
|
|
274
226
|
}
|
|
275
|
-
|
|
276
|
-
try {
|
|
277
|
-
return new URL(url).hostname;
|
|
278
|
-
} catch {
|
|
279
|
-
return "Unknown";
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
// src/formatters/html.ts
|
|
284
|
-
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
|
|
285
|
-
const html = `<!DOCTYPE html>
|
|
286
|
-
<html lang="${website.language || "en"}">
|
|
287
|
-
<head>
|
|
288
|
-
<meta charset="${website.charset || "UTF-8"}">
|
|
289
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
290
|
-
<title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
|
|
291
|
-
${generateMetaTags(website)}
|
|
292
|
-
<style>
|
|
293
|
-
${generateCSS()}
|
|
294
|
-
</style>
|
|
295
|
-
</head>
|
|
296
|
-
<body>
|
|
297
|
-
<header class="header">
|
|
298
|
-
<h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
|
|
299
|
-
<div class="meta-info">
|
|
300
|
-
<p><strong>Base URL:</strong> <a href="${escapeHtml(
|
|
301
|
-
baseUrl
|
|
302
|
-
)}" target="_blank">${escapeHtml(baseUrl)}</a></p>
|
|
303
|
-
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
|
|
304
|
-
<p><strong>Duration:</strong> ${duration}ms</p>
|
|
305
|
-
<p><strong>Total pages:</strong> ${pages.length}</p>
|
|
306
|
-
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
|
|
307
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
308
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
309
|
-
</div>
|
|
310
|
-
</header>
|
|
311
|
-
|
|
312
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
313
|
-
|
|
314
|
-
<main class="content">
|
|
315
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
316
|
-
</main>
|
|
317
|
-
|
|
318
|
-
<footer class="footer">
|
|
319
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
320
|
-
</footer>
|
|
321
|
-
|
|
322
|
-
<script>
|
|
323
|
-
${generateJavaScript()}
|
|
324
|
-
</script>
|
|
325
|
-
</body>
|
|
326
|
-
</html>`;
|
|
327
|
-
return html;
|
|
328
|
-
}
|
|
329
|
-
function generateMetaTags(website) {
|
|
330
|
-
const tags = [];
|
|
331
|
-
if (website.description) {
|
|
332
|
-
tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
|
|
333
|
-
}
|
|
334
|
-
if (website.author) {
|
|
335
|
-
tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
|
|
336
|
-
}
|
|
337
|
-
if (website.keywords) {
|
|
338
|
-
tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
|
|
339
|
-
}
|
|
340
|
-
if (website.robots) {
|
|
341
|
-
tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
|
|
342
|
-
}
|
|
343
|
-
if (website.themeColor) {
|
|
344
|
-
tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
|
|
345
|
-
}
|
|
346
|
-
if (website.favicon) {
|
|
347
|
-
tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
|
|
348
|
-
}
|
|
349
|
-
if (website.canonical) {
|
|
350
|
-
tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
|
|
351
|
-
}
|
|
352
|
-
if (website.openGraph) {
|
|
353
|
-
const og = website.openGraph;
|
|
354
|
-
if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
|
|
355
|
-
if (og.description)
|
|
356
|
-
tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
|
|
357
|
-
if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
|
|
358
|
-
if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
|
|
359
|
-
if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
|
|
360
|
-
if (og.siteName)
|
|
361
|
-
tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
|
|
362
|
-
if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
|
|
363
|
-
}
|
|
364
|
-
if (website.twitter) {
|
|
365
|
-
const twitter = website.twitter;
|
|
366
|
-
if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
|
|
367
|
-
if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
|
|
368
|
-
if (twitter.creator)
|
|
369
|
-
tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
|
|
370
|
-
if (twitter.title)
|
|
371
|
-
tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
|
|
372
|
-
if (twitter.description)
|
|
373
|
-
tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
|
|
374
|
-
if (twitter.image)
|
|
375
|
-
tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
|
|
376
|
-
}
|
|
377
|
-
return tags.join("\n ");
|
|
378
|
-
}
|
|
379
|
-
function generateCSS() {
|
|
380
|
-
return `
|
|
381
|
-
* {
|
|
382
|
-
margin: 0;
|
|
383
|
-
padding: 0;
|
|
384
|
-
box-sizing: border-box;
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
body {
|
|
388
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
389
|
-
line-height: 1.6;
|
|
390
|
-
color: #333;
|
|
391
|
-
background-color: #f8f9fa;
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
.header {
|
|
395
|
-
background: white;
|
|
396
|
-
padding: 2rem;
|
|
397
|
-
border-bottom: 1px solid #e9ecef;
|
|
398
|
-
margin-bottom: 2rem;
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
.header h1 {
|
|
402
|
-
color: #2c3e50;
|
|
403
|
-
margin-bottom: 1rem;
|
|
404
|
-
font-size: 2rem;
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
.meta-info {
|
|
408
|
-
display: grid;
|
|
409
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
410
|
-
gap: 0.5rem;
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
.meta-info p {
|
|
414
|
-
margin: 0.25rem 0;
|
|
415
|
-
font-size: 0.9rem;
|
|
416
|
-
color: #6c757d;
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
.toc {
|
|
420
|
-
background: white;
|
|
421
|
-
padding: 1.5rem;
|
|
422
|
-
margin: 2rem 0;
|
|
423
|
-
border-radius: 8px;
|
|
424
|
-
border: 1px solid #e9ecef;
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
.toc h2 {
|
|
428
|
-
color: #2c3e50;
|
|
429
|
-
margin-bottom: 1rem;
|
|
430
|
-
font-size: 1.25rem;
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
.toc ul {
|
|
434
|
-
list-style: none;
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
.toc li {
|
|
438
|
-
margin: 0.5rem 0;
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
.toc a {
|
|
442
|
-
color: #007bff;
|
|
443
|
-
text-decoration: none;
|
|
444
|
-
transition: color 0.2s;
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
.toc a:hover {
|
|
448
|
-
color: #0056b3;
|
|
449
|
-
text-decoration: underline;
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
.content {
|
|
453
|
-
max-width: 800px;
|
|
454
|
-
margin: 0 auto;
|
|
455
|
-
padding: 0 1rem;
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
.page {
|
|
459
|
-
background: white;
|
|
460
|
-
margin: 2rem 0;
|
|
461
|
-
padding: 2rem;
|
|
462
|
-
border-radius: 8px;
|
|
463
|
-
border: 1px solid #e9ecef;
|
|
464
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
.page-header {
|
|
468
|
-
border-bottom: 2px solid #e9ecef;
|
|
469
|
-
padding-bottom: 1rem;
|
|
470
|
-
margin-bottom: 2rem;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
.page-header h2 {
|
|
474
|
-
color: #2c3e50;
|
|
475
|
-
margin-bottom: 0.5rem;
|
|
476
|
-
font-size: 1.5rem;
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
.page-meta {
|
|
480
|
-
display: flex;
|
|
481
|
-
flex-wrap: wrap;
|
|
482
|
-
gap: 1rem;
|
|
483
|
-
font-size: 0.9rem;
|
|
484
|
-
color: #6c757d;
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
.page-content {
|
|
488
|
-
line-height: 1.8;
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
492
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
493
|
-
color: #2c3e50;
|
|
494
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
495
|
-
}
|
|
496
|
-
|
|
497
|
-
.page-content p {
|
|
498
|
-
margin: 1rem 0;
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
.page-content a {
|
|
502
|
-
color: #007bff;
|
|
503
|
-
text-decoration: none;
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
.page-content a:hover {
|
|
507
|
-
text-decoration: underline;
|
|
508
|
-
}
|
|
509
|
-
|
|
510
|
-
.page-content code {
|
|
511
|
-
background: #f8f9fa;
|
|
512
|
-
padding: 0.2rem 0.4rem;
|
|
513
|
-
border-radius: 4px;
|
|
514
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
515
|
-
font-size: 0.9em;
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
.page-content pre {
|
|
519
|
-
background: #f8f9fa;
|
|
520
|
-
padding: 1rem;
|
|
521
|
-
border-radius: 4px;
|
|
522
|
-
overflow-x: auto;
|
|
523
|
-
margin: 1rem 0;
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
.page-content blockquote {
|
|
527
|
-
border-left: 4px solid #007bff;
|
|
528
|
-
padding-left: 1rem;
|
|
529
|
-
margin: 1rem 0;
|
|
530
|
-
color: #6c757d;
|
|
531
|
-
}
|
|
532
|
-
|
|
533
|
-
.footer {
|
|
534
|
-
text-align: center;
|
|
535
|
-
padding: 2rem;
|
|
536
|
-
margin-top: 3rem;
|
|
537
|
-
border-top: 1px solid #e9ecef;
|
|
538
|
-
color: #6c757d;
|
|
539
|
-
font-size: 0.9rem;
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
@media (max-width: 768px) {
|
|
543
|
-
.header {
|
|
544
|
-
padding: 1rem;
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
.header h1 {
|
|
548
|
-
font-size: 1.5rem;
|
|
549
|
-
}
|
|
550
|
-
|
|
551
|
-
.page {
|
|
552
|
-
padding: 1rem;
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
.page-meta {
|
|
556
|
-
flex-direction: column;
|
|
557
|
-
gap: 0.5rem;
|
|
558
|
-
}
|
|
559
|
-
}
|
|
560
|
-
`.trim();
|
|
561
|
-
}
|
|
562
|
-
function generateTOC(pages) {
|
|
563
|
-
const tocItems = pages.map((page, index) => {
|
|
564
|
-
const pageNumber = index + 1;
|
|
565
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
566
|
-
const id = `page-${pageNumber}`;
|
|
567
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
568
|
-
}).join("\n");
|
|
569
|
-
return `
|
|
570
|
-
<nav class="toc">
|
|
571
|
-
<h2>Table of Contents</h2>
|
|
572
|
-
<ul>
|
|
573
|
-
${tocItems}
|
|
574
|
-
</ul>
|
|
575
|
-
</nav>`;
|
|
576
|
-
}
|
|
577
|
-
function generatePageHTML(page, pageNumber) {
|
|
578
|
-
const id = `page-${pageNumber}`;
|
|
579
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
580
|
-
return `
|
|
581
|
-
<article class="page" id="${id}">
|
|
582
|
-
<div class="page-header">
|
|
583
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
584
|
-
<div class="page-meta">
|
|
585
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
586
|
-
page.url
|
|
587
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
588
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
589
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
590
|
-
</div>
|
|
591
|
-
</div>
|
|
592
|
-
<div class="page-content">
|
|
593
|
-
${page.html}
|
|
594
|
-
</div>
|
|
595
|
-
</article>`;
|
|
596
|
-
}
|
|
597
|
-
function generateJavaScript() {
|
|
598
|
-
return `
|
|
599
|
-
// Smooth scrolling for TOC links
|
|
600
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
601
|
-
anchor.addEventListener('click', function (e) {
|
|
602
|
-
e.preventDefault();
|
|
603
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
604
|
-
if (target) {
|
|
605
|
-
target.scrollIntoView({
|
|
606
|
-
behavior: 'smooth',
|
|
607
|
-
block: 'start'
|
|
608
|
-
});
|
|
609
|
-
}
|
|
610
|
-
});
|
|
611
|
-
});
|
|
612
|
-
|
|
613
|
-
// Highlight current section in TOC
|
|
614
|
-
window.addEventListener('scroll', function() {
|
|
615
|
-
const pages = document.querySelectorAll('.page');
|
|
616
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
617
|
-
|
|
618
|
-
let currentPage = null;
|
|
619
|
-
pages.forEach(page => {
|
|
620
|
-
const rect = page.getBoundingClientRect();
|
|
621
|
-
if (rect.top <= 100) {
|
|
622
|
-
currentPage = page;
|
|
623
|
-
}
|
|
624
|
-
});
|
|
625
|
-
|
|
626
|
-
tocLinks.forEach(link => {
|
|
627
|
-
link.style.fontWeight = 'normal';
|
|
628
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
629
|
-
if (target === currentPage) {
|
|
630
|
-
link.style.fontWeight = 'bold';
|
|
631
|
-
}
|
|
632
|
-
});
|
|
633
|
-
});
|
|
634
|
-
`;
|
|
635
|
-
}
|
|
636
|
-
function escapeHtml(text) {
|
|
637
|
-
return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'").replace(/\//g, "/");
|
|
638
|
-
}
|
|
639
|
-
function extractDomainFromUrl2(url) {
|
|
640
|
-
try {
|
|
641
|
-
return new URL(url).hostname;
|
|
642
|
-
} catch {
|
|
643
|
-
return "Unknown";
|
|
644
|
-
}
|
|
645
|
-
}
|
|
646
|
-
|
|
647
|
-
// src/formatters/json.ts
|
|
648
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
649
|
-
const jsonResult = {
|
|
650
|
-
metadata: {
|
|
651
|
-
baseUrl,
|
|
652
|
-
totalPages: pages.length,
|
|
653
|
-
scrapedAt,
|
|
654
|
-
duration,
|
|
655
|
-
website
|
|
656
|
-
},
|
|
657
|
-
pages: pages.map((page, index) => ({
|
|
658
|
-
index: index + 1,
|
|
659
|
-
url: page.url,
|
|
660
|
-
title: page.title,
|
|
661
|
-
markdown: page.markdown,
|
|
662
|
-
html: page.html,
|
|
663
|
-
fetchedAt: page.fetchedAt,
|
|
664
|
-
depth: page.depth,
|
|
665
|
-
wordCount: countWords(page.markdown),
|
|
666
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
667
|
-
}))
|
|
668
|
-
};
|
|
669
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
670
|
-
}
|
|
671
|
-
function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
|
|
672
|
-
const jsonResult = {
|
|
673
|
-
metadata: {
|
|
674
|
-
baseUrl,
|
|
675
|
-
totalPages: pages.length,
|
|
676
|
-
scrapedAt,
|
|
677
|
-
duration,
|
|
678
|
-
website
|
|
679
|
-
},
|
|
680
|
-
pages: pages.map((page, index) => ({
|
|
681
|
-
index: index + 1,
|
|
682
|
-
url: page.url,
|
|
683
|
-
title: page.title,
|
|
684
|
-
markdown: page.markdown,
|
|
685
|
-
fetchedAt: page.fetchedAt,
|
|
686
|
-
depth: page.depth,
|
|
687
|
-
wordCount: countWords(page.markdown),
|
|
688
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
689
|
-
}))
|
|
690
|
-
};
|
|
691
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
692
|
-
}
|
|
693
|
-
function countWords(markdown) {
|
|
694
|
-
const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
695
|
-
return plainText.split(/\s+/).filter((word) => word.length > 0).length;
|
|
696
|
-
}
|
|
697
|
-
function estimateReadingTime(markdown) {
|
|
698
|
-
const wordCount = countWords(markdown);
|
|
699
|
-
return Math.ceil(wordCount / 200);
|
|
700
|
-
}
|
|
701
|
-
|
|
702
|
-
// src/formatters/text.ts
|
|
703
|
-
import { parseHTML } from "linkedom";
|
|
704
|
-
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
705
|
-
const sections = [];
|
|
706
|
-
if (includeMetadata) {
|
|
707
|
-
sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
708
|
-
}
|
|
709
|
-
sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
|
|
710
|
-
return sections.join("\n\n");
|
|
711
|
-
}
|
|
712
|
-
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
713
|
-
const title = website.title || extractDomainFromUrl3(baseUrl);
|
|
714
|
-
const lines = [];
|
|
715
|
-
lines.push(`=== ${title} ===`);
|
|
716
|
-
lines.push("");
|
|
717
|
-
lines.push(`URL: ${baseUrl}`);
|
|
718
|
-
lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
|
|
719
|
-
lines.push(`Duration: ${duration}ms`);
|
|
720
|
-
lines.push(`Pages: ${totalPages}`);
|
|
721
|
-
if (website.description) {
|
|
722
|
-
lines.push(`Description: ${website.description}`);
|
|
723
|
-
}
|
|
724
|
-
if (website.author) {
|
|
725
|
-
lines.push(`Author: ${website.author}`);
|
|
726
|
-
}
|
|
727
|
-
if (website.language) {
|
|
728
|
-
lines.push(`Language: ${website.language}`);
|
|
729
|
-
}
|
|
730
|
-
return lines.join("\n");
|
|
731
|
-
}
|
|
732
|
-
function createTextPage(page, pageNumber, showSeparator) {
|
|
733
|
-
const lines = [];
|
|
734
|
-
if (showSeparator) {
|
|
735
|
-
lines.push("\u2500".repeat(60));
|
|
736
|
-
lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
|
|
737
|
-
lines.push(`URL: ${page.url}`);
|
|
738
|
-
lines.push("\u2500".repeat(60));
|
|
739
|
-
}
|
|
740
|
-
const plainText = htmlToPlainText(page.html);
|
|
741
|
-
lines.push(plainText);
|
|
742
|
-
return lines.join("\n");
|
|
743
|
-
}
|
|
744
|
-
function htmlToPlainText(html) {
|
|
745
|
-
const { document } = parseHTML(html);
|
|
746
|
-
const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
|
|
747
|
-
elementsToRemove.forEach((tag) => {
|
|
748
|
-
document.querySelectorAll(tag).forEach((el) => el.remove());
|
|
749
|
-
});
|
|
750
|
-
let text = document.body?.textContent || document.documentElement?.textContent || "";
|
|
751
|
-
text = text.replace(/[ \t]+/g, " ");
|
|
752
|
-
text = text.replace(/\n[ \t]+/g, "\n");
|
|
753
|
-
text = text.replace(/[ \t]+\n/g, "\n");
|
|
754
|
-
text = text.replace(/\n{3,}/g, "\n\n");
|
|
755
|
-
text = text.trim();
|
|
756
|
-
return text;
|
|
757
|
-
}
|
|
758
|
-
function extractDomainFromUrl3(url) {
|
|
759
|
-
try {
|
|
760
|
-
return new URL(url).hostname;
|
|
761
|
-
} catch {
|
|
762
|
-
return "Unknown";
|
|
763
|
-
}
|
|
764
|
-
}
|
|
227
|
+
var formatToMarkdown = htmlToMarkdown;
|
|
765
228
|
|
|
766
229
|
// src/utils/content-cleaner.ts
|
|
767
|
-
import { parseHTML
|
|
230
|
+
import { parseHTML } from "linkedom";
|
|
768
231
|
var ALWAYS_REMOVE_SELECTORS = [
|
|
769
|
-
// Navigation and menus
|
|
770
|
-
"nav",
|
|
771
|
-
"header nav",
|
|
772
|
-
"footer nav",
|
|
773
|
-
".nav",
|
|
774
|
-
".navigation",
|
|
775
|
-
".menu",
|
|
776
|
-
".navbar",
|
|
777
|
-
".sidebar",
|
|
778
|
-
".aside",
|
|
779
|
-
// Header and footer elements
|
|
780
|
-
"header",
|
|
781
|
-
"footer",
|
|
782
|
-
".site-header",
|
|
783
|
-
".page-header",
|
|
784
|
-
".site-footer",
|
|
785
|
-
".page-footer",
|
|
786
|
-
// Social media and sharing
|
|
787
|
-
".social",
|
|
788
|
-
".share",
|
|
789
|
-
".sharing",
|
|
790
|
-
".twitter",
|
|
791
|
-
".facebook",
|
|
792
|
-
".linkedin",
|
|
793
|
-
".instagram",
|
|
794
|
-
// Comments and discussions
|
|
795
|
-
".comments",
|
|
796
|
-
".comment",
|
|
797
|
-
".discussion",
|
|
798
|
-
".disqus",
|
|
799
|
-
// Forms and interactive elements
|
|
800
|
-
"form",
|
|
801
|
-
"input",
|
|
802
|
-
"button:not([type='submit'])",
|
|
803
|
-
"select",
|
|
804
|
-
"textarea",
|
|
805
232
|
// Scripts and styles
|
|
806
233
|
"script",
|
|
807
234
|
"style",
|
|
808
235
|
"noscript",
|
|
236
|
+
"link[rel='stylesheet']",
|
|
809
237
|
// Hidden elements
|
|
810
238
|
"[hidden]",
|
|
239
|
+
"[aria-hidden='true']",
|
|
811
240
|
"[style*='display: none']",
|
|
812
241
|
"[style*='display:none']",
|
|
813
|
-
|
|
814
|
-
"
|
|
815
|
-
|
|
816
|
-
"
|
|
242
|
+
"[style*='visibility: hidden']",
|
|
243
|
+
"[style*='visibility:hidden']",
|
|
244
|
+
// SVG icons and decorative elements
|
|
245
|
+
"svg[aria-hidden='true']",
|
|
246
|
+
"svg.icon",
|
|
247
|
+
"svg[class*='icon']",
|
|
248
|
+
// Template and metadata
|
|
249
|
+
"template",
|
|
250
|
+
"meta",
|
|
251
|
+
// Embeds that don't convert to text
|
|
252
|
+
"iframe",
|
|
253
|
+
"canvas",
|
|
254
|
+
"object",
|
|
255
|
+
"embed",
|
|
256
|
+
// Forms (usually not main content)
|
|
257
|
+
"form",
|
|
258
|
+
"input",
|
|
259
|
+
"select",
|
|
260
|
+
"textarea",
|
|
261
|
+
"button"
|
|
262
|
+
];
|
|
263
|
+
var OVERLAY_SELECTORS = [
|
|
264
|
+
"[class*='modal']",
|
|
265
|
+
"[class*='popup']",
|
|
266
|
+
"[class*='overlay']",
|
|
267
|
+
"[class*='dialog']",
|
|
268
|
+
"[role='dialog']",
|
|
269
|
+
"[role='alertdialog']",
|
|
270
|
+
"[class*='cookie']",
|
|
271
|
+
"[class*='consent']",
|
|
272
|
+
"[class*='gdpr']",
|
|
273
|
+
"[class*='privacy-banner']",
|
|
274
|
+
"[class*='notification-bar']",
|
|
275
|
+
"[id*='cookie']",
|
|
276
|
+
"[id*='consent']",
|
|
277
|
+
"[id*='gdpr']",
|
|
278
|
+
// Fixed/sticky positioned elements
|
|
279
|
+
"[style*='position: fixed']",
|
|
280
|
+
"[style*='position:fixed']",
|
|
281
|
+
"[style*='position: sticky']",
|
|
282
|
+
"[style*='position:sticky']"
|
|
283
|
+
];
|
|
284
|
+
var NAVIGATION_SELECTORS = [
|
|
285
|
+
// Semantic elements
|
|
286
|
+
"header",
|
|
287
|
+
"footer",
|
|
288
|
+
"nav",
|
|
289
|
+
"aside",
|
|
290
|
+
// Header variations
|
|
291
|
+
".header",
|
|
292
|
+
".top",
|
|
293
|
+
".navbar",
|
|
294
|
+
"#header",
|
|
295
|
+
// Footer variations
|
|
296
|
+
".footer",
|
|
297
|
+
".bottom",
|
|
298
|
+
"#footer",
|
|
299
|
+
// Sidebars
|
|
300
|
+
".sidebar",
|
|
301
|
+
".side",
|
|
302
|
+
".aside",
|
|
303
|
+
"#sidebar",
|
|
304
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
817
305
|
".modal",
|
|
306
|
+
".popup",
|
|
307
|
+
"#modal",
|
|
818
308
|
".overlay",
|
|
819
|
-
|
|
309
|
+
// Ads
|
|
310
|
+
".ad",
|
|
311
|
+
".ads",
|
|
312
|
+
".advert",
|
|
313
|
+
"#ad",
|
|
314
|
+
// Language selectors
|
|
315
|
+
".lang-selector",
|
|
316
|
+
".language",
|
|
317
|
+
"#language-selector",
|
|
318
|
+
// Social
|
|
319
|
+
".social",
|
|
320
|
+
".social-media",
|
|
321
|
+
".social-links",
|
|
322
|
+
"#social",
|
|
323
|
+
// Navigation/menus
|
|
324
|
+
".menu",
|
|
325
|
+
".navigation",
|
|
326
|
+
"#nav",
|
|
820
327
|
// Breadcrumbs
|
|
821
|
-
".breadcrumb",
|
|
822
328
|
".breadcrumbs",
|
|
823
|
-
"
|
|
329
|
+
"#breadcrumbs",
|
|
330
|
+
// Share buttons
|
|
331
|
+
".share",
|
|
332
|
+
"#share",
|
|
333
|
+
// Widgets
|
|
334
|
+
".widget",
|
|
335
|
+
"#widget",
|
|
336
|
+
// Cookie notices (backup)
|
|
337
|
+
".cookie",
|
|
338
|
+
"#cookie"
|
|
339
|
+
];
|
|
340
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
341
|
+
// IDs
|
|
342
|
+
"#main",
|
|
343
|
+
"#content",
|
|
344
|
+
"#main-content",
|
|
345
|
+
"#article",
|
|
346
|
+
"#post",
|
|
347
|
+
"#page-content",
|
|
348
|
+
// Semantic elements
|
|
349
|
+
"main",
|
|
350
|
+
"article",
|
|
351
|
+
"[role='main']",
|
|
352
|
+
// Classes
|
|
353
|
+
".main-content",
|
|
354
|
+
".content",
|
|
355
|
+
".post-content",
|
|
356
|
+
".article-content",
|
|
357
|
+
".entry-content",
|
|
358
|
+
".page-content",
|
|
359
|
+
".article-body",
|
|
360
|
+
".post-body",
|
|
361
|
+
".story-content",
|
|
362
|
+
".blog-content"
|
|
824
363
|
];
|
|
825
364
|
var AD_SELECTORS = [
|
|
826
|
-
//
|
|
827
|
-
".
|
|
828
|
-
".ads",
|
|
829
|
-
".advertisement",
|
|
830
|
-
".promotion",
|
|
831
|
-
".sponsored",
|
|
832
|
-
"[class*='ad-']",
|
|
833
|
-
"[id*='ad-']",
|
|
834
|
-
"[class*='advert']",
|
|
835
|
-
"[id*='advert']",
|
|
836
|
-
"[class*='banner']",
|
|
837
|
-
"[id*='banner']",
|
|
365
|
+
// Google ads
|
|
366
|
+
"ins.adsbygoogle",
|
|
838
367
|
".google-ad",
|
|
839
368
|
".adsense",
|
|
369
|
+
// Generic ad containers
|
|
840
370
|
"[data-ad]",
|
|
841
371
|
"[data-ads]",
|
|
842
|
-
"
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
"
|
|
846
|
-
"
|
|
847
|
-
"
|
|
372
|
+
"[data-ad-slot]",
|
|
373
|
+
"[data-ad-client]",
|
|
374
|
+
// Common ad class patterns
|
|
375
|
+
".ad-container",
|
|
376
|
+
".ad-wrapper",
|
|
377
|
+
".advertisement",
|
|
378
|
+
".sponsored-content",
|
|
379
|
+
// Tracking pixels
|
|
380
|
+
"img[width='1'][height='1']",
|
|
381
|
+
"img[src*='pixel']",
|
|
382
|
+
"img[src*='tracking']",
|
|
383
|
+
"img[src*='analytics']"
|
|
848
384
|
];
|
|
849
|
-
function
|
|
850
|
-
const
|
|
851
|
-
const
|
|
852
|
-
|
|
385
|
+
/**
 * Fraction of an element's text that sits inside `<a>` descendants.
 *
 * Both the overall text and each anchor's text are trimmed before measuring.
 * An element with no text at all is reported as fully link-dense (1).
 *
 * @param element - DOM-like node exposing `textContent` and `querySelectorAll`.
 * @returns {number} anchor text length divided by total text length.
 */
function getLinkDensity(element) {
  const totalChars = (element.textContent || "").trim().length;
  if (totalChars === 0) {
    return 1;
  }
  const anchorChars = Array.from(element.querySelectorAll("a")).reduce(
    (sum, anchor) => sum + (anchor.textContent || "").trim().length,
    0
  );
  return anchorChars / totalChars;
}
|
|
395
|
+
/**
 * Heuristic "how much does this look like main content" score for an element.
 *
 * Scoring, in order:
 *  - text length: +1 per 100 trimmed characters, capped at 50
 *  - +3 per <p>, +2 per heading (h1-h6), +1 per <img>
 *  - -0.5 per <a>, -0.2 per <li>
 *  - link density above 0.5 costs 30, above 0.3 costs 15
 *  - class/id hinting at content (article, content, post, ...) adds 25
 *  - class/id hinting at page chrome or ads subtracts 25
 *
 * The ad hint only fires on `ad` / `ads` as a whole name token (for example
 * "ad-wrapper", "top_ad", "sidebarAds") or on "advert", "adsense",
 * "adsbygoogle". A bare substring test for "ad" would also hit ordinary names
 * such as "thread", "reading" or "breadcrumb" and wrongly penalise them.
 *
 * @param element - DOM-like node exposing `textContent`, `className`, `id`
 *   and `querySelectorAll`.
 * @returns {number} the score; higher means more content-like.
 */
function getContentScore(element) {
  let score = 0;
  const text = element.textContent || "";
  const textLength = text.trim().length;
  score += Math.min(textLength / 100, 50);
  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length * 1;
  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) score -= 30;
  else if (linkDensity > 0.3) score -= 15;

  const classAndId = (element.className || "") + " " + (element.id || "");
  if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;

  // Break the names into lowercase letter-only tokens, treating camelCase
  // humps as separators, so "ad" is recognised only as a word of its own.
  const nameTokens = classAndId
    .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
    .toLowerCase()
    .split(/[^a-z]+/);
  const hasAdToken = nameTokens.some((token) => token === "ad" || token === "ads");
  const hasChromeHint =
    /comment|sidebar|footer|nav|menu|header|widget|advert|adsense|adsbygoogle/i.test(classAndId);
  if (hasChromeHint || hasAdToken) score -= 25;

  return score;
}
|
|
413
|
+
/**
 * Decide whether an element is probably a navigation block.
 *
 * True when more than half of its text is link text, or when it holds more
 * than five list items and the number of links exceeds 80% of the number of
 * list items.
 *
 * @param element - DOM-like node exposing `querySelectorAll`.
 * @returns {boolean}
 */
function looksLikeNavigation(element) {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }
  const itemCount = element.querySelectorAll("li").length;
  const anchorCount = element.querySelectorAll("a").length;
  return itemCount > 5 && anchorCount > itemCount * 0.8;
}
|
|
421
|
+
/**
 * Remove every element matching any of the given selectors.
 *
 * Each selector is handled independently: if querying or removing throws
 * (for example because the selector is not valid), that selector is skipped
 * and processing continues with the next one.
 *
 * @param document - DOM-like document exposing `querySelectorAll`.
 * @param selectors - iterable of CSS selector strings.
 */
function removeElements(document, selectors) {
  const dropMatches = (selector) => {
    try {
      for (const node of Array.from(document.querySelectorAll(selector))) {
        node.remove();
      }
    } catch {
      // Best effort: a failing selector must not abort the cleanup.
    }
  };
  for (const selector of selectors) {
    dropMatches(selector);
  }
}
|
|
429
|
+
/**
 * Remove elements matching `selectorsToRemove`, sparing protected content.
 *
 * A matched element is kept when it matches one of `protectedSelectors`
 * itself, or when it has a descendant that does. A protected selector that
 * throws while being tested counts as "no match"; a removal selector that
 * throws is skipped.
 *
 * @param document - DOM-like document exposing `querySelectorAll`.
 * @param selectorsToRemove - iterable of CSS selectors whose matches should go.
 * @param protectedSelectors - array of CSS selectors that must survive.
 */
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
  // Runs `probe` against each protected selector, treating errors as a miss.
  const anyProtected = (probe) =>
    protectedSelectors.some((ps) => {
      try {
        return probe(ps);
      } catch {
        return false;
      }
    });

  for (const selector of selectorsToRemove) {
    try {
      for (const node of Array.from(document.querySelectorAll(selector))) {
        if (anyProtected((ps) => node.matches(ps))) continue;
        if (anyProtected((ps) => node.querySelector(ps) !== null)) continue;
        node.remove();
      }
    } catch {
      // Best effort: ignore selectors the DOM implementation rejects.
    }
  }
}
|
|
455
|
+
/**
 * Locate the element most likely to hold the page's main content.
 *
 * Strategies are tried in order and the first hit wins:
 *  1. `<main>`, then `[role="main"]`, if valid and link density is below 0.4
 *  2. the only `<article>` on the page, if valid
 *  3. the first valid, low-link-density match among common content ids/classes
 *  4. the best-scoring div/section/article with at least 200 characters of
 *     text, provided its score exceeds 20
 *
 * "Valid" means the element exists, has at least 100 characters of trimmed
 * text and does not look like navigation.
 *
 * @param document - DOM-like document exposing `querySelector(All)`.
 * @returns the chosen element, or null when nothing qualifies.
 */
function findMainContent(document) {
  const isValidContent = (el) =>
    Boolean(el) &&
    (el.textContent || "").trim().length >= 100 &&
    !looksLikeNavigation(el);
  const isLowLinkContent = (el) => isValidContent(el) && getLinkDensity(el) < 0.4;

  // 1. Semantic landmarks.
  for (const landmark of ["main", '[role="main"]']) {
    const el = document.querySelector(landmark);
    if (isLowLinkContent(el)) {
      return el;
    }
  }

  // 2. A lone <article>.
  const articles = document.querySelectorAll("article");
  if (articles.length === 1 && isValidContent(articles[0])) {
    return articles[0];
  }

  // 3. Conventional content containers.
  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content"
  ];
  for (const selector of contentSelectors) {
    try {
      const el = document.querySelector(selector);
      if (isLowLinkContent(el)) {
        return el;
      }
    } catch {
      // Ignore this selector and try the next one.
    }
  }

  // 4. Score generic containers; on equal scores the earliest one is kept.
  let best = null;
  document.querySelectorAll("div, section, article").forEach((el) => {
    if ((el.textContent || "").trim().length < 200) return;
    const score = getContentScore(el);
    if (score > 0 && (best === null || score > best.score)) {
      best = { el, score };
    }
  });
  return best !== null && best.score > 20 ? best.el : null;
}
|
|
515
|
+
function cleanHtml(html, baseUrl, options = {}) {
|
|
516
|
+
const {
|
|
517
|
+
removeAds = true,
|
|
518
|
+
removeBase64Images = true,
|
|
519
|
+
onlyMainContent = true,
|
|
520
|
+
includeTags,
|
|
521
|
+
excludeTags
|
|
522
|
+
} = options;
|
|
523
|
+
const { document } = parseHTML(html);
|
|
524
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
525
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
858
526
|
if (removeAds) {
|
|
859
|
-
|
|
527
|
+
removeElements(document, AD_SELECTORS);
|
|
528
|
+
}
|
|
529
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
530
|
+
removeElements(document, excludeTags);
|
|
531
|
+
}
|
|
532
|
+
if (onlyMainContent) {
|
|
533
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
534
|
+
const mainContent = findMainContent(document);
|
|
535
|
+
if (mainContent) {
|
|
536
|
+
const body = document.body;
|
|
537
|
+
if (body) {
|
|
538
|
+
const clone = mainContent.cloneNode(true);
|
|
539
|
+
body.innerHTML = "";
|
|
540
|
+
body.appendChild(clone);
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
if (includeTags && includeTags.length > 0) {
|
|
545
|
+
const matchedElements = [];
|
|
546
|
+
for (const selector of includeTags) {
|
|
860
547
|
try {
|
|
861
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
548
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
549
|
+
matchedElements.push(el.cloneNode(true));
|
|
550
|
+
});
|
|
862
551
|
} catch {
|
|
863
552
|
}
|
|
864
553
|
}
|
|
554
|
+
if (matchedElements.length > 0) {
|
|
555
|
+
const body = document.body;
|
|
556
|
+
if (body) {
|
|
557
|
+
body.innerHTML = "";
|
|
558
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
559
|
+
}
|
|
560
|
+
}
|
|
865
561
|
}
|
|
866
562
|
if (removeBase64Images) {
|
|
867
563
|
removeBase64ImagesFromDocument(document);
|
|
@@ -886,7 +582,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
886
582
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
887
583
|
const style = el.getAttribute("style");
|
|
888
584
|
if (style) {
|
|
889
|
-
const cleanedStyle = style.replace(
|
|
585
|
+
const cleanedStyle = style.replace(
|
|
586
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
587
|
+
""
|
|
588
|
+
);
|
|
890
589
|
if (cleanedStyle.trim()) {
|
|
891
590
|
el.setAttribute("style", cleanedStyle);
|
|
892
591
|
} else {
|
|
@@ -923,7 +622,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
923
622
|
}
|
|
924
623
|
|
|
925
624
|
// src/utils/metadata-extractor.ts
|
|
926
|
-
import { parseHTML as
|
|
625
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
927
626
|
|
|
928
627
|
// src/utils/url-helpers.ts
|
|
929
628
|
import { URL as URL2 } from "url";
|
|
@@ -996,8 +695,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
996
695
|
function getUrlKey(url) {
|
|
997
696
|
try {
|
|
998
697
|
const parsedUrl = new URL2(url);
|
|
698
|
+
parsedUrl.hash = "";
|
|
999
699
|
parsedUrl.search = "";
|
|
1000
|
-
|
|
700
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
701
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
702
|
+
}
|
|
703
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
704
|
+
parsedUrl.port = "";
|
|
705
|
+
}
|
|
706
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
707
|
+
for (const indexFile of indexFiles) {
|
|
708
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
709
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
710
|
+
break;
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
714
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
715
|
+
normalized = normalized.slice(0, -1);
|
|
716
|
+
}
|
|
717
|
+
return normalized;
|
|
1001
718
|
} catch {
|
|
1002
719
|
return url.toLowerCase();
|
|
1003
720
|
}
|
|
@@ -1232,7 +949,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1232
949
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1233
950
|
}
|
|
1234
951
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1235
|
-
const { document } =
|
|
952
|
+
const { document } = parseHTML2(html);
|
|
1236
953
|
const metadata = {
|
|
1237
954
|
title: null,
|
|
1238
955
|
description: null,
|
|
@@ -1514,13 +1231,15 @@ function isUrlAllowed(url, rules) {
|
|
|
1514
1231
|
var DEFAULT_OPTIONS = {
|
|
1515
1232
|
urls: [],
|
|
1516
1233
|
formats: ["markdown"],
|
|
1517
|
-
includeMetadata: true,
|
|
1518
1234
|
timeoutMs: 3e4,
|
|
1519
1235
|
includePatterns: [],
|
|
1520
1236
|
excludePatterns: [],
|
|
1521
1237
|
// Content cleaning defaults
|
|
1522
1238
|
removeAds: true,
|
|
1523
1239
|
removeBase64Images: true,
|
|
1240
|
+
onlyMainContent: true,
|
|
1241
|
+
includeTags: [],
|
|
1242
|
+
excludeTags: [],
|
|
1524
1243
|
skipTLSVerification: true,
|
|
1525
1244
|
// Batch defaults
|
|
1526
1245
|
batchConcurrency: 1,
|
|
@@ -1534,7 +1253,7 @@ var DEFAULT_OPTIONS = {
|
|
|
1534
1253
|
showChrome: false
|
|
1535
1254
|
};
|
|
1536
1255
|
function isValidFormat(format) {
|
|
1537
|
-
return format === "markdown" || format === "html"
|
|
1256
|
+
return format === "markdown" || format === "html";
|
|
1538
1257
|
}
|
|
1539
1258
|
function shouldCrawlUrl2(url, baseDomain) {
|
|
1540
1259
|
return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
|
|
@@ -1683,14 +1402,9 @@ var Scraper = class {
|
|
|
1683
1402
|
} catch {
|
|
1684
1403
|
}
|
|
1685
1404
|
await hero.waitForPaintingStable();
|
|
1686
|
-
let hadChallenge = false;
|
|
1687
|
-
let challengeType = "none";
|
|
1688
|
-
let waitTimeMs = 0;
|
|
1689
1405
|
const initialUrl = await hero.url;
|
|
1690
1406
|
const detection = await detectChallenge(hero);
|
|
1691
1407
|
if (detection.isChallenge) {
|
|
1692
|
-
hadChallenge = true;
|
|
1693
|
-
challengeType = detection.type;
|
|
1694
1408
|
if (this.options.verbose) {
|
|
1695
1409
|
this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
|
|
1696
1410
|
}
|
|
@@ -1700,12 +1414,11 @@ var Scraper = class {
|
|
|
1700
1414
|
verbose: this.options.verbose,
|
|
1701
1415
|
initialUrl
|
|
1702
1416
|
});
|
|
1703
|
-
waitTimeMs = result2.waitedMs;
|
|
1704
1417
|
if (!result2.resolved) {
|
|
1705
1418
|
throw new Error(`Challenge not resolved: ${detection.type}`);
|
|
1706
1419
|
}
|
|
1707
1420
|
if (this.options.verbose) {
|
|
1708
|
-
this.logger.info(`Challenge resolved via ${result2.method} in ${
|
|
1421
|
+
this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
|
|
1709
1422
|
}
|
|
1710
1423
|
}
|
|
1711
1424
|
await this.waitForFinalPage(hero, url, this.options.verbose);
|
|
@@ -1718,45 +1431,18 @@ var Scraper = class {
|
|
|
1718
1431
|
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1719
1432
|
}
|
|
1720
1433
|
}
|
|
1721
|
-
const pageTitle = await hero.document.title;
|
|
1722
1434
|
const html = await hero.document.documentElement.outerHTML;
|
|
1723
1435
|
const cleanedHtml = cleanContent(html, url, {
|
|
1724
1436
|
removeAds: this.options.removeAds,
|
|
1725
|
-
removeBase64Images: this.options.removeBase64Images
|
|
1437
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1438
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1439
|
+
includeTags: this.options.includeTags,
|
|
1440
|
+
excludeTags: this.options.excludeTags
|
|
1726
1441
|
});
|
|
1727
1442
|
const websiteMetadata = extractMetadata(cleanedHtml, url);
|
|
1728
1443
|
const duration = Date.now() - startTime;
|
|
1729
|
-
const
|
|
1730
|
-
const
|
|
1731
|
-
url,
|
|
1732
|
-
title: pageTitle,
|
|
1733
|
-
markdown: "",
|
|
1734
|
-
// Will be set by formatter
|
|
1735
|
-
html: cleanedHtml,
|
|
1736
|
-
fetchedAt: scrapedAt,
|
|
1737
|
-
depth: 0,
|
|
1738
|
-
hadChallenge,
|
|
1739
|
-
challengeType,
|
|
1740
|
-
waitTimeMs
|
|
1741
|
-
};
|
|
1742
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1743
|
-
[page],
|
|
1744
|
-
url,
|
|
1745
|
-
scrapedAt,
|
|
1746
|
-
duration,
|
|
1747
|
-
websiteMetadata,
|
|
1748
|
-
this.options.includeMetadata
|
|
1749
|
-
) : void 0;
|
|
1750
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1751
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1752
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1753
|
-
[page],
|
|
1754
|
-
url,
|
|
1755
|
-
scrapedAt,
|
|
1756
|
-
duration,
|
|
1757
|
-
websiteMetadata,
|
|
1758
|
-
this.options.includeMetadata
|
|
1759
|
-
) : void 0;
|
|
1444
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1445
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1760
1446
|
if (this.options.onProgress) {
|
|
1761
1447
|
this.options.onProgress({
|
|
1762
1448
|
completed: index + 1,
|
|
@@ -1788,8 +1474,6 @@ var Scraper = class {
|
|
|
1788
1474
|
const result = {
|
|
1789
1475
|
markdown,
|
|
1790
1476
|
html: htmlOutput,
|
|
1791
|
-
json,
|
|
1792
|
-
text,
|
|
1793
1477
|
metadata: {
|
|
1794
1478
|
baseUrl: url,
|
|
1795
1479
|
totalPages: 1,
|
|
@@ -1844,7 +1528,7 @@ async function scrape(options) {
|
|
|
1844
1528
|
}
|
|
1845
1529
|
|
|
1846
1530
|
// src/crawler.ts
|
|
1847
|
-
import { parseHTML as
|
|
1531
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1848
1532
|
|
|
1849
1533
|
// src/utils/rate-limiter.ts
|
|
1850
1534
|
import pLimit2 from "p-limit";
|
|
@@ -1993,12 +1677,26 @@ var Crawler = class {
|
|
|
1993
1677
|
*/
|
|
1994
1678
|
extractLinks(html, baseUrl, depth) {
|
|
1995
1679
|
const links = [];
|
|
1996
|
-
const { document } =
|
|
1680
|
+
const { document } = parseHTML3(html);
|
|
1997
1681
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1998
|
-
const
|
|
1682
|
+
const rawHref = anchor.getAttribute("href");
|
|
1683
|
+
if (!rawHref) return;
|
|
1684
|
+
const href = rawHref.trim();
|
|
1999
1685
|
if (!href) return;
|
|
2000
|
-
|
|
1686
|
+
if (href.startsWith("#")) return;
|
|
1687
|
+
const lowerHref = href.toLowerCase();
|
|
1688
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
1689
|
+
return;
|
|
1690
|
+
}
|
|
1691
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
2001
1692
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
1693
|
+
try {
|
|
1694
|
+
const parsed = new URL(resolved);
|
|
1695
|
+
parsed.hash = "";
|
|
1696
|
+
resolved = parsed.toString();
|
|
1697
|
+
} catch {
|
|
1698
|
+
return;
|
|
1699
|
+
}
|
|
2002
1700
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
2003
1701
|
if (!isContentUrl(resolved)) return;
|
|
2004
1702
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -3046,16 +2744,251 @@ async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
|
|
|
3046
2744
|
const client = new DaemonClient({ port, timeoutMs: 5e3 });
|
|
3047
2745
|
return client.isRunning();
|
|
3048
2746
|
}
|
|
2747
|
+
|
|
2748
|
+
// src/formatters/html.ts
|
|
2749
|
+
/**
 * HTML "formatter": returns the supplied HTML unchanged.
 *
 * @param html - the HTML to output.
 * @returns the same value that was passed in.
 */
function formatToHTML(html) {
  const output = html;
  return output;
}
|
|
2752
|
+
|
|
2753
|
+
// src/errors.ts
|
|
2754
|
+
/**
 * Error codes carried by ReaderError instances.
 * Every member's value is the same string as its key.
 */
var ReaderErrorCode = {
  // Network
  NETWORK_ERROR: "NETWORK_ERROR",
  TIMEOUT: "TIMEOUT",
  CONNECTION_REFUSED: "CONNECTION_REFUSED",
  // Blocking / bot protection
  CLOUDFLARE_CHALLENGE: "CLOUDFLARE_CHALLENGE",
  BOT_DETECTED: "BOT_DETECTED",
  ACCESS_DENIED: "ACCESS_DENIED",
  // Content
  CONTENT_EXTRACTION_FAILED: "CONTENT_EXTRACTION_FAILED",
  EMPTY_CONTENT: "EMPTY_CONTENT",
  // Input validation
  INVALID_URL: "INVALID_URL",
  INVALID_OPTIONS: "INVALID_OPTIONS",
  ROBOTS_BLOCKED: "ROBOTS_BLOCKED",
  // Runtime state
  BROWSER_ERROR: "BROWSER_ERROR",
  POOL_EXHAUSTED: "POOL_EXHAUSTED",
  CLIENT_CLOSED: "CLIENT_CLOSED",
  NOT_INITIALIZED: "NOT_INITIALIZED",
  // Fallback
  UNKNOWN: "UNKNOWN"
};
|
|
2773
|
+
/**
 * Base class for the reader's errors.
 *
 * Adds a machine-readable `code`, the `url` involved (if any), the underlying
 * `cause`, an ISO-8601 creation `timestamp` and a `retryable` flag on top of
 * the standard Error fields.
 */
var ReaderError = class extends Error {
  code;
  url;
  cause;
  timestamp;
  retryable;
  /**
   * @param message - human-readable description.
   * @param code - error code stored on the instance as-is.
   * @param [options] - optional `url`, `cause` and `retryable`;
   *   `retryable` falls back to false when missing or null.
   */
  constructor(message, code, options) {
    super(message);
    const settings = options ?? {};
    this.name = "ReaderError";
    this.code = code;
    this.url = settings.url;
    this.cause = settings.cause;
    this.timestamp = new Date().toISOString();
    this.retryable = settings.retryable ?? false;
    // Not every engine provides captureStackTrace; when present, re-capture
    // the stack with this constructor as the cut-off frame.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }
  /**
   * Plain-object snapshot suitable for serialization.
   * The cause is reduced to its `message`.
   */
  toJSON() {
    const { name, code, message, url, timestamp, retryable, stack } = this;
    return {
      name,
      code,
      message,
      url,
      timestamp,
      retryable,
      cause: this.cause?.message,
      stack
    };
  }
};
|
|
2807
|
+
/**
 * Network-level failure. Uses code "NETWORK_ERROR" and is always marked
 * retryable, whatever `options.retryable` says.
 */
var NetworkError = class extends ReaderError {
  /**
   * @param message - human-readable description.
   * @param [options] - forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, options) {
    const baseOptions = { ...options, retryable: true };
    super(message, "NETWORK_ERROR", baseOptions);
    this.name = "NetworkError";
  }
};
|
|
2816
|
+
/**
 * An operation exceeded its time budget. Uses code "TIMEOUT", is always
 * retryable, and records the budget in `timeoutMs`.
 */
var TimeoutError = class extends ReaderError {
  timeoutMs;
  /**
   * @param message - human-readable description.
   * @param timeoutMs - the timeout value to record on the error.
   * @param [options] - forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, timeoutMs, options) {
    const baseOptions = { ...options, retryable: true };
    super(message, "TIMEOUT", baseOptions);
    this.name = "TimeoutError";
    this.timeoutMs = timeoutMs;
  }
  /** Base serialization plus `timeoutMs`. */
  toJSON() {
    const base = super.toJSON();
    return { ...base, timeoutMs: this.timeoutMs };
  }
};
|
|
2833
|
+
/**
 * A Cloudflare challenge could not be resolved. Uses code
 * "CLOUDFLARE_CHALLENGE", is always retryable, and keeps the challenge kind
 * in `challengeType`.
 */
var CloudflareError = class extends ReaderError {
  challengeType;
  /**
   * @param challengeType - challenge kind; interpolated into the message.
   * @param [options] - forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(challengeType, options) {
    const message = `Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`;
    const baseOptions = { ...options, retryable: true };
    super(message, "CLOUDFLARE_CHALLENGE", baseOptions);
    this.name = "CloudflareError";
    this.challengeType = challengeType;
  }
  /** Base serialization plus `challengeType`. */
  toJSON() {
    const base = super.toJSON();
    return { ...base, challengeType: this.challengeType };
  }
};
|
|
2854
|
+
/**
 * Access to the resource was refused. Uses code "ACCESS_DENIED", is never
 * retryable, and keeps `options.statusCode` in `statusCode`.
 */
var AccessDeniedError = class extends ReaderError {
  statusCode;
  /**
   * @param message - human-readable description.
   * @param [options] - forwarded to ReaderError; `statusCode` is also stored
   *   on the instance.
   */
  constructor(message, options) {
    const baseOptions = { ...options, retryable: false };
    super(message, "ACCESS_DENIED", baseOptions);
    this.name = "AccessDeniedError";
    this.statusCode = options?.statusCode;
  }
  /** Base serialization plus `statusCode`. */
  toJSON() {
    const base = super.toJSON();
    return { ...base, statusCode: this.statusCode };
  }
};
|
|
2871
|
+
/**
 * Content could not be extracted. Uses code "CONTENT_EXTRACTION_FAILED" and
 * is never retryable.
 */
var ContentExtractionError = class extends ReaderError {
  /**
   * @param message - human-readable description.
   * @param [options] - forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, options) {
    const baseOptions = { ...options, retryable: false };
    super(message, "CONTENT_EXTRACTION_FAILED", baseOptions);
    this.name = "ContentExtractionError";
  }
};
|
|
2880
|
+
/**
 * Invalid options were supplied. Uses code "INVALID_OPTIONS", is never
 * retryable, and keeps the offending option name in `field`.
 * Only `options.url` is passed on to the base class.
 */
var ValidationError = class extends ReaderError {
  field;
  /**
   * @param message - human-readable description.
   * @param [options] - optional `url` and `field`.
   */
  constructor(message, options) {
    const baseOptions = { url: options?.url, retryable: false };
    super(message, "INVALID_OPTIONS", baseOptions);
    this.name = "ValidationError";
    this.field = options?.field;
  }
  /** Base serialization plus `field`. */
  toJSON() {
    const base = super.toJSON();
    return { ...base, field: this.field };
  }
};
|
|
2897
|
+
/**
 * A URL failed validation. Uses code "INVALID_URL" and is never retryable.
 */
var InvalidUrlError = class extends ReaderError {
  /**
   * @param url - the rejected URL; stored on the error and shown in the message.
   * @param [reason] - optional explanation appended to the message.
   */
  constructor(url, reason) {
    let message;
    if (reason) {
      message = `Invalid URL "${url}": ${reason}`;
    } else {
      message = `Invalid URL: ${url}`;
    }
    super(message, "INVALID_URL", { url, retryable: false });
    this.name = "InvalidUrlError";
  }
};
|
|
2906
|
+
/**
 * The URL is disallowed by robots.txt. Uses code "ROBOTS_BLOCKED" and is
 * never retryable.
 */
var RobotsBlockedError = class extends ReaderError {
  /**
   * @param url - the blocked URL; stored on the error and shown in the message.
   */
  constructor(url) {
    const message = `URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`;
    super(message, "ROBOTS_BLOCKED", { url, retryable: false });
    this.name = "RobotsBlockedError";
  }
};
|
|
2915
|
+
/**
 * Browser pool failure. Uses code "BROWSER_ERROR" and is always retryable.
 */
var BrowserPoolError = class extends ReaderError {
  /**
   * @param message - human-readable description.
   * @param [options] - forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, options) {
    const baseOptions = { ...options, retryable: true };
    super(message, "BROWSER_ERROR", baseOptions);
    this.name = "BrowserPoolError";
  }
};
|
|
2924
|
+
/**
 * Raised with a fixed message saying the ReaderClient has been closed.
 * Uses code "CLIENT_CLOSED" and is never retryable.
 */
var ClientClosedError = class extends ReaderError {
  constructor() {
    const message = "ReaderClient has been closed. Create a new instance to continue.";
    super(message, "CLIENT_CLOSED", { retryable: false });
    this.name = "ClientClosedError";
  }
};
|
|
2932
|
+
/**
 * A component was used before being initialized. Uses code
 * "NOT_INITIALIZED" and is never retryable.
 */
var NotInitializedError = class extends ReaderError {
  /**
   * @param component - name of the uninitialized component, shown in the message.
   */
  constructor(component) {
    const message = `${component} not initialized. This should not happen - please report this bug.`;
    super(message, "NOT_INITIALIZED", { retryable: false });
    this.name = "NotInitializedError";
  }
};
|
|
2940
|
+
/**
 * Normalize any thrown value into a ReaderError.
 *
 * - A ReaderError is returned untouched.
 * - Any other Error is classified by keywords in its lower-cased message:
 *   timeout -> TimeoutError, connection refused / DNS -> NetworkError,
 *   cloudflare / challenge -> CloudflareError("unknown"); anything else
 *   becomes a non-retryable ReaderError with code "UNKNOWN". The original
 *   error is attached as `cause`.
 * - A non-Error value is stringified into a non-retryable "UNKNOWN"
 *   ReaderError.
 *
 * @param error - the caught value.
 * @param [url] - URL being processed, attached to the resulting error.
 * @param {number} [timeoutMs=30000] - timeout to record on a TimeoutError.
 *   Pass the timeout that was actually in force; the default of 30000 ms is
 *   what was previously always reported.
 * @returns a ReaderError (or subclass) instance.
 */
function wrapError(error, url, timeoutMs = 3e4) {
  if (error instanceof ReaderError) {
    return error;
  }
  if (!(error instanceof Error)) {
    return new ReaderError(String(error), "UNKNOWN", {
      url,
      retryable: false
    });
  }

  const message = error.message.toLowerCase();
  const mentions = (...needles) => needles.some((needle) => message.includes(needle));

  if (mentions("timeout", "timed out")) {
    return new TimeoutError(error.message, timeoutMs, { url, cause: error });
  }
  if (mentions("econnrefused", "connection refused")) {
    return new NetworkError(`Connection refused: ${error.message}`, { url, cause: error });
  }
  if (mentions("enotfound", "dns")) {
    return new NetworkError(`DNS lookup failed: ${error.message}`, { url, cause: error });
  }
  if (mentions("cloudflare", "challenge")) {
    return new CloudflareError("unknown", { url, cause: error });
  }
  return new ReaderError(error.message, "UNKNOWN", {
    url,
    cause: error,
    retryable: false
  });
}
|
|
3049
2969
|
export {
|
|
2970
|
+
AccessDeniedError,
|
|
3050
2971
|
BrowserPool,
|
|
2972
|
+
BrowserPoolError,
|
|
2973
|
+
ClientClosedError,
|
|
2974
|
+
CloudflareError,
|
|
2975
|
+
ContentExtractionError,
|
|
3051
2976
|
Crawler,
|
|
3052
2977
|
DEFAULT_DAEMON_PORT,
|
|
3053
2978
|
DEFAULT_OPTIONS,
|
|
3054
2979
|
DaemonClient,
|
|
3055
2980
|
DaemonServer,
|
|
3056
2981
|
BrowserPool as HeroBrowserPool,
|
|
2982
|
+
InvalidUrlError,
|
|
2983
|
+
NetworkError,
|
|
2984
|
+
NotInitializedError,
|
|
3057
2985
|
ReaderClient,
|
|
2986
|
+
ReaderError,
|
|
2987
|
+
ReaderErrorCode,
|
|
2988
|
+
RobotsBlockedError,
|
|
3058
2989
|
Scraper,
|
|
2990
|
+
TimeoutError,
|
|
2991
|
+
ValidationError,
|
|
3059
2992
|
cleanContent,
|
|
3060
2993
|
crawl,
|
|
3061
2994
|
createHeroConfig,
|
|
@@ -3063,14 +2996,12 @@ export {
|
|
|
3063
2996
|
detectChallenge,
|
|
3064
2997
|
extractMetadata,
|
|
3065
2998
|
formatToHTML,
|
|
3066
|
-
formatToJson,
|
|
3067
|
-
formatToJsonLite,
|
|
3068
2999
|
formatToMarkdown,
|
|
3069
|
-
formatToText,
|
|
3070
3000
|
getDaemonInfo,
|
|
3071
3001
|
getPidFilePath,
|
|
3072
3002
|
getUrlKey,
|
|
3073
3003
|
handleChallenge,
|
|
3004
|
+
htmlToMarkdown,
|
|
3074
3005
|
isChallengePage,
|
|
3075
3006
|
isDaemonRunning,
|
|
3076
3007
|
isSameDomain,
|
|
@@ -3084,6 +3015,7 @@ export {
|
|
|
3084
3015
|
shouldCrawlUrl2 as shouldCrawlUrlFn,
|
|
3085
3016
|
validateUrls,
|
|
3086
3017
|
waitForChallengeResolution,
|
|
3087
|
-
waitForSelector
|
|
3018
|
+
waitForSelector,
|
|
3019
|
+
wrapError
|
|
3088
3020
|
};
|
|
3089
3021
|
//# sourceMappingURL=index.js.map
|