@vakra-dev/reader 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -26
- package/dist/cli/index.js +445 -734
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +663 -715
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
2
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
3
|
+
}) : x)(function(x) {
|
|
4
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
5
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
6
|
+
});
|
|
7
|
+
|
|
1
8
|
// src/client.ts
|
|
2
9
|
import HeroCore from "@ulixee/hero-core";
|
|
3
10
|
import { TransportBridge } from "@ulixee/net";
|
|
@@ -7,27 +14,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
7
14
|
import pLimit from "p-limit";
|
|
8
15
|
|
|
9
16
|
// src/cloudflare/detector.ts
|
|
10
|
-
var
|
|
17
|
+
var CLOUDFLARE_CHALLENGE_SELECTORS = [
|
|
11
18
|
"#challenge-running",
|
|
12
19
|
"#challenge-stage",
|
|
13
20
|
"#challenge-form",
|
|
14
|
-
".cf-browser-verification"
|
|
21
|
+
".cf-browser-verification",
|
|
22
|
+
"#cf-wrapper",
|
|
23
|
+
"#cf-hcaptcha-container",
|
|
24
|
+
"#turnstile-wrapper"
|
|
15
25
|
];
|
|
16
|
-
var
|
|
17
|
-
"verifying you are human",
|
|
26
|
+
var CLOUDFLARE_TEXT_PATTERNS = [
|
|
18
27
|
"checking if the site connection is secure",
|
|
19
|
-
"this process is automatic. your browser will redirect"
|
|
28
|
+
"this process is automatic. your browser will redirect",
|
|
29
|
+
"ray id:",
|
|
30
|
+
"performance & security by cloudflare"
|
|
31
|
+
];
|
|
32
|
+
var CLOUDFLARE_INFRA_PATTERNS = [
|
|
33
|
+
"/cdn-cgi/",
|
|
34
|
+
"cloudflare",
|
|
35
|
+
"__cf_bm",
|
|
36
|
+
"cf-ray"
|
|
20
37
|
];
|
|
21
|
-
var
|
|
22
|
-
"you have been blocked",
|
|
23
|
-
"access to this page has been denied",
|
|
38
|
+
var CLOUDFLARE_BLOCKED_PATTERNS = [
|
|
24
39
|
"sorry, you have been blocked",
|
|
25
|
-
"
|
|
26
|
-
"403 forbidden"
|
|
40
|
+
"ray id:"
|
|
27
41
|
];
|
|
28
42
|
async function detectChallenge(hero) {
|
|
29
43
|
const signals = [];
|
|
30
44
|
let type = "none";
|
|
45
|
+
let hasCloudflareInfra = false;
|
|
46
|
+
let hasChallengeIndicator = false;
|
|
31
47
|
try {
|
|
32
48
|
if (!hero.document) {
|
|
33
49
|
return {
|
|
@@ -39,30 +55,51 @@ async function detectChallenge(hero) {
|
|
|
39
55
|
}
|
|
40
56
|
const html = await hero.document.documentElement.outerHTML;
|
|
41
57
|
const htmlLower = html.toLowerCase();
|
|
42
|
-
for (const
|
|
43
|
-
if (htmlLower.includes(
|
|
44
|
-
|
|
45
|
-
|
|
58
|
+
for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
|
|
59
|
+
if (htmlLower.includes(pattern)) {
|
|
60
|
+
hasCloudflareInfra = true;
|
|
61
|
+
signals.push(`Cloudflare infra: "${pattern}"`);
|
|
62
|
+
break;
|
|
46
63
|
}
|
|
47
64
|
}
|
|
48
|
-
|
|
65
|
+
if (!hasCloudflareInfra) {
|
|
66
|
+
return {
|
|
67
|
+
isChallenge: false,
|
|
68
|
+
type: "none",
|
|
69
|
+
confidence: 0,
|
|
70
|
+
signals: ["No Cloudflare infrastructure detected"]
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
|
|
74
|
+
try {
|
|
75
|
+
const element = await hero.document.querySelector(selector);
|
|
76
|
+
if (element) {
|
|
77
|
+
hasChallengeIndicator = true;
|
|
78
|
+
signals.push(`Challenge element: ${selector}`);
|
|
79
|
+
type = "js_challenge";
|
|
80
|
+
}
|
|
81
|
+
} catch {
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
|
|
49
85
|
if (htmlLower.includes(pattern)) {
|
|
86
|
+
hasChallengeIndicator = true;
|
|
50
87
|
signals.push(`Challenge text: "${pattern}"`);
|
|
51
88
|
type = type === "none" ? "js_challenge" : type;
|
|
52
89
|
}
|
|
53
90
|
}
|
|
54
91
|
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
92
|
+
hasChallengeIndicator = true;
|
|
55
93
|
signals.push('Challenge text: "waiting for...to respond"');
|
|
56
94
|
type = type === "none" ? "js_challenge" : type;
|
|
57
95
|
}
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
}
|
|
96
|
+
const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
|
|
97
|
+
if (hasBlocked) {
|
|
98
|
+
hasChallengeIndicator = true;
|
|
99
|
+
signals.push("Cloudflare block page detected");
|
|
100
|
+
type = "blocked";
|
|
64
101
|
}
|
|
65
|
-
const isChallenge =
|
|
102
|
+
const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
|
|
66
103
|
const confidence = isChallenge ? 100 : 0;
|
|
67
104
|
return {
|
|
68
105
|
isChallenge,
|
|
@@ -179,84 +216,6 @@ var turndownService = new TurndownService({
|
|
|
179
216
|
linkStyle: "inlined",
|
|
180
217
|
linkReferenceStyle: "full"
|
|
181
218
|
});
|
|
182
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
183
|
-
const sections = [];
|
|
184
|
-
if (includeMetadata) {
|
|
185
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
186
|
-
}
|
|
187
|
-
if (pages.length > 1) {
|
|
188
|
-
sections.push(createMarkdownTOC(pages));
|
|
189
|
-
}
|
|
190
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
191
|
-
return sections.join("\n\n");
|
|
192
|
-
}
|
|
193
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
194
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
195
|
-
const description = website.description || "";
|
|
196
|
-
let header = `# Website Scrape: ${title}
|
|
197
|
-
|
|
198
|
-
`;
|
|
199
|
-
header += `**Base URL:** ${baseUrl}
|
|
200
|
-
`;
|
|
201
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
202
|
-
`;
|
|
203
|
-
header += `**Duration:** ${duration}ms
|
|
204
|
-
`;
|
|
205
|
-
header += `**Total pages:** ${totalPages}
|
|
206
|
-
`;
|
|
207
|
-
if (description) {
|
|
208
|
-
header += `**Description:** ${description}
|
|
209
|
-
`;
|
|
210
|
-
}
|
|
211
|
-
if (website.author) {
|
|
212
|
-
header += `**Author:** ${website.author}
|
|
213
|
-
`;
|
|
214
|
-
}
|
|
215
|
-
if (website.language) {
|
|
216
|
-
header += `**Language:** ${website.language}
|
|
217
|
-
`;
|
|
218
|
-
}
|
|
219
|
-
return header;
|
|
220
|
-
}
|
|
221
|
-
function createMarkdownTOC(pages) {
|
|
222
|
-
let toc = "## Table of Contents\n\n";
|
|
223
|
-
pages.forEach((page, index) => {
|
|
224
|
-
const depth = " ".repeat(page.depth);
|
|
225
|
-
const pageNumber = index + 1;
|
|
226
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
227
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
228
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
229
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
230
|
-
`;
|
|
231
|
-
});
|
|
232
|
-
return toc;
|
|
233
|
-
}
|
|
234
|
-
function createMarkdownPage(page, pageNumber) {
|
|
235
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
236
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
237
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
238
|
-
let pageContent = `---
|
|
239
|
-
|
|
240
|
-
`;
|
|
241
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
242
|
-
|
|
243
|
-
`;
|
|
244
|
-
pageContent += `**URL:** ${page.url}
|
|
245
|
-
`;
|
|
246
|
-
pageContent += `**Title:** ${page.title}
|
|
247
|
-
`;
|
|
248
|
-
pageContent += `**Depth:** ${page.depth}
|
|
249
|
-
`;
|
|
250
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
251
|
-
|
|
252
|
-
`;
|
|
253
|
-
pageContent += `---
|
|
254
|
-
|
|
255
|
-
`;
|
|
256
|
-
const markdown = htmlToMarkdown(page.html);
|
|
257
|
-
pageContent += markdown;
|
|
258
|
-
return pageContent;
|
|
259
|
-
}
|
|
260
219
|
function htmlToMarkdown(html) {
|
|
261
220
|
try {
|
|
262
221
|
return turndownService.turndown(html);
|
|
@@ -265,596 +224,340 @@ function htmlToMarkdown(html) {
|
|
|
265
224
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
266
225
|
}
|
|
267
226
|
}
|
|
268
|
-
function extractDomainFromUrl(url) {
|
269
|
-
try {
|
|
270
|
-
return new URL(url).hostname;
|
|
271
|
-
} catch {
|
|
272
|
-
return "Unknown";
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
// src/formatters/html.ts
|
|
277
|
-
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
|
|
278
|
-
const html = `<!DOCTYPE html>
|
|
279
|
-
<html lang="${website.language || "en"}">
|
|
280
|
-
<head>
|
|
281
|
-
<meta charset="${website.charset || "UTF-8"}">
|
|
282
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
283
|
-
<title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
|
|
284
|
-
${generateMetaTags(website)}
|
|
285
|
-
<style>
|
|
286
|
-
${generateCSS()}
|
|
287
|
-
</style>
|
|
288
|
-
</head>
|
|
289
|
-
<body>
|
|
290
|
-
<header class="header">
|
|
291
|
-
<h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
|
|
292
|
-
<div class="meta-info">
|
|
293
|
-
<p><strong>Base URL:</strong> <a href="${escapeHtml(
|
|
294
|
-
baseUrl
|
|
295
|
-
)}" target="_blank">${escapeHtml(baseUrl)}</a></p>
|
|
296
|
-
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
|
|
297
|
-
<p><strong>Duration:</strong> ${duration}ms</p>
|
|
298
|
-
<p><strong>Total pages:</strong> ${pages.length}</p>
|
|
299
|
-
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
|
|
300
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
301
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
302
|
-
</div>
|
|
303
|
-
</header>
|
|
304
|
-
|
|
305
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
306
|
-
|
|
307
|
-
<main class="content">
|
|
308
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
309
|
-
</main>
|
|
310
|
-
|
|
311
|
-
<footer class="footer">
|
|
312
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
313
|
-
</footer>
|
|
314
|
-
|
|
315
|
-
<script>
|
|
316
|
-
${generateJavaScript()}
|
|
317
|
-
</script>
|
|
318
|
-
</body>
|
|
319
|
-
</html>`;
|
|
320
|
-
return html;
|
|
321
|
-
}
|
|
322
|
-
function generateMetaTags(website) {
|
|
323
|
-
const tags = [];
|
|
324
|
-
if (website.description) {
|
|
325
|
-
tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
|
|
326
|
-
}
|
|
327
|
-
if (website.author) {
|
|
328
|
-
tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
|
|
329
|
-
}
|
|
330
|
-
if (website.keywords) {
|
|
331
|
-
tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
|
|
332
|
-
}
|
|
333
|
-
if (website.robots) {
|
|
334
|
-
tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
|
|
335
|
-
}
|
|
336
|
-
if (website.themeColor) {
|
|
337
|
-
tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
|
|
338
|
-
}
|
|
339
|
-
if (website.favicon) {
|
|
340
|
-
tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
|
|
341
|
-
}
|
|
342
|
-
if (website.canonical) {
|
|
343
|
-
tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
|
|
344
|
-
}
|
|
345
|
-
if (website.openGraph) {
|
|
346
|
-
const og = website.openGraph;
|
|
347
|
-
if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
|
|
348
|
-
if (og.description)
|
|
349
|
-
tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
|
|
350
|
-
if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
|
|
351
|
-
if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
|
|
352
|
-
if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
|
|
353
|
-
if (og.siteName)
|
|
354
|
-
tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
|
|
355
|
-
if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
|
|
356
|
-
}
|
|
357
|
-
if (website.twitter) {
|
|
358
|
-
const twitter = website.twitter;
|
|
359
|
-
if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
|
|
360
|
-
if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
|
|
361
|
-
if (twitter.creator)
|
|
362
|
-
tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
|
|
363
|
-
if (twitter.title)
|
|
364
|
-
tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
|
|
365
|
-
if (twitter.description)
|
|
366
|
-
tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
|
|
367
|
-
if (twitter.image)
|
|
368
|
-
tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
|
|
369
|
-
}
|
|
370
|
-
return tags.join("\n ");
|
|
371
|
-
}
|
|
372
|
-
function generateCSS() {
|
|
373
|
-
return `
|
|
374
|
-
* {
|
|
375
|
-
margin: 0;
|
|
376
|
-
padding: 0;
|
|
377
|
-
box-sizing: border-box;
|
|
378
|
-
}
|
|
379
|
-
|
|
380
|
-
body {
|
|
381
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
382
|
-
line-height: 1.6;
|
|
383
|
-
color: #333;
|
|
384
|
-
background-color: #f8f9fa;
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
.header {
|
|
388
|
-
background: white;
|
|
389
|
-
padding: 2rem;
|
|
390
|
-
border-bottom: 1px solid #e9ecef;
|
|
391
|
-
margin-bottom: 2rem;
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
.header h1 {
|
|
395
|
-
color: #2c3e50;
|
|
396
|
-
margin-bottom: 1rem;
|
|
397
|
-
font-size: 2rem;
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
.meta-info {
|
|
401
|
-
display: grid;
|
|
402
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
403
|
-
gap: 0.5rem;
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
.meta-info p {
|
|
407
|
-
margin: 0.25rem 0;
|
|
408
|
-
font-size: 0.9rem;
|
|
409
|
-
color: #6c757d;
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
.toc {
|
|
413
|
-
background: white;
|
|
414
|
-
padding: 1.5rem;
|
|
415
|
-
margin: 2rem 0;
|
|
416
|
-
border-radius: 8px;
|
|
417
|
-
border: 1px solid #e9ecef;
|
|
418
|
-
}
|
|
419
|
-
|
|
420
|
-
.toc h2 {
|
|
421
|
-
color: #2c3e50;
|
|
422
|
-
margin-bottom: 1rem;
|
|
423
|
-
font-size: 1.25rem;
|
|
424
|
-
}
|
|
425
|
-
|
|
426
|
-
.toc ul {
|
|
427
|
-
list-style: none;
|
|
428
|
-
}
|
|
429
|
-
|
|
430
|
-
.toc li {
|
|
431
|
-
margin: 0.5rem 0;
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
.toc a {
|
|
435
|
-
color: #007bff;
|
|
436
|
-
text-decoration: none;
|
|
437
|
-
transition: color 0.2s;
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
.toc a:hover {
|
|
441
|
-
color: #0056b3;
|
|
442
|
-
text-decoration: underline;
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
.content {
|
|
446
|
-
max-width: 800px;
|
|
447
|
-
margin: 0 auto;
|
|
448
|
-
padding: 0 1rem;
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
.page {
|
|
452
|
-
background: white;
|
|
453
|
-
margin: 2rem 0;
|
|
454
|
-
padding: 2rem;
|
|
455
|
-
border-radius: 8px;
|
|
456
|
-
border: 1px solid #e9ecef;
|
|
457
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
458
|
-
}
|
|
459
|
-
|
|
460
|
-
.page-header {
|
|
461
|
-
border-bottom: 2px solid #e9ecef;
|
|
462
|
-
padding-bottom: 1rem;
|
|
463
|
-
margin-bottom: 2rem;
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
.page-header h2 {
|
|
467
|
-
color: #2c3e50;
|
|
468
|
-
margin-bottom: 0.5rem;
|
|
469
|
-
font-size: 1.5rem;
|
|
470
|
-
}
|
|
471
|
-
|
|
472
|
-
.page-meta {
|
|
473
|
-
display: flex;
|
|
474
|
-
flex-wrap: wrap;
|
|
475
|
-
gap: 1rem;
|
|
476
|
-
font-size: 0.9rem;
|
|
477
|
-
color: #6c757d;
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
.page-content {
|
|
481
|
-
line-height: 1.8;
|
|
482
|
-
}
|
|
483
|
-
|
|
484
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
485
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
486
|
-
color: #2c3e50;
|
|
487
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
.page-content p {
|
|
491
|
-
margin: 1rem 0;
|
|
492
|
-
}
|
|
493
|
-
|
|
494
|
-
.page-content a {
|
|
495
|
-
color: #007bff;
|
|
496
|
-
text-decoration: none;
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
.page-content a:hover {
|
|
500
|
-
text-decoration: underline;
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
.page-content code {
|
|
504
|
-
background: #f8f9fa;
|
|
505
|
-
padding: 0.2rem 0.4rem;
|
|
506
|
-
border-radius: 4px;
|
|
507
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
508
|
-
font-size: 0.9em;
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
.page-content pre {
|
|
512
|
-
background: #f8f9fa;
|
|
513
|
-
padding: 1rem;
|
|
514
|
-
border-radius: 4px;
|
|
515
|
-
overflow-x: auto;
|
|
516
|
-
margin: 1rem 0;
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
.page-content blockquote {
|
|
520
|
-
border-left: 4px solid #007bff;
|
|
521
|
-
padding-left: 1rem;
|
|
522
|
-
margin: 1rem 0;
|
|
523
|
-
color: #6c757d;
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
.footer {
|
|
527
|
-
text-align: center;
|
|
528
|
-
padding: 2rem;
|
|
529
|
-
margin-top: 3rem;
|
|
530
|
-
border-top: 1px solid #e9ecef;
|
|
531
|
-
color: #6c757d;
|
|
532
|
-
font-size: 0.9rem;
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
@media (max-width: 768px) {
|
|
536
|
-
.header {
|
|
537
|
-
padding: 1rem;
|
|
538
|
-
}
|
|
539
|
-
|
|
540
|
-
.header h1 {
|
|
541
|
-
font-size: 1.5rem;
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
.page {
|
|
545
|
-
padding: 1rem;
|
|
546
|
-
}
|
|
547
|
-
|
|
548
|
-
.page-meta {
|
|
549
|
-
flex-direction: column;
|
|
550
|
-
gap: 0.5rem;
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
`.trim();
|
|
554
|
-
}
|
|
555
|
-
function generateTOC(pages) {
|
|
556
|
-
const tocItems = pages.map((page, index) => {
|
|
557
|
-
const pageNumber = index + 1;
|
|
558
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
559
|
-
const id = `page-${pageNumber}`;
|
|
560
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
561
|
-
}).join("\n");
|
|
562
|
-
return `
|
|
563
|
-
<nav class="toc">
|
|
564
|
-
<h2>Table of Contents</h2>
|
|
565
|
-
<ul>
|
|
566
|
-
${tocItems}
|
|
567
|
-
</ul>
|
|
568
|
-
</nav>`;
|
|
569
|
-
}
|
|
570
|
-
function generatePageHTML(page, pageNumber) {
|
|
571
|
-
const id = `page-${pageNumber}`;
|
|
572
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
573
|
-
return `
|
|
574
|
-
<article class="page" id="${id}">
|
|
575
|
-
<div class="page-header">
|
|
576
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
577
|
-
<div class="page-meta">
|
|
578
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
579
|
-
page.url
|
|
580
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
581
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
582
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
583
|
-
</div>
|
|
584
|
-
</div>
|
|
585
|
-
<div class="page-content">
|
|
586
|
-
${page.html}
|
|
587
|
-
</div>
|
|
588
|
-
</article>`;
|
|
589
|
-
}
|
|
590
|
-
function generateJavaScript() {
|
|
591
|
-
return `
|
|
592
|
-
// Smooth scrolling for TOC links
|
|
593
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
594
|
-
anchor.addEventListener('click', function (e) {
|
|
595
|
-
e.preventDefault();
|
|
596
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
597
|
-
if (target) {
|
|
598
|
-
target.scrollIntoView({
|
|
599
|
-
behavior: 'smooth',
|
|
600
|
-
block: 'start'
|
|
601
|
-
});
|
|
602
|
-
}
|
|
603
|
-
});
|
|
604
|
-
});
|
|
605
|
-
|
|
606
|
-
// Highlight current section in TOC
|
|
607
|
-
window.addEventListener('scroll', function() {
|
|
608
|
-
const pages = document.querySelectorAll('.page');
|
|
609
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
610
|
-
|
|
611
|
-
let currentPage = null;
|
|
612
|
-
pages.forEach(page => {
|
|
613
|
-
const rect = page.getBoundingClientRect();
|
|
614
|
-
if (rect.top <= 100) {
|
|
615
|
-
currentPage = page;
|
|
616
|
-
}
|
|
617
|
-
});
|
|
618
|
-
|
|
619
|
-
tocLinks.forEach(link => {
|
|
620
|
-
link.style.fontWeight = 'normal';
|
|
621
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
622
|
-
if (target === currentPage) {
|
|
623
|
-
link.style.fontWeight = 'bold';
|
|
624
|
-
}
|
|
625
|
-
});
|
|
626
|
-
});
|
|
627
|
-
`;
|
|
628
|
-
}
|
|
629
|
-
function escapeHtml(text) {
|
|
630
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;").replace(/\//g, "&#x2F;");
|
|
631
|
-
}
|
|
632
|
-
function extractDomainFromUrl2(url) {
|
|
633
|
-
try {
|
|
634
|
-
return new URL(url).hostname;
|
|
635
|
-
} catch {
|
|
636
|
-
return "Unknown";
|
|
637
|
-
}
|
|
638
|
-
}
|
|
639
|
-
|
|
640
|
-
// src/formatters/json.ts
|
|
641
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
642
|
-
const jsonResult = {
|
|
643
|
-
metadata: {
|
|
644
|
-
baseUrl,
|
|
645
|
-
totalPages: pages.length,
|
|
646
|
-
scrapedAt,
|
|
647
|
-
duration,
|
|
648
|
-
website
|
|
649
|
-
},
|
|
650
|
-
pages: pages.map((page, index) => ({
|
|
651
|
-
index: index + 1,
|
|
652
|
-
url: page.url,
|
|
653
|
-
title: page.title,
|
|
654
|
-
markdown: page.markdown,
|
|
655
|
-
html: page.html,
|
|
656
|
-
fetchedAt: page.fetchedAt,
|
|
657
|
-
depth: page.depth,
|
|
658
|
-
wordCount: countWords(page.markdown),
|
|
659
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
660
|
-
}))
|
|
661
|
-
};
|
|
662
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
663
|
-
}
|
|
664
|
-
function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
|
|
665
|
-
const jsonResult = {
|
|
666
|
-
metadata: {
|
|
667
|
-
baseUrl,
|
|
668
|
-
totalPages: pages.length,
|
|
669
|
-
scrapedAt,
|
|
670
|
-
duration,
|
|
671
|
-
website
|
|
672
|
-
},
|
|
673
|
-
pages: pages.map((page, index) => ({
|
|
674
|
-
index: index + 1,
|
|
675
|
-
url: page.url,
|
|
676
|
-
title: page.title,
|
|
677
|
-
markdown: page.markdown,
|
|
678
|
-
fetchedAt: page.fetchedAt,
|
|
679
|
-
depth: page.depth,
|
|
680
|
-
wordCount: countWords(page.markdown),
|
|
681
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
682
|
-
}))
|
|
683
|
-
};
|
|
684
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
685
|
-
}
|
|
686
|
-
function countWords(markdown) {
|
|
687
|
-
const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
688
|
-
return plainText.split(/\s+/).filter((word) => word.length > 0).length;
|
|
689
|
-
}
|
|
690
|
-
function estimateReadingTime(markdown) {
|
|
691
|
-
const wordCount = countWords(markdown);
|
|
692
|
-
return Math.ceil(wordCount / 200);
|
|
693
|
-
}
|
|
694
|
-
|
|
695
|
-
// src/formatters/text.ts
|
|
696
|
-
import { parseHTML } from "linkedom";
|
|
697
|
-
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
698
|
-
const sections = [];
|
|
699
|
-
if (includeMetadata) {
|
|
700
|
-
sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
701
|
-
}
|
|
702
|
-
sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
|
|
703
|
-
return sections.join("\n\n");
|
|
704
|
-
}
|
|
705
|
-
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
706
|
-
const title = website.title || extractDomainFromUrl3(baseUrl);
|
|
707
|
-
const lines = [];
|
|
708
|
-
lines.push(`=== ${title} ===`);
|
|
709
|
-
lines.push("");
|
|
710
|
-
lines.push(`URL: ${baseUrl}`);
|
|
711
|
-
lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
|
|
712
|
-
lines.push(`Duration: ${duration}ms`);
|
|
713
|
-
lines.push(`Pages: ${totalPages}`);
|
|
714
|
-
if (website.description) {
|
|
715
|
-
lines.push(`Description: ${website.description}`);
|
|
716
|
-
}
|
|
717
|
-
if (website.author) {
|
|
718
|
-
lines.push(`Author: ${website.author}`);
|
|
719
|
-
}
|
|
720
|
-
if (website.language) {
|
|
721
|
-
lines.push(`Language: ${website.language}`);
|
|
722
|
-
}
|
|
723
|
-
return lines.join("\n");
|
|
724
|
-
}
|
|
725
|
-
function createTextPage(page, pageNumber, showSeparator) {
|
|
726
|
-
const lines = [];
|
|
727
|
-
if (showSeparator) {
|
|
728
|
-
lines.push("\u2500".repeat(60));
|
|
729
|
-
lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
|
|
730
|
-
lines.push(`URL: ${page.url}`);
|
|
731
|
-
lines.push("\u2500".repeat(60));
|
|
732
|
-
}
|
|
733
|
-
const plainText = htmlToPlainText(page.html);
|
|
734
|
-
lines.push(plainText);
|
|
735
|
-
return lines.join("\n");
|
|
736
|
-
}
|
|
737
|
-
function htmlToPlainText(html) {
|
|
738
|
-
const { document } = parseHTML(html);
|
|
739
|
-
const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
|
|
740
|
-
elementsToRemove.forEach((tag) => {
|
|
741
|
-
document.querySelectorAll(tag).forEach((el) => el.remove());
|
|
742
|
-
});
|
|
743
|
-
let text = document.body?.textContent || document.documentElement?.textContent || "";
|
|
744
|
-
text = text.replace(/[ \t]+/g, " ");
|
|
745
|
-
text = text.replace(/\n[ \t]+/g, "\n");
|
|
746
|
-
text = text.replace(/[ \t]+\n/g, "\n");
|
|
747
|
-
text = text.replace(/\n{3,}/g, "\n\n");
|
|
748
|
-
text = text.trim();
|
|
749
|
-
return text;
|
|
750
|
-
}
|
|
751
|
-
function extractDomainFromUrl3(url) {
|
|
752
|
-
try {
|
|
753
|
-
return new URL(url).hostname;
|
|
754
|
-
} catch {
|
|
755
|
-
return "Unknown";
|
|
756
|
-
}
|
|
757
|
-
}
|
|
227
|
+
var formatToMarkdown = htmlToMarkdown;
|
|
758
228
|
|
|
759
229
|
// src/utils/content-cleaner.ts
|
|
760
|
-
import { parseHTML
|
|
230
|
+
import { parseHTML } from "linkedom";
|
|
761
231
|
var ALWAYS_REMOVE_SELECTORS = [
|
|
762
|
-
// Navigation and menus
|
|
763
|
-
"nav",
|
|
764
|
-
"header nav",
|
|
765
|
-
"footer nav",
|
|
766
|
-
".nav",
|
|
767
|
-
".navigation",
|
|
768
|
-
".menu",
|
|
769
|
-
".navbar",
|
|
770
|
-
".sidebar",
|
|
771
|
-
".aside",
|
|
772
|
-
// Header and footer elements
|
|
773
|
-
"header",
|
|
774
|
-
"footer",
|
|
775
|
-
".site-header",
|
|
776
|
-
".page-header",
|
|
777
|
-
".site-footer",
|
|
778
|
-
".page-footer",
|
|
779
|
-
// Social media and sharing
|
|
780
|
-
".social",
|
|
781
|
-
".share",
|
|
782
|
-
".sharing",
|
|
783
|
-
".twitter",
|
|
784
|
-
".facebook",
|
|
785
|
-
".linkedin",
|
|
786
|
-
".instagram",
|
|
787
|
-
// Comments and discussions
|
|
788
|
-
".comments",
|
|
789
|
-
".comment",
|
|
790
|
-
".discussion",
|
|
791
|
-
".disqus",
|
|
792
|
-
// Forms and interactive elements
|
|
793
|
-
"form",
|
|
794
|
-
"input",
|
|
795
|
-
"button:not([type='submit'])",
|
|
796
|
-
"select",
|
|
797
|
-
"textarea",
|
|
798
232
|
// Scripts and styles
|
|
799
233
|
"script",
|
|
800
234
|
"style",
|
|
801
235
|
"noscript",
|
|
236
|
+
"link[rel='stylesheet']",
|
|
802
237
|
// Hidden elements
|
|
803
238
|
"[hidden]",
|
|
239
|
+
"[aria-hidden='true']",
|
|
804
240
|
"[style*='display: none']",
|
|
805
241
|
"[style*='display:none']",
|
|
806
|
-
|
|
807
|
-
"
|
|
808
|
-
|
|
809
|
-
"
|
|
242
|
+
"[style*='visibility: hidden']",
|
|
243
|
+
"[style*='visibility:hidden']",
|
|
244
|
+
// SVG icons and decorative elements
|
|
245
|
+
"svg[aria-hidden='true']",
|
|
246
|
+
"svg.icon",
|
|
247
|
+
"svg[class*='icon']",
|
|
248
|
+
// Template and metadata
|
|
249
|
+
"template",
|
|
250
|
+
"meta",
|
|
251
|
+
// Embeds that don't convert to text
|
|
252
|
+
"iframe",
|
|
253
|
+
"canvas",
|
|
254
|
+
"object",
|
|
255
|
+
"embed",
|
|
256
|
+
// Forms (usually not main content)
|
|
257
|
+
"form",
|
|
258
|
+
"input",
|
|
259
|
+
"select",
|
|
260
|
+
"textarea",
|
|
261
|
+
"button"
|
|
262
|
+
];
|
|
263
|
+
var OVERLAY_SELECTORS = [
|
|
264
|
+
"[class*='modal']",
|
|
265
|
+
"[class*='popup']",
|
|
266
|
+
"[class*='overlay']",
|
|
267
|
+
"[class*='dialog']",
|
|
268
|
+
"[role='dialog']",
|
|
269
|
+
"[role='alertdialog']",
|
|
270
|
+
"[class*='cookie']",
|
|
271
|
+
"[class*='consent']",
|
|
272
|
+
"[class*='gdpr']",
|
|
273
|
+
"[class*='privacy-banner']",
|
|
274
|
+
"[class*='notification-bar']",
|
|
275
|
+
"[id*='cookie']",
|
|
276
|
+
"[id*='consent']",
|
|
277
|
+
"[id*='gdpr']",
|
|
278
|
+
// Fixed/sticky positioned elements
|
|
279
|
+
"[style*='position: fixed']",
|
|
280
|
+
"[style*='position:fixed']",
|
|
281
|
+
"[style*='position: sticky']",
|
|
282
|
+
"[style*='position:sticky']"
|
|
283
|
+
];
|
|
284
|
+
var NAVIGATION_SELECTORS = [
|
|
285
|
+
// Semantic elements
|
|
286
|
+
"header",
|
|
287
|
+
"footer",
|
|
288
|
+
"nav",
|
|
289
|
+
"aside",
|
|
290
|
+
// Header variations
|
|
291
|
+
".header",
|
|
292
|
+
".top",
|
|
293
|
+
".navbar",
|
|
294
|
+
"#header",
|
|
295
|
+
// Footer variations
|
|
296
|
+
".footer",
|
|
297
|
+
".bottom",
|
|
298
|
+
"#footer",
|
|
299
|
+
// Sidebars
|
|
300
|
+
".sidebar",
|
|
301
|
+
".side",
|
|
302
|
+
".aside",
|
|
303
|
+
"#sidebar",
|
|
304
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
810
305
|
".modal",
|
|
306
|
+
".popup",
|
|
307
|
+
"#modal",
|
|
811
308
|
".overlay",
|
|
812
|
-
|
|
309
|
+
// Ads
|
|
310
|
+
".ad",
|
|
311
|
+
".ads",
|
|
312
|
+
".advert",
|
|
313
|
+
"#ad",
|
|
314
|
+
// Language selectors
|
|
315
|
+
".lang-selector",
|
|
316
|
+
".language",
|
|
317
|
+
"#language-selector",
|
|
318
|
+
// Social
|
|
319
|
+
".social",
|
|
320
|
+
".social-media",
|
|
321
|
+
".social-links",
|
|
322
|
+
"#social",
|
|
323
|
+
// Navigation/menus
|
|
324
|
+
".menu",
|
|
325
|
+
".navigation",
|
|
326
|
+
"#nav",
|
|
813
327
|
// Breadcrumbs
|
|
814
|
-
".breadcrumb",
|
|
815
328
|
".breadcrumbs",
|
|
816
|
-
"
|
|
329
|
+
"#breadcrumbs",
|
|
330
|
+
// Share buttons
|
|
331
|
+
".share",
|
|
332
|
+
"#share",
|
|
333
|
+
// Widgets
|
|
334
|
+
".widget",
|
|
335
|
+
"#widget",
|
|
336
|
+
// Cookie notices (backup)
|
|
337
|
+
".cookie",
|
|
338
|
+
"#cookie"
|
|
339
|
+
];
|
|
340
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
341
|
+
// IDs
|
|
342
|
+
"#main",
|
|
343
|
+
"#content",
|
|
344
|
+
"#main-content",
|
|
345
|
+
"#article",
|
|
346
|
+
"#post",
|
|
347
|
+
"#page-content",
|
|
348
|
+
// Semantic elements
|
|
349
|
+
"main",
|
|
350
|
+
"article",
|
|
351
|
+
"[role='main']",
|
|
352
|
+
// Classes
|
|
353
|
+
".main-content",
|
|
354
|
+
".content",
|
|
355
|
+
".post-content",
|
|
356
|
+
".article-content",
|
|
357
|
+
".entry-content",
|
|
358
|
+
".page-content",
|
|
359
|
+
".article-body",
|
|
360
|
+
".post-body",
|
|
361
|
+
".story-content",
|
|
362
|
+
".blog-content"
|
|
817
363
|
];
|
|
818
364
|
var AD_SELECTORS = [
|
|
819
|
-
//
|
|
820
|
-
".
|
|
821
|
-
".ads",
|
|
822
|
-
".advertisement",
|
|
823
|
-
".promotion",
|
|
824
|
-
".sponsored",
|
|
825
|
-
"[class*='ad-']",
|
|
826
|
-
"[id*='ad-']",
|
|
827
|
-
"[class*='advert']",
|
|
828
|
-
"[id*='advert']",
|
|
829
|
-
"[class*='banner']",
|
|
830
|
-
"[id*='banner']",
|
|
365
|
+
// Google ads
|
|
366
|
+
"ins.adsbygoogle",
|
|
831
367
|
".google-ad",
|
|
832
368
|
".adsense",
|
|
369
|
+
// Generic ad containers
|
|
833
370
|
"[data-ad]",
|
|
834
371
|
"[data-ads]",
|
|
835
|
-
"
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
"
|
|
839
|
-
"
|
|
840
|
-
"
|
|
372
|
+
"[data-ad-slot]",
|
|
373
|
+
"[data-ad-client]",
|
|
374
|
+
// Common ad class patterns
|
|
375
|
+
".ad-container",
|
|
376
|
+
".ad-wrapper",
|
|
377
|
+
".advertisement",
|
|
378
|
+
".sponsored-content",
|
|
379
|
+
// Tracking pixels
|
|
380
|
+
"img[width='1'][height='1']",
|
|
381
|
+
"img[src*='pixel']",
|
|
382
|
+
"img[src*='tracking']",
|
|
383
|
+
"img[src*='analytics']"
|
|
841
384
|
];
|
|
842
|
-
function
|
|
843
|
-
const
|
|
844
|
-
const
|
|
845
|
-
|
|
385
|
+
function getLinkDensity(element) {
|
|
386
|
+
const text = element.textContent || "";
|
|
387
|
+
const textLength = text.trim().length;
|
|
388
|
+
if (textLength === 0) return 1;
|
|
389
|
+
let linkLength = 0;
|
|
390
|
+
element.querySelectorAll("a").forEach((link) => {
|
|
391
|
+
linkLength += (link.textContent || "").trim().length;
|
|
392
|
+
});
|
|
393
|
+
return linkLength / textLength;
|
|
394
|
+
}
|
|
395
|
+
function getContentScore(element) {
|
|
396
|
+
let score = 0;
|
|
397
|
+
const text = element.textContent || "";
|
|
398
|
+
const textLength = text.trim().length;
|
|
399
|
+
score += Math.min(textLength / 100, 50);
|
|
400
|
+
score += element.querySelectorAll("p").length * 3;
|
|
401
|
+
score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
|
|
402
|
+
score += element.querySelectorAll("img").length * 1;
|
|
403
|
+
score -= element.querySelectorAll("a").length * 0.5;
|
|
404
|
+
score -= element.querySelectorAll("li").length * 0.2;
|
|
405
|
+
const linkDensity = getLinkDensity(element);
|
|
406
|
+
if (linkDensity > 0.5) score -= 30;
|
|
407
|
+
else if (linkDensity > 0.3) score -= 15;
|
|
408
|
+
const classAndId = (element.className || "") + " " + (element.id || "");
|
|
409
|
+
if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
|
|
410
|
+
if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
|
|
411
|
+
return score;
|
|
412
|
+
}
|
|
413
|
+
function looksLikeNavigation(element) {
|
|
414
|
+
const linkDensity = getLinkDensity(element);
|
|
415
|
+
if (linkDensity > 0.5) return true;
|
|
416
|
+
const listItems = element.querySelectorAll("li");
|
|
417
|
+
const links = element.querySelectorAll("a");
|
|
418
|
+
if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
|
|
419
|
+
return false;
|
|
420
|
+
}
|
|
421
|
+
function removeElements(document, selectors) {
|
|
422
|
+
for (const selector of selectors) {
|
|
846
423
|
try {
|
|
847
424
|
document.querySelectorAll(selector).forEach((el) => el.remove());
|
|
848
425
|
} catch {
|
|
849
426
|
}
|
|
850
427
|
}
|
|
428
|
+
}
|
|
429
|
+
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
|
|
430
|
+
for (const selector of selectorsToRemove) {
|
|
431
|
+
try {
|
|
432
|
+
document.querySelectorAll(selector).forEach((element) => {
|
|
433
|
+
const isProtected = protectedSelectors.some((ps) => {
|
|
434
|
+
try {
|
|
435
|
+
return element.matches(ps);
|
|
436
|
+
} catch {
|
|
437
|
+
return false;
|
|
438
|
+
}
|
|
439
|
+
});
|
|
440
|
+
if (isProtected) return;
|
|
441
|
+
const containsProtected = protectedSelectors.some((ps) => {
|
|
442
|
+
try {
|
|
443
|
+
return element.querySelector(ps) !== null;
|
|
444
|
+
} catch {
|
|
445
|
+
return false;
|
|
446
|
+
}
|
|
447
|
+
});
|
|
448
|
+
if (containsProtected) return;
|
|
449
|
+
element.remove();
|
|
450
|
+
});
|
|
451
|
+
} catch {
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
function findMainContent(document) {
|
|
456
|
+
const isValidContent = (el) => {
|
|
457
|
+
if (!el) return false;
|
|
458
|
+
const text = el.textContent || "";
|
|
459
|
+
if (text.trim().length < 100) return false;
|
|
460
|
+
if (looksLikeNavigation(el)) return false;
|
|
461
|
+
return true;
|
|
462
|
+
};
|
|
463
|
+
const main = document.querySelector("main");
|
|
464
|
+
if (isValidContent(main) && getLinkDensity(main) < 0.4) {
|
|
465
|
+
return main;
|
|
466
|
+
}
|
|
467
|
+
const roleMain = document.querySelector('[role="main"]');
|
|
468
|
+
if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
|
|
469
|
+
return roleMain;
|
|
470
|
+
}
|
|
471
|
+
const articles = document.querySelectorAll("article");
|
|
472
|
+
if (articles.length === 1 && isValidContent(articles[0])) {
|
|
473
|
+
return articles[0];
|
|
474
|
+
}
|
|
475
|
+
const contentSelectors = [
|
|
476
|
+
"#content",
|
|
477
|
+
"#main-content",
|
|
478
|
+
"#main",
|
|
479
|
+
".content",
|
|
480
|
+
".main-content",
|
|
481
|
+
".post-content",
|
|
482
|
+
".article-content",
|
|
483
|
+
".entry-content",
|
|
484
|
+
".page-content",
|
|
485
|
+
".article-body",
|
|
486
|
+
".post-body",
|
|
487
|
+
".story-content",
|
|
488
|
+
".blog-content"
|
|
489
|
+
];
|
|
490
|
+
for (const selector of contentSelectors) {
|
|
491
|
+
try {
|
|
492
|
+
const el = document.querySelector(selector);
|
|
493
|
+
if (isValidContent(el) && getLinkDensity(el) < 0.4) {
|
|
494
|
+
return el;
|
|
495
|
+
}
|
|
496
|
+
} catch {
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
const candidates = [];
|
|
500
|
+
const containers = document.querySelectorAll("div, section, article");
|
|
501
|
+
containers.forEach((el) => {
|
|
502
|
+
const text = el.textContent || "";
|
|
503
|
+
if (text.trim().length < 200) return;
|
|
504
|
+
const score = getContentScore(el);
|
|
505
|
+
if (score > 0) {
|
|
506
|
+
candidates.push({ el, score });
|
|
507
|
+
}
|
|
508
|
+
});
|
|
509
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
510
|
+
if (candidates.length > 0 && candidates[0].score > 20) {
|
|
511
|
+
return candidates[0].el;
|
|
512
|
+
}
|
|
513
|
+
return null;
|
|
514
|
+
}
|
|
515
|
+
function cleanHtml(html, baseUrl, options = {}) {
|
|
516
|
+
const {
|
|
517
|
+
removeAds = true,
|
|
518
|
+
removeBase64Images = true,
|
|
519
|
+
onlyMainContent = true,
|
|
520
|
+
includeTags,
|
|
521
|
+
excludeTags
|
|
522
|
+
} = options;
|
|
523
|
+
const { document } = parseHTML(html);
|
|
524
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
525
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
851
526
|
if (removeAds) {
|
|
852
|
-
|
|
527
|
+
removeElements(document, AD_SELECTORS);
|
|
528
|
+
}
|
|
529
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
530
|
+
removeElements(document, excludeTags);
|
|
531
|
+
}
|
|
532
|
+
if (onlyMainContent) {
|
|
533
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
534
|
+
const mainContent = findMainContent(document);
|
|
535
|
+
if (mainContent) {
|
|
536
|
+
const body = document.body;
|
|
537
|
+
if (body) {
|
|
538
|
+
const clone = mainContent.cloneNode(true);
|
|
539
|
+
body.innerHTML = "";
|
|
540
|
+
body.appendChild(clone);
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
if (includeTags && includeTags.length > 0) {
|
|
545
|
+
const matchedElements = [];
|
|
546
|
+
for (const selector of includeTags) {
|
|
853
547
|
try {
|
|
854
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
548
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
549
|
+
matchedElements.push(el.cloneNode(true));
|
|
550
|
+
});
|
|
855
551
|
} catch {
|
|
856
552
|
}
|
|
857
553
|
}
|
|
554
|
+
if (matchedElements.length > 0) {
|
|
555
|
+
const body = document.body;
|
|
556
|
+
if (body) {
|
|
557
|
+
body.innerHTML = "";
|
|
558
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
559
|
+
}
|
|
560
|
+
}
|
|
858
561
|
}
|
|
859
562
|
if (removeBase64Images) {
|
|
860
563
|
removeBase64ImagesFromDocument(document);
|
|
@@ -879,7 +582,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
879
582
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
880
583
|
const style = el.getAttribute("style");
|
|
881
584
|
if (style) {
|
|
882
|
-
const cleanedStyle = style.replace(
|
|
585
|
+
const cleanedStyle = style.replace(
|
|
586
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
587
|
+
""
|
|
588
|
+
);
|
|
883
589
|
if (cleanedStyle.trim()) {
|
|
884
590
|
el.setAttribute("style", cleanedStyle);
|
|
885
591
|
} else {
|
|
@@ -916,7 +622,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
916
622
|
}
|
|
917
623
|
|
|
918
624
|
// src/utils/metadata-extractor.ts
|
|
919
|
-
import { parseHTML as
|
|
625
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
920
626
|
|
|
921
627
|
// src/utils/url-helpers.ts
|
|
922
628
|
import { URL as URL2 } from "url";
|
|
@@ -989,8 +695,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
989
695
|
function getUrlKey(url) {
|
|
990
696
|
try {
|
|
991
697
|
const parsedUrl = new URL2(url);
|
|
698
|
+
parsedUrl.hash = "";
|
|
992
699
|
parsedUrl.search = "";
|
|
993
|
-
|
|
700
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
701
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
702
|
+
}
|
|
703
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
704
|
+
parsedUrl.port = "";
|
|
705
|
+
}
|
|
706
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
707
|
+
for (const indexFile of indexFiles) {
|
|
708
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
709
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
710
|
+
break;
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
714
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
715
|
+
normalized = normalized.slice(0, -1);
|
|
716
|
+
}
|
|
717
|
+
return normalized;
|
|
994
718
|
} catch {
|
|
995
719
|
return url.toLowerCase();
|
|
996
720
|
}
|
|
@@ -1225,7 +949,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1225
949
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1226
950
|
}
|
|
1227
951
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1228
|
-
const { document } =
|
|
952
|
+
const { document } = parseHTML2(html);
|
|
1229
953
|
const metadata = {
|
|
1230
954
|
title: null,
|
|
1231
955
|
description: null,
|
|
@@ -1380,11 +1104,20 @@ function extractTwitterCard(document) {
|
|
|
1380
1104
|
|
|
1381
1105
|
// src/utils/logger.ts
|
|
1382
1106
|
import pino from "pino";
|
|
1107
|
+
function hasPinoPretty() {
|
|
1108
|
+
try {
|
|
1109
|
+
__require.resolve("pino-pretty");
|
|
1110
|
+
return true;
|
|
1111
|
+
} catch {
|
|
1112
|
+
return false;
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
1383
1115
|
function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
|
|
1116
|
+
const usePretty = process.env.NODE_ENV !== "production" && hasPinoPretty();
|
|
1384
1117
|
return pino({
|
|
1385
1118
|
name,
|
|
1386
1119
|
level,
|
|
1387
|
-
transport:
|
|
1120
|
+
transport: usePretty ? {
|
|
1388
1121
|
target: "pino-pretty",
|
|
1389
1122
|
options: {
|
|
1390
1123
|
colorize: true,
|
|
@@ -1498,13 +1231,15 @@ function isUrlAllowed(url, rules) {
|
|
|
1498
1231
|
var DEFAULT_OPTIONS = {
|
|
1499
1232
|
urls: [],
|
|
1500
1233
|
formats: ["markdown"],
|
|
1501
|
-
includeMetadata: true,
|
|
1502
1234
|
timeoutMs: 3e4,
|
|
1503
1235
|
includePatterns: [],
|
|
1504
1236
|
excludePatterns: [],
|
|
1505
1237
|
// Content cleaning defaults
|
|
1506
1238
|
removeAds: true,
|
|
1507
1239
|
removeBase64Images: true,
|
|
1240
|
+
onlyMainContent: true,
|
|
1241
|
+
includeTags: [],
|
|
1242
|
+
excludeTags: [],
|
|
1508
1243
|
skipTLSVerification: true,
|
|
1509
1244
|
// Batch defaults
|
|
1510
1245
|
batchConcurrency: 1,
|
|
@@ -1518,7 +1253,7 @@ var DEFAULT_OPTIONS = {
|
|
|
1518
1253
|
showChrome: false
|
|
1519
1254
|
};
|
|
1520
1255
|
function isValidFormat(format) {
|
|
1521
|
-
return format === "markdown" || format === "html"
|
|
1256
|
+
return format === "markdown" || format === "html";
|
|
1522
1257
|
}
|
|
1523
1258
|
function shouldCrawlUrl2(url, baseDomain) {
|
|
1524
1259
|
return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
|
|
@@ -1667,14 +1402,9 @@ var Scraper = class {
|
|
|
1667
1402
|
} catch {
|
|
1668
1403
|
}
|
|
1669
1404
|
await hero.waitForPaintingStable();
|
|
1670
|
-
let hadChallenge = false;
|
|
1671
|
-
let challengeType = "none";
|
|
1672
|
-
let waitTimeMs = 0;
|
|
1673
1405
|
const initialUrl = await hero.url;
|
|
1674
1406
|
const detection = await detectChallenge(hero);
|
|
1675
1407
|
if (detection.isChallenge) {
|
|
1676
|
-
hadChallenge = true;
|
|
1677
|
-
challengeType = detection.type;
|
|
1678
1408
|
if (this.options.verbose) {
|
|
1679
1409
|
this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
|
|
1680
1410
|
}
|
|
@@ -1684,12 +1414,11 @@ var Scraper = class {
|
|
|
1684
1414
|
verbose: this.options.verbose,
|
|
1685
1415
|
initialUrl
|
|
1686
1416
|
});
|
|
1687
|
-
waitTimeMs = result2.waitedMs;
|
|
1688
1417
|
if (!result2.resolved) {
|
|
1689
1418
|
throw new Error(`Challenge not resolved: ${detection.type}`);
|
|
1690
1419
|
}
|
|
1691
1420
|
if (this.options.verbose) {
|
|
1692
|
-
this.logger.info(`Challenge resolved via ${result2.method} in ${
|
|
1421
|
+
this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
|
|
1693
1422
|
}
|
|
1694
1423
|
}
|
|
1695
1424
|
await this.waitForFinalPage(hero, url, this.options.verbose);
|
|
@@ -1702,45 +1431,18 @@ var Scraper = class {
|
|
|
1702
1431
|
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1703
1432
|
}
|
|
1704
1433
|
}
|
|
1705
|
-
const pageTitle = await hero.document.title;
|
|
1706
1434
|
const html = await hero.document.documentElement.outerHTML;
|
|
1707
1435
|
const cleanedHtml = cleanContent(html, url, {
|
|
1708
1436
|
removeAds: this.options.removeAds,
|
|
1709
|
-
removeBase64Images: this.options.removeBase64Images
|
|
1437
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1438
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1439
|
+
includeTags: this.options.includeTags,
|
|
1440
|
+
excludeTags: this.options.excludeTags
|
|
1710
1441
|
});
|
|
1711
1442
|
const websiteMetadata = extractMetadata(cleanedHtml, url);
|
|
1712
1443
|
const duration = Date.now() - startTime;
|
|
1713
|
-
const
|
|
1714
|
-
const
|
|
1715
|
-
url,
|
|
1716
|
-
title: pageTitle,
|
|
1717
|
-
markdown: "",
|
|
1718
|
-
// Will be set by formatter
|
|
1719
|
-
html: cleanedHtml,
|
|
1720
|
-
fetchedAt: scrapedAt,
|
|
1721
|
-
depth: 0,
|
|
1722
|
-
hadChallenge,
|
|
1723
|
-
challengeType,
|
|
1724
|
-
waitTimeMs
|
|
1725
|
-
};
|
|
1726
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1727
|
-
[page],
|
|
1728
|
-
url,
|
|
1729
|
-
scrapedAt,
|
|
1730
|
-
duration,
|
|
1731
|
-
websiteMetadata,
|
|
1732
|
-
this.options.includeMetadata
|
|
1733
|
-
) : void 0;
|
|
1734
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1735
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1736
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1737
|
-
[page],
|
|
1738
|
-
url,
|
|
1739
|
-
scrapedAt,
|
|
1740
|
-
duration,
|
|
1741
|
-
websiteMetadata,
|
|
1742
|
-
this.options.includeMetadata
|
|
1743
|
-
) : void 0;
|
|
1444
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1445
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1744
1446
|
if (this.options.onProgress) {
|
|
1745
1447
|
this.options.onProgress({
|
|
1746
1448
|
completed: index + 1,
|
|
@@ -1772,8 +1474,6 @@ var Scraper = class {
|
|
|
1772
1474
|
const result = {
|
|
1773
1475
|
markdown,
|
|
1774
1476
|
html: htmlOutput,
|
|
1775
|
-
json,
|
|
1776
|
-
text,
|
|
1777
1477
|
metadata: {
|
|
1778
1478
|
baseUrl: url,
|
|
1779
1479
|
totalPages: 1,
|
|
@@ -1828,7 +1528,7 @@ async function scrape(options) {
|
|
|
1828
1528
|
}
|
|
1829
1529
|
|
|
1830
1530
|
// src/crawler.ts
|
|
1831
|
-
import { parseHTML as
|
|
1531
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1832
1532
|
|
|
1833
1533
|
// src/utils/rate-limiter.ts
|
|
1834
1534
|
import pLimit2 from "p-limit";
|
|
@@ -1977,12 +1677,26 @@ var Crawler = class {
|
|
|
1977
1677
|
*/
|
|
1978
1678
|
extractLinks(html, baseUrl, depth) {
|
|
1979
1679
|
const links = [];
|
|
1980
|
-
const { document } =
|
|
1680
|
+
const { document } = parseHTML3(html);
|
|
1981
1681
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1982
|
-
const
|
|
1682
|
+
const rawHref = anchor.getAttribute("href");
|
|
1683
|
+
if (!rawHref) return;
|
|
1684
|
+
const href = rawHref.trim();
|
|
1983
1685
|
if (!href) return;
|
|
1984
|
-
|
|
1686
|
+
if (href.startsWith("#")) return;
|
|
1687
|
+
const lowerHref = href.toLowerCase();
|
|
1688
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
1689
|
+
return;
|
|
1690
|
+
}
|
|
1691
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
1985
1692
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
1693
|
+
try {
|
|
1694
|
+
const parsed = new URL(resolved);
|
|
1695
|
+
parsed.hash = "";
|
|
1696
|
+
resolved = parsed.toString();
|
|
1697
|
+
} catch {
|
|
1698
|
+
return;
|
|
1699
|
+
}
|
|
1986
1700
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
1987
1701
|
if (!isContentUrl(resolved)) return;
|
|
1988
1702
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -3030,16 +2744,251 @@ async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
|
|
|
3030
2744
|
const client = new DaemonClient({ port, timeoutMs: 5e3 });
|
|
3031
2745
|
return client.isRunning();
|
|
3032
2746
|
}
|
|
2747
|
+
|
|
2748
|
+
// src/formatters/html.ts
|
|
2749
|
+
function formatToHTML(html) {
|
|
2750
|
+
return html;
|
|
2751
|
+
}
|
|
2752
|
+
|
|
2753
|
+
// src/errors.ts
|
|
2754
|
+
var ReaderErrorCode = /* @__PURE__ */ ((ReaderErrorCode2) => {
|
|
2755
|
+
ReaderErrorCode2["NETWORK_ERROR"] = "NETWORK_ERROR";
|
|
2756
|
+
ReaderErrorCode2["TIMEOUT"] = "TIMEOUT";
|
|
2757
|
+
ReaderErrorCode2["CONNECTION_REFUSED"] = "CONNECTION_REFUSED";
|
|
2758
|
+
ReaderErrorCode2["CLOUDFLARE_CHALLENGE"] = "CLOUDFLARE_CHALLENGE";
|
|
2759
|
+
ReaderErrorCode2["BOT_DETECTED"] = "BOT_DETECTED";
|
|
2760
|
+
ReaderErrorCode2["ACCESS_DENIED"] = "ACCESS_DENIED";
|
|
2761
|
+
ReaderErrorCode2["CONTENT_EXTRACTION_FAILED"] = "CONTENT_EXTRACTION_FAILED";
|
|
2762
|
+
ReaderErrorCode2["EMPTY_CONTENT"] = "EMPTY_CONTENT";
|
|
2763
|
+
ReaderErrorCode2["INVALID_URL"] = "INVALID_URL";
|
|
2764
|
+
ReaderErrorCode2["INVALID_OPTIONS"] = "INVALID_OPTIONS";
|
|
2765
|
+
ReaderErrorCode2["ROBOTS_BLOCKED"] = "ROBOTS_BLOCKED";
|
|
2766
|
+
ReaderErrorCode2["BROWSER_ERROR"] = "BROWSER_ERROR";
|
|
2767
|
+
ReaderErrorCode2["POOL_EXHAUSTED"] = "POOL_EXHAUSTED";
|
|
2768
|
+
ReaderErrorCode2["CLIENT_CLOSED"] = "CLIENT_CLOSED";
|
|
2769
|
+
ReaderErrorCode2["NOT_INITIALIZED"] = "NOT_INITIALIZED";
|
|
2770
|
+
ReaderErrorCode2["UNKNOWN"] = "UNKNOWN";
|
|
2771
|
+
return ReaderErrorCode2;
|
|
2772
|
+
})(ReaderErrorCode || {});
|
|
2773
|
+
var ReaderError = class extends Error {
|
|
2774
|
+
code;
|
|
2775
|
+
url;
|
|
2776
|
+
cause;
|
|
2777
|
+
timestamp;
|
|
2778
|
+
retryable;
|
|
2779
|
+
constructor(message, code, options) {
|
|
2780
|
+
super(message);
|
|
2781
|
+
this.name = "ReaderError";
|
|
2782
|
+
this.code = code;
|
|
2783
|
+
this.url = options?.url;
|
|
2784
|
+
this.cause = options?.cause;
|
|
2785
|
+
this.timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
2786
|
+
this.retryable = options?.retryable ?? false;
|
|
2787
|
+
if (Error.captureStackTrace) {
|
|
2788
|
+
Error.captureStackTrace(this, this.constructor);
|
|
2789
|
+
}
|
|
2790
|
+
}
|
|
2791
|
+
/**
|
|
2792
|
+
* Convert to a plain object for serialization
|
|
2793
|
+
*/
|
|
2794
|
+
toJSON() {
|
|
2795
|
+
return {
|
|
2796
|
+
name: this.name,
|
|
2797
|
+
code: this.code,
|
|
2798
|
+
message: this.message,
|
|
2799
|
+
url: this.url,
|
|
2800
|
+
timestamp: this.timestamp,
|
|
2801
|
+
retryable: this.retryable,
|
|
2802
|
+
cause: this.cause?.message,
|
|
2803
|
+
stack: this.stack
|
|
2804
|
+
};
|
|
2805
|
+
}
|
|
2806
|
+
};
|
|
2807
|
+
var NetworkError = class extends ReaderError {
|
|
2808
|
+
constructor(message, options) {
|
|
2809
|
+
super(message, "NETWORK_ERROR" /* NETWORK_ERROR */, {
|
|
2810
|
+
...options,
|
|
2811
|
+
retryable: true
|
|
2812
|
+
});
|
|
2813
|
+
this.name = "NetworkError";
|
|
2814
|
+
}
|
|
2815
|
+
};
|
|
2816
|
+
var TimeoutError = class extends ReaderError {
|
|
2817
|
+
timeoutMs;
|
|
2818
|
+
constructor(message, timeoutMs, options) {
|
|
2819
|
+
super(message, "TIMEOUT" /* TIMEOUT */, {
|
|
2820
|
+
...options,
|
|
2821
|
+
retryable: true
|
|
2822
|
+
});
|
|
2823
|
+
this.name = "TimeoutError";
|
|
2824
|
+
this.timeoutMs = timeoutMs;
|
|
2825
|
+
}
|
|
2826
|
+
toJSON() {
|
|
2827
|
+
return {
|
|
2828
|
+
...super.toJSON(),
|
|
2829
|
+
timeoutMs: this.timeoutMs
|
|
2830
|
+
};
|
|
2831
|
+
}
|
|
2832
|
+
};
|
|
2833
|
+
var CloudflareError = class extends ReaderError {
|
|
2834
|
+
challengeType;
|
|
2835
|
+
constructor(challengeType, options) {
|
|
2836
|
+
super(
|
|
2837
|
+
`Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`,
|
|
2838
|
+
"CLOUDFLARE_CHALLENGE" /* CLOUDFLARE_CHALLENGE */,
|
|
2839
|
+
{
|
|
2840
|
+
...options,
|
|
2841
|
+
retryable: true
|
|
2842
|
+
}
|
|
2843
|
+
);
|
|
2844
|
+
this.name = "CloudflareError";
|
|
2845
|
+
this.challengeType = challengeType;
|
|
2846
|
+
}
|
|
2847
|
+
toJSON() {
|
|
2848
|
+
return {
|
|
2849
|
+
...super.toJSON(),
|
|
2850
|
+
challengeType: this.challengeType
|
|
2851
|
+
};
|
|
2852
|
+
}
|
|
2853
|
+
};
|
|
2854
|
+
var AccessDeniedError = class extends ReaderError {
|
|
2855
|
+
statusCode;
|
|
2856
|
+
constructor(message, options) {
|
|
2857
|
+
super(message, "ACCESS_DENIED" /* ACCESS_DENIED */, {
|
|
2858
|
+
...options,
|
|
2859
|
+
retryable: false
|
|
2860
|
+
});
|
|
2861
|
+
this.name = "AccessDeniedError";
|
|
2862
|
+
this.statusCode = options?.statusCode;
|
|
2863
|
+
}
|
|
2864
|
+
toJSON() {
|
|
2865
|
+
return {
|
|
2866
|
+
...super.toJSON(),
|
|
2867
|
+
statusCode: this.statusCode
|
|
2868
|
+
};
|
|
2869
|
+
}
|
|
2870
|
+
};
|
|
2871
|
+
var ContentExtractionError = class extends ReaderError {
|
|
2872
|
+
constructor(message, options) {
|
|
2873
|
+
super(message, "CONTENT_EXTRACTION_FAILED" /* CONTENT_EXTRACTION_FAILED */, {
|
|
2874
|
+
...options,
|
|
2875
|
+
retryable: false
|
|
2876
|
+
});
|
|
2877
|
+
this.name = "ContentExtractionError";
|
|
2878
|
+
}
|
|
2879
|
+
};
|
|
2880
|
+
var ValidationError = class extends ReaderError {
|
|
2881
|
+
field;
|
|
2882
|
+
constructor(message, options) {
|
|
2883
|
+
super(message, "INVALID_OPTIONS" /* INVALID_OPTIONS */, {
|
|
2884
|
+
url: options?.url,
|
|
2885
|
+
retryable: false
|
|
2886
|
+
});
|
|
2887
|
+
this.name = "ValidationError";
|
|
2888
|
+
this.field = options?.field;
|
|
2889
|
+
}
|
|
2890
|
+
toJSON() {
|
|
2891
|
+
return {
|
|
2892
|
+
...super.toJSON(),
|
|
2893
|
+
field: this.field
|
|
2894
|
+
};
|
|
2895
|
+
}
|
|
2896
|
+
};
|
|
2897
|
+
var InvalidUrlError = class extends ReaderError {
|
|
2898
|
+
constructor(url, reason) {
|
|
2899
|
+
super(reason ? `Invalid URL "${url}": ${reason}` : `Invalid URL: ${url}`, "INVALID_URL" /* INVALID_URL */, {
|
|
2900
|
+
url,
|
|
2901
|
+
retryable: false
|
|
2902
|
+
});
|
|
2903
|
+
this.name = "InvalidUrlError";
|
|
2904
|
+
}
|
|
2905
|
+
};
|
|
2906
|
+
var RobotsBlockedError = class extends ReaderError {
|
|
2907
|
+
constructor(url) {
|
|
2908
|
+
super(`URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`, "ROBOTS_BLOCKED" /* ROBOTS_BLOCKED */, {
|
|
2909
|
+
url,
|
|
2910
|
+
retryable: false
|
|
2911
|
+
});
|
|
2912
|
+
this.name = "RobotsBlockedError";
|
|
2913
|
+
}
|
|
2914
|
+
};
|
|
2915
|
+
var BrowserPoolError = class extends ReaderError {
|
|
2916
|
+
constructor(message, options) {
|
|
2917
|
+
super(message, "BROWSER_ERROR" /* BROWSER_ERROR */, {
|
|
2918
|
+
...options,
|
|
2919
|
+
retryable: true
|
|
2920
|
+
});
|
|
2921
|
+
this.name = "BrowserPoolError";
|
|
2922
|
+
}
|
|
2923
|
+
};
|
|
2924
|
+
var ClientClosedError = class extends ReaderError {
|
|
2925
|
+
constructor() {
|
|
2926
|
+
super("ReaderClient has been closed. Create a new instance to continue.", "CLIENT_CLOSED" /* CLIENT_CLOSED */, {
|
|
2927
|
+
retryable: false
|
|
2928
|
+
});
|
|
2929
|
+
this.name = "ClientClosedError";
|
|
2930
|
+
}
|
|
2931
|
+
};
|
|
2932
|
+
var NotInitializedError = class extends ReaderError {
|
|
2933
|
+
constructor(component) {
|
|
2934
|
+
super(`${component} not initialized. This should not happen - please report this bug.`, "NOT_INITIALIZED" /* NOT_INITIALIZED */, {
|
|
2935
|
+
retryable: false
|
|
2936
|
+
});
|
|
2937
|
+
this.name = "NotInitializedError";
|
|
2938
|
+
}
|
|
2939
|
+
};
|
|
2940
|
+
function wrapError(error, url) {
|
|
2941
|
+
if (error instanceof ReaderError) {
|
|
2942
|
+
return error;
|
|
2943
|
+
}
|
|
2944
|
+
if (error instanceof Error) {
|
|
2945
|
+
const message = error.message.toLowerCase();
|
|
2946
|
+
if (message.includes("timeout") || message.includes("timed out")) {
|
|
2947
|
+
return new TimeoutError(error.message, 3e4, { url, cause: error });
|
|
2948
|
+
}
|
|
2949
|
+
if (message.includes("econnrefused") || message.includes("connection refused")) {
|
|
2950
|
+
return new NetworkError(`Connection refused: ${error.message}`, { url, cause: error });
|
|
2951
|
+
}
|
|
2952
|
+
if (message.includes("enotfound") || message.includes("dns")) {
|
|
2953
|
+
return new NetworkError(`DNS lookup failed: ${error.message}`, { url, cause: error });
|
|
2954
|
+
}
|
|
2955
|
+
if (message.includes("cloudflare") || message.includes("challenge")) {
|
|
2956
|
+
return new CloudflareError("unknown", { url, cause: error });
|
|
2957
|
+
}
|
|
2958
|
+
return new ReaderError(error.message, "UNKNOWN" /* UNKNOWN */, {
|
|
2959
|
+
url,
|
|
2960
|
+
cause: error,
|
|
2961
|
+
retryable: false
|
|
2962
|
+
});
|
|
2963
|
+
}
|
|
2964
|
+
return new ReaderError(String(error), "UNKNOWN" /* UNKNOWN */, {
|
|
2965
|
+
url,
|
|
2966
|
+
retryable: false
|
|
2967
|
+
});
|
|
2968
|
+
}
|
|
3033
2969
|
export {
|
|
2970
|
+
AccessDeniedError,
|
|
3034
2971
|
BrowserPool,
|
|
2972
|
+
BrowserPoolError,
|
|
2973
|
+
ClientClosedError,
|
|
2974
|
+
CloudflareError,
|
|
2975
|
+
ContentExtractionError,
|
|
3035
2976
|
Crawler,
|
|
3036
2977
|
DEFAULT_DAEMON_PORT,
|
|
3037
2978
|
DEFAULT_OPTIONS,
|
|
3038
2979
|
DaemonClient,
|
|
3039
2980
|
DaemonServer,
|
|
3040
2981
|
BrowserPool as HeroBrowserPool,
|
|
2982
|
+
InvalidUrlError,
|
|
2983
|
+
NetworkError,
|
|
2984
|
+
NotInitializedError,
|
|
3041
2985
|
ReaderClient,
|
|
2986
|
+
ReaderError,
|
|
2987
|
+
ReaderErrorCode,
|
|
2988
|
+
RobotsBlockedError,
|
|
3042
2989
|
Scraper,
|
|
2990
|
+
TimeoutError,
|
|
2991
|
+
ValidationError,
|
|
3043
2992
|
cleanContent,
|
|
3044
2993
|
crawl,
|
|
3045
2994
|
createHeroConfig,
|
|
@@ -3047,14 +2996,12 @@ export {
|
|
|
3047
2996
|
detectChallenge,
|
|
3048
2997
|
extractMetadata,
|
|
3049
2998
|
formatToHTML,
|
|
3050
|
-
formatToJson,
|
|
3051
|
-
formatToJsonLite,
|
|
3052
2999
|
formatToMarkdown,
|
|
3053
|
-
formatToText,
|
|
3054
3000
|
getDaemonInfo,
|
|
3055
3001
|
getPidFilePath,
|
|
3056
3002
|
getUrlKey,
|
|
3057
3003
|
handleChallenge,
|
|
3004
|
+
htmlToMarkdown,
|
|
3058
3005
|
isChallengePage,
|
|
3059
3006
|
isDaemonRunning,
|
|
3060
3007
|
isSameDomain,
|
|
@@ -3068,6 +3015,7 @@ export {
|
|
|
3068
3015
|
shouldCrawlUrl2 as shouldCrawlUrlFn,
|
|
3069
3016
|
validateUrls,
|
|
3070
3017
|
waitForChallengeResolution,
|
|
3071
|
-
waitForSelector
|
|
3018
|
+
waitForSelector,
|
|
3019
|
+
wrapError
|
|
3072
3020
|
};
|
|
3073
3021
|
//# sourceMappingURL=index.js.map
|