@vakra-dev/reader 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -26
- package/dist/cli/index.js +429 -733
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +646 -714
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli/index.js
CHANGED
|
@@ -18,27 +18,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
18
18
|
import pLimit from "p-limit";
|
|
19
19
|
|
|
20
20
|
// src/cloudflare/detector.ts
|
|
21
|
-
var
|
|
21
|
+
var CLOUDFLARE_CHALLENGE_SELECTORS = [
|
|
22
22
|
"#challenge-running",
|
|
23
23
|
"#challenge-stage",
|
|
24
24
|
"#challenge-form",
|
|
25
|
-
".cf-browser-verification"
|
|
25
|
+
".cf-browser-verification",
|
|
26
|
+
"#cf-wrapper",
|
|
27
|
+
"#cf-hcaptcha-container",
|
|
28
|
+
"#turnstile-wrapper"
|
|
26
29
|
];
|
|
27
|
-
var
|
|
28
|
-
"verifying you are human",
|
|
30
|
+
var CLOUDFLARE_TEXT_PATTERNS = [
|
|
29
31
|
"checking if the site connection is secure",
|
|
30
|
-
"this process is automatic. your browser will redirect"
|
|
32
|
+
"this process is automatic. your browser will redirect",
|
|
33
|
+
"ray id:",
|
|
34
|
+
"performance & security by cloudflare"
|
|
31
35
|
];
|
|
32
|
-
var
|
|
33
|
-
"
|
|
34
|
-
"
|
|
36
|
+
var CLOUDFLARE_INFRA_PATTERNS = [
|
|
37
|
+
"/cdn-cgi/",
|
|
38
|
+
"cloudflare",
|
|
39
|
+
"__cf_bm",
|
|
40
|
+
"cf-ray"
|
|
41
|
+
];
|
|
42
|
+
var CLOUDFLARE_BLOCKED_PATTERNS = [
|
|
35
43
|
"sorry, you have been blocked",
|
|
36
|
-
"
|
|
37
|
-
"403 forbidden"
|
|
44
|
+
"ray id:"
|
|
38
45
|
];
|
|
39
46
|
async function detectChallenge(hero) {
|
|
40
47
|
const signals = [];
|
|
41
48
|
let type = "none";
|
|
49
|
+
let hasCloudflareInfra = false;
|
|
50
|
+
let hasChallengeIndicator = false;
|
|
42
51
|
try {
|
|
43
52
|
if (!hero.document) {
|
|
44
53
|
return {
|
|
@@ -50,30 +59,51 @@ async function detectChallenge(hero) {
|
|
|
50
59
|
}
|
|
51
60
|
const html = await hero.document.documentElement.outerHTML;
|
|
52
61
|
const htmlLower = html.toLowerCase();
|
|
53
|
-
for (const
|
|
54
|
-
if (htmlLower.includes(
|
|
55
|
-
|
|
56
|
-
|
|
62
|
+
for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
|
|
63
|
+
if (htmlLower.includes(pattern)) {
|
|
64
|
+
hasCloudflareInfra = true;
|
|
65
|
+
signals.push(`Cloudflare infra: "${pattern}"`);
|
|
66
|
+
break;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
if (!hasCloudflareInfra) {
|
|
70
|
+
return {
|
|
71
|
+
isChallenge: false,
|
|
72
|
+
type: "none",
|
|
73
|
+
confidence: 0,
|
|
74
|
+
signals: ["No Cloudflare infrastructure detected"]
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
|
|
78
|
+
try {
|
|
79
|
+
const element = await hero.document.querySelector(selector);
|
|
80
|
+
if (element) {
|
|
81
|
+
hasChallengeIndicator = true;
|
|
82
|
+
signals.push(`Challenge element: ${selector}`);
|
|
83
|
+
type = "js_challenge";
|
|
84
|
+
}
|
|
85
|
+
} catch {
|
|
57
86
|
}
|
|
58
87
|
}
|
|
59
|
-
for (const pattern of
|
|
88
|
+
for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
|
|
60
89
|
if (htmlLower.includes(pattern)) {
|
|
90
|
+
hasChallengeIndicator = true;
|
|
61
91
|
signals.push(`Challenge text: "${pattern}"`);
|
|
62
92
|
type = type === "none" ? "js_challenge" : type;
|
|
63
93
|
}
|
|
64
94
|
}
|
|
65
95
|
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
96
|
+
hasChallengeIndicator = true;
|
|
66
97
|
signals.push('Challenge text: "waiting for...to respond"');
|
|
67
98
|
type = type === "none" ? "js_challenge" : type;
|
|
68
99
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
}
|
|
100
|
+
const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
|
|
101
|
+
if (hasBlocked) {
|
|
102
|
+
hasChallengeIndicator = true;
|
|
103
|
+
signals.push("Cloudflare block page detected");
|
|
104
|
+
type = "blocked";
|
|
75
105
|
}
|
|
76
|
-
const isChallenge =
|
|
106
|
+
const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
|
|
77
107
|
const confidence = isChallenge ? 100 : 0;
|
|
78
108
|
return {
|
|
79
109
|
isChallenge,
|
|
@@ -156,84 +186,6 @@ var turndownService = new TurndownService({
|
|
|
156
186
|
linkStyle: "inlined",
|
|
157
187
|
linkReferenceStyle: "full"
|
|
158
188
|
});
|
|
159
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
160
|
-
const sections = [];
|
|
161
|
-
if (includeMetadata) {
|
|
162
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
163
|
-
}
|
|
164
|
-
if (pages.length > 1) {
|
|
165
|
-
sections.push(createMarkdownTOC(pages));
|
|
166
|
-
}
|
|
167
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
168
|
-
return sections.join("\n\n");
|
|
169
|
-
}
|
|
170
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
171
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
172
|
-
const description = website.description || "";
|
|
173
|
-
let header = `# Website Scrape: ${title}
|
|
174
|
-
|
|
175
|
-
`;
|
|
176
|
-
header += `**Base URL:** ${baseUrl}
|
|
177
|
-
`;
|
|
178
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
179
|
-
`;
|
|
180
|
-
header += `**Duration:** ${duration}ms
|
|
181
|
-
`;
|
|
182
|
-
header += `**Total pages:** ${totalPages}
|
|
183
|
-
`;
|
|
184
|
-
if (description) {
|
|
185
|
-
header += `**Description:** ${description}
|
|
186
|
-
`;
|
|
187
|
-
}
|
|
188
|
-
if (website.author) {
|
|
189
|
-
header += `**Author:** ${website.author}
|
|
190
|
-
`;
|
|
191
|
-
}
|
|
192
|
-
if (website.language) {
|
|
193
|
-
header += `**Language:** ${website.language}
|
|
194
|
-
`;
|
|
195
|
-
}
|
|
196
|
-
return header;
|
|
197
|
-
}
|
|
198
|
-
function createMarkdownTOC(pages) {
|
|
199
|
-
let toc = "## Table of Contents\n\n";
|
|
200
|
-
pages.forEach((page, index) => {
|
|
201
|
-
const depth = " ".repeat(page.depth);
|
|
202
|
-
const pageNumber = index + 1;
|
|
203
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
204
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
205
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
206
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
207
|
-
`;
|
|
208
|
-
});
|
|
209
|
-
return toc;
|
|
210
|
-
}
|
|
211
|
-
function createMarkdownPage(page, pageNumber) {
|
|
212
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
213
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
214
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
215
|
-
let pageContent = `---
|
|
216
|
-
|
|
217
|
-
`;
|
|
218
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
219
|
-
|
|
220
|
-
`;
|
|
221
|
-
pageContent += `**URL:** ${page.url}
|
|
222
|
-
`;
|
|
223
|
-
pageContent += `**Title:** ${page.title}
|
|
224
|
-
`;
|
|
225
|
-
pageContent += `**Depth:** ${page.depth}
|
|
226
|
-
`;
|
|
227
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
228
|
-
|
|
229
|
-
`;
|
|
230
|
-
pageContent += `---
|
|
231
|
-
|
|
232
|
-
`;
|
|
233
|
-
const markdown = htmlToMarkdown(page.html);
|
|
234
|
-
pageContent += markdown;
|
|
235
|
-
return pageContent;
|
|
236
|
-
}
|
|
237
189
|
function htmlToMarkdown(html) {
|
|
238
190
|
try {
|
|
239
191
|
return turndownService.turndown(html);
|
|
@@ -242,574 +194,339 @@ function htmlToMarkdown(html) {
|
|
|
242
194
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
243
195
|
}
|
|
244
196
|
}
|
|
245
|
-
function extractDomainFromUrl(url) {
|
|
246
|
-
try {
|
|
247
|
-
return new URL(url).hostname;
|
|
248
|
-
} catch {
|
|
249
|
-
return "Unknown";
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
// src/formatters/html.ts
|
|
254
|
-
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
|
|
255
|
-
const html = `<!DOCTYPE html>
|
|
256
|
-
<html lang="${website.language || "en"}">
|
|
257
|
-
<head>
|
|
258
|
-
<meta charset="${website.charset || "UTF-8"}">
|
|
259
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
260
|
-
<title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
|
|
261
|
-
${generateMetaTags(website)}
|
|
262
|
-
<style>
|
|
263
|
-
${generateCSS()}
|
|
264
|
-
</style>
|
|
265
|
-
</head>
|
|
266
|
-
<body>
|
|
267
|
-
<header class="header">
|
|
268
|
-
<h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
|
|
269
|
-
<div class="meta-info">
|
|
270
|
-
<p><strong>Base URL:</strong> <a href="${escapeHtml(
|
|
271
|
-
baseUrl
|
|
272
|
-
)}" target="_blank">${escapeHtml(baseUrl)}</a></p>
|
|
273
|
-
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
|
|
274
|
-
<p><strong>Duration:</strong> ${duration}ms</p>
|
|
275
|
-
<p><strong>Total pages:</strong> ${pages.length}</p>
|
|
276
|
-
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
|
|
277
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
278
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
279
|
-
</div>
|
|
280
|
-
</header>
|
|
281
|
-
|
|
282
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
283
|
-
|
|
284
|
-
<main class="content">
|
|
285
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
286
|
-
</main>
|
|
287
|
-
|
|
288
|
-
<footer class="footer">
|
|
289
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
290
|
-
</footer>
|
|
291
|
-
|
|
292
|
-
<script>
|
|
293
|
-
${generateJavaScript()}
|
|
294
|
-
</script>
|
|
295
|
-
</body>
|
|
296
|
-
</html>`;
|
|
297
|
-
return html;
|
|
298
|
-
}
|
|
299
|
-
function generateMetaTags(website) {
|
|
300
|
-
const tags = [];
|
|
301
|
-
if (website.description) {
|
|
302
|
-
tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
|
|
303
|
-
}
|
|
304
|
-
if (website.author) {
|
|
305
|
-
tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
|
|
306
|
-
}
|
|
307
|
-
if (website.keywords) {
|
|
308
|
-
tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
|
|
309
|
-
}
|
|
310
|
-
if (website.robots) {
|
|
311
|
-
tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
|
|
312
|
-
}
|
|
313
|
-
if (website.themeColor) {
|
|
314
|
-
tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
|
|
315
|
-
}
|
|
316
|
-
if (website.favicon) {
|
|
317
|
-
tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
|
|
318
|
-
}
|
|
319
|
-
if (website.canonical) {
|
|
320
|
-
tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
|
|
321
|
-
}
|
|
322
|
-
if (website.openGraph) {
|
|
323
|
-
const og = website.openGraph;
|
|
324
|
-
if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
|
|
325
|
-
if (og.description)
|
|
326
|
-
tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
|
|
327
|
-
if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
|
|
328
|
-
if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
|
|
329
|
-
if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
|
|
330
|
-
if (og.siteName)
|
|
331
|
-
tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
|
|
332
|
-
if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
|
|
333
|
-
}
|
|
334
|
-
if (website.twitter) {
|
|
335
|
-
const twitter = website.twitter;
|
|
336
|
-
if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
|
|
337
|
-
if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
|
|
338
|
-
if (twitter.creator)
|
|
339
|
-
tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
|
|
340
|
-
if (twitter.title)
|
|
341
|
-
tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
|
|
342
|
-
if (twitter.description)
|
|
343
|
-
tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
|
|
344
|
-
if (twitter.image)
|
|
345
|
-
tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
|
|
346
|
-
}
|
|
347
|
-
return tags.join("\n ");
|
|
348
|
-
}
|
|
349
|
-
function generateCSS() {
|
|
350
|
-
return `
|
|
351
|
-
* {
|
|
352
|
-
margin: 0;
|
|
353
|
-
padding: 0;
|
|
354
|
-
box-sizing: border-box;
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
body {
|
|
358
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
359
|
-
line-height: 1.6;
|
|
360
|
-
color: #333;
|
|
361
|
-
background-color: #f8f9fa;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
.header {
|
|
365
|
-
background: white;
|
|
366
|
-
padding: 2rem;
|
|
367
|
-
border-bottom: 1px solid #e9ecef;
|
|
368
|
-
margin-bottom: 2rem;
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
.header h1 {
|
|
372
|
-
color: #2c3e50;
|
|
373
|
-
margin-bottom: 1rem;
|
|
374
|
-
font-size: 2rem;
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
.meta-info {
|
|
378
|
-
display: grid;
|
|
379
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
380
|
-
gap: 0.5rem;
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
.meta-info p {
|
|
384
|
-
margin: 0.25rem 0;
|
|
385
|
-
font-size: 0.9rem;
|
|
386
|
-
color: #6c757d;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
.toc {
|
|
390
|
-
background: white;
|
|
391
|
-
padding: 1.5rem;
|
|
392
|
-
margin: 2rem 0;
|
|
393
|
-
border-radius: 8px;
|
|
394
|
-
border: 1px solid #e9ecef;
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
.toc h2 {
|
|
398
|
-
color: #2c3e50;
|
|
399
|
-
margin-bottom: 1rem;
|
|
400
|
-
font-size: 1.25rem;
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
.toc ul {
|
|
404
|
-
list-style: none;
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
.toc li {
|
|
408
|
-
margin: 0.5rem 0;
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
.toc a {
|
|
412
|
-
color: #007bff;
|
|
413
|
-
text-decoration: none;
|
|
414
|
-
transition: color 0.2s;
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
.toc a:hover {
|
|
418
|
-
color: #0056b3;
|
|
419
|
-
text-decoration: underline;
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
.content {
|
|
423
|
-
max-width: 800px;
|
|
424
|
-
margin: 0 auto;
|
|
425
|
-
padding: 0 1rem;
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
.page {
|
|
429
|
-
background: white;
|
|
430
|
-
margin: 2rem 0;
|
|
431
|
-
padding: 2rem;
|
|
432
|
-
border-radius: 8px;
|
|
433
|
-
border: 1px solid #e9ecef;
|
|
434
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
.page-header {
|
|
438
|
-
border-bottom: 2px solid #e9ecef;
|
|
439
|
-
padding-bottom: 1rem;
|
|
440
|
-
margin-bottom: 2rem;
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
.page-header h2 {
|
|
444
|
-
color: #2c3e50;
|
|
445
|
-
margin-bottom: 0.5rem;
|
|
446
|
-
font-size: 1.5rem;
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
.page-meta {
|
|
450
|
-
display: flex;
|
|
451
|
-
flex-wrap: wrap;
|
|
452
|
-
gap: 1rem;
|
|
453
|
-
font-size: 0.9rem;
|
|
454
|
-
color: #6c757d;
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
.page-content {
|
|
458
|
-
line-height: 1.8;
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
462
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
463
|
-
color: #2c3e50;
|
|
464
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
.page-content p {
|
|
468
|
-
margin: 1rem 0;
|
|
469
|
-
}
|
|
470
|
-
|
|
471
|
-
.page-content a {
|
|
472
|
-
color: #007bff;
|
|
473
|
-
text-decoration: none;
|
|
474
|
-
}
|
|
475
|
-
|
|
476
|
-
.page-content a:hover {
|
|
477
|
-
text-decoration: underline;
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
.page-content code {
|
|
481
|
-
background: #f8f9fa;
|
|
482
|
-
padding: 0.2rem 0.4rem;
|
|
483
|
-
border-radius: 4px;
|
|
484
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
485
|
-
font-size: 0.9em;
|
|
486
|
-
}
|
|
487
|
-
|
|
488
|
-
.page-content pre {
|
|
489
|
-
background: #f8f9fa;
|
|
490
|
-
padding: 1rem;
|
|
491
|
-
border-radius: 4px;
|
|
492
|
-
overflow-x: auto;
|
|
493
|
-
margin: 1rem 0;
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
.page-content blockquote {
|
|
497
|
-
border-left: 4px solid #007bff;
|
|
498
|
-
padding-left: 1rem;
|
|
499
|
-
margin: 1rem 0;
|
|
500
|
-
color: #6c757d;
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
.footer {
|
|
504
|
-
text-align: center;
|
|
505
|
-
padding: 2rem;
|
|
506
|
-
margin-top: 3rem;
|
|
507
|
-
border-top: 1px solid #e9ecef;
|
|
508
|
-
color: #6c757d;
|
|
509
|
-
font-size: 0.9rem;
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
@media (max-width: 768px) {
|
|
513
|
-
.header {
|
|
514
|
-
padding: 1rem;
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
.header h1 {
|
|
518
|
-
font-size: 1.5rem;
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
.page {
|
|
522
|
-
padding: 1rem;
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
.page-meta {
|
|
526
|
-
flex-direction: column;
|
|
527
|
-
gap: 0.5rem;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
`.trim();
|
|
531
|
-
}
|
|
532
|
-
function generateTOC(pages) {
|
|
533
|
-
const tocItems = pages.map((page, index) => {
|
|
534
|
-
const pageNumber = index + 1;
|
|
535
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
536
|
-
const id = `page-${pageNumber}`;
|
|
537
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
538
|
-
}).join("\n");
|
|
539
|
-
return `
|
|
540
|
-
<nav class="toc">
|
|
541
|
-
<h2>Table of Contents</h2>
|
|
542
|
-
<ul>
|
|
543
|
-
${tocItems}
|
|
544
|
-
</ul>
|
|
545
|
-
</nav>`;
|
|
546
|
-
}
|
|
547
|
-
function generatePageHTML(page, pageNumber) {
|
|
548
|
-
const id = `page-${pageNumber}`;
|
|
549
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
550
|
-
return `
|
|
551
|
-
<article class="page" id="${id}">
|
|
552
|
-
<div class="page-header">
|
|
553
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
554
|
-
<div class="page-meta">
|
|
555
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
556
|
-
page.url
|
|
557
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
558
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
559
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
560
|
-
</div>
|
|
561
|
-
</div>
|
|
562
|
-
<div class="page-content">
|
|
563
|
-
${page.html}
|
|
564
|
-
</div>
|
|
565
|
-
</article>`;
|
|
566
|
-
}
|
|
567
|
-
function generateJavaScript() {
|
|
568
|
-
return `
|
|
569
|
-
// Smooth scrolling for TOC links
|
|
570
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
571
|
-
anchor.addEventListener('click', function (e) {
|
|
572
|
-
e.preventDefault();
|
|
573
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
574
|
-
if (target) {
|
|
575
|
-
target.scrollIntoView({
|
|
576
|
-
behavior: 'smooth',
|
|
577
|
-
block: 'start'
|
|
578
|
-
});
|
|
579
|
-
}
|
|
580
|
-
});
|
|
581
|
-
});
|
|
582
|
-
|
|
583
|
-
// Highlight current section in TOC
|
|
584
|
-
window.addEventListener('scroll', function() {
|
|
585
|
-
const pages = document.querySelectorAll('.page');
|
|
586
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
587
|
-
|
|
588
|
-
let currentPage = null;
|
|
589
|
-
pages.forEach(page => {
|
|
590
|
-
const rect = page.getBoundingClientRect();
|
|
591
|
-
if (rect.top <= 100) {
|
|
592
|
-
currentPage = page;
|
|
593
|
-
}
|
|
594
|
-
});
|
|
595
|
-
|
|
596
|
-
tocLinks.forEach(link => {
|
|
597
|
-
link.style.fontWeight = 'normal';
|
|
598
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
599
|
-
if (target === currentPage) {
|
|
600
|
-
link.style.fontWeight = 'bold';
|
|
601
|
-
}
|
|
602
|
-
});
|
|
603
|
-
});
|
|
604
|
-
`;
|
|
605
|
-
}
|
|
606
|
-
function escapeHtml(text) {
|
|
607
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#x27;").replace(/\//g, "&#x2F;");
|
|
608
|
-
}
|
|
609
|
-
function extractDomainFromUrl2(url) {
|
|
610
|
-
try {
|
|
611
|
-
return new URL(url).hostname;
|
|
612
|
-
} catch {
|
|
613
|
-
return "Unknown";
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
|
|
617
|
-
// src/formatters/json.ts
|
|
618
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
619
|
-
const jsonResult = {
|
|
620
|
-
metadata: {
|
|
621
|
-
baseUrl,
|
|
622
|
-
totalPages: pages.length,
|
|
623
|
-
scrapedAt,
|
|
624
|
-
duration,
|
|
625
|
-
website
|
|
626
|
-
},
|
|
627
|
-
pages: pages.map((page, index) => ({
|
|
628
|
-
index: index + 1,
|
|
629
|
-
url: page.url,
|
|
630
|
-
title: page.title,
|
|
631
|
-
markdown: page.markdown,
|
|
632
|
-
html: page.html,
|
|
633
|
-
fetchedAt: page.fetchedAt,
|
|
634
|
-
depth: page.depth,
|
|
635
|
-
wordCount: countWords(page.markdown),
|
|
636
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
637
|
-
}))
|
|
638
|
-
};
|
|
639
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
640
|
-
}
|
|
641
|
-
function countWords(markdown) {
|
|
642
|
-
const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
643
|
-
return plainText.split(/\s+/).filter((word) => word.length > 0).length;
|
|
644
|
-
}
|
|
645
|
-
function estimateReadingTime(markdown) {
|
|
646
|
-
const wordCount = countWords(markdown);
|
|
647
|
-
return Math.ceil(wordCount / 200);
|
|
648
|
-
}
|
|
649
|
-
|
|
650
|
-
// src/formatters/text.ts
|
|
651
|
-
import { parseHTML } from "linkedom";
|
|
652
|
-
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
653
|
-
const sections = [];
|
|
654
|
-
if (includeMetadata) {
|
|
655
|
-
sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
656
|
-
}
|
|
657
|
-
sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
|
|
658
|
-
return sections.join("\n\n");
|
|
659
|
-
}
|
|
660
|
-
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
661
|
-
const title = website.title || extractDomainFromUrl3(baseUrl);
|
|
662
|
-
const lines = [];
|
|
663
|
-
lines.push(`=== ${title} ===`);
|
|
664
|
-
lines.push("");
|
|
665
|
-
lines.push(`URL: ${baseUrl}`);
|
|
666
|
-
lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
|
|
667
|
-
lines.push(`Duration: ${duration}ms`);
|
|
668
|
-
lines.push(`Pages: ${totalPages}`);
|
|
669
|
-
if (website.description) {
|
|
670
|
-
lines.push(`Description: ${website.description}`);
|
|
671
|
-
}
|
|
672
|
-
if (website.author) {
|
|
673
|
-
lines.push(`Author: ${website.author}`);
|
|
674
|
-
}
|
|
675
|
-
if (website.language) {
|
|
676
|
-
lines.push(`Language: ${website.language}`);
|
|
677
|
-
}
|
|
678
|
-
return lines.join("\n");
|
|
679
|
-
}
|
|
680
|
-
function createTextPage(page, pageNumber, showSeparator) {
|
|
681
|
-
const lines = [];
|
|
682
|
-
if (showSeparator) {
|
|
683
|
-
lines.push("\u2500".repeat(60));
|
|
684
|
-
lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
|
|
685
|
-
lines.push(`URL: ${page.url}`);
|
|
686
|
-
lines.push("\u2500".repeat(60));
|
|
687
|
-
}
|
|
688
|
-
const plainText = htmlToPlainText(page.html);
|
|
689
|
-
lines.push(plainText);
|
|
690
|
-
return lines.join("\n");
|
|
691
|
-
}
|
|
692
|
-
function htmlToPlainText(html) {
|
|
693
|
-
const { document } = parseHTML(html);
|
|
694
|
-
const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
|
|
695
|
-
elementsToRemove.forEach((tag) => {
|
|
696
|
-
document.querySelectorAll(tag).forEach((el) => el.remove());
|
|
697
|
-
});
|
|
698
|
-
let text = document.body?.textContent || document.documentElement?.textContent || "";
|
|
699
|
-
text = text.replace(/[ \t]+/g, " ");
|
|
700
|
-
text = text.replace(/\n[ \t]+/g, "\n");
|
|
701
|
-
text = text.replace(/[ \t]+\n/g, "\n");
|
|
702
|
-
text = text.replace(/\n{3,}/g, "\n\n");
|
|
703
|
-
text = text.trim();
|
|
704
|
-
return text;
|
|
705
|
-
}
|
|
706
|
-
function extractDomainFromUrl3(url) {
|
|
707
|
-
try {
|
|
708
|
-
return new URL(url).hostname;
|
|
709
|
-
} catch {
|
|
710
|
-
return "Unknown";
|
|
711
|
-
}
|
|
712
|
-
}
|
|
713
197
|
|
|
714
198
|
// src/utils/content-cleaner.ts
|
|
715
|
-
import { parseHTML
|
|
199
|
+
import { parseHTML } from "linkedom";
|
|
716
200
|
var ALWAYS_REMOVE_SELECTORS = [
|
|
717
|
-
// Navigation and menus
|
|
718
|
-
"nav",
|
|
719
|
-
"header nav",
|
|
720
|
-
"footer nav",
|
|
721
|
-
".nav",
|
|
722
|
-
".navigation",
|
|
723
|
-
".menu",
|
|
724
|
-
".navbar",
|
|
725
|
-
".sidebar",
|
|
726
|
-
".aside",
|
|
727
|
-
// Header and footer elements
|
|
728
|
-
"header",
|
|
729
|
-
"footer",
|
|
730
|
-
".site-header",
|
|
731
|
-
".page-header",
|
|
732
|
-
".site-footer",
|
|
733
|
-
".page-footer",
|
|
734
|
-
// Social media and sharing
|
|
735
|
-
".social",
|
|
736
|
-
".share",
|
|
737
|
-
".sharing",
|
|
738
|
-
".twitter",
|
|
739
|
-
".facebook",
|
|
740
|
-
".linkedin",
|
|
741
|
-
".instagram",
|
|
742
|
-
// Comments and discussions
|
|
743
|
-
".comments",
|
|
744
|
-
".comment",
|
|
745
|
-
".discussion",
|
|
746
|
-
".disqus",
|
|
747
|
-
// Forms and interactive elements
|
|
748
|
-
"form",
|
|
749
|
-
"input",
|
|
750
|
-
"button:not([type='submit'])",
|
|
751
|
-
"select",
|
|
752
|
-
"textarea",
|
|
753
201
|
// Scripts and styles
|
|
754
202
|
"script",
|
|
755
203
|
"style",
|
|
756
204
|
"noscript",
|
|
205
|
+
"link[rel='stylesheet']",
|
|
757
206
|
// Hidden elements
|
|
758
207
|
"[hidden]",
|
|
208
|
+
"[aria-hidden='true']",
|
|
759
209
|
"[style*='display: none']",
|
|
760
210
|
"[style*='display:none']",
|
|
761
|
-
|
|
762
|
-
"
|
|
763
|
-
|
|
764
|
-
"
|
|
211
|
+
"[style*='visibility: hidden']",
|
|
212
|
+
"[style*='visibility:hidden']",
|
|
213
|
+
// SVG icons and decorative elements
|
|
214
|
+
"svg[aria-hidden='true']",
|
|
215
|
+
"svg.icon",
|
|
216
|
+
"svg[class*='icon']",
|
|
217
|
+
// Template and metadata
|
|
218
|
+
"template",
|
|
219
|
+
"meta",
|
|
220
|
+
// Embeds that don't convert to text
|
|
221
|
+
"iframe",
|
|
222
|
+
"canvas",
|
|
223
|
+
"object",
|
|
224
|
+
"embed",
|
|
225
|
+
// Forms (usually not main content)
|
|
226
|
+
"form",
|
|
227
|
+
"input",
|
|
228
|
+
"select",
|
|
229
|
+
"textarea",
|
|
230
|
+
"button"
|
|
231
|
+
];
|
|
232
|
+
var OVERLAY_SELECTORS = [
|
|
233
|
+
"[class*='modal']",
|
|
234
|
+
"[class*='popup']",
|
|
235
|
+
"[class*='overlay']",
|
|
236
|
+
"[class*='dialog']",
|
|
237
|
+
"[role='dialog']",
|
|
238
|
+
"[role='alertdialog']",
|
|
239
|
+
"[class*='cookie']",
|
|
240
|
+
"[class*='consent']",
|
|
241
|
+
"[class*='gdpr']",
|
|
242
|
+
"[class*='privacy-banner']",
|
|
243
|
+
"[class*='notification-bar']",
|
|
244
|
+
"[id*='cookie']",
|
|
245
|
+
"[id*='consent']",
|
|
246
|
+
"[id*='gdpr']",
|
|
247
|
+
// Fixed/sticky positioned elements
|
|
248
|
+
"[style*='position: fixed']",
|
|
249
|
+
"[style*='position:fixed']",
|
|
250
|
+
"[style*='position: sticky']",
|
|
251
|
+
"[style*='position:sticky']"
|
|
252
|
+
];
|
|
253
|
+
var NAVIGATION_SELECTORS = [
|
|
254
|
+
// Semantic elements
|
|
255
|
+
"header",
|
|
256
|
+
"footer",
|
|
257
|
+
"nav",
|
|
258
|
+
"aside",
|
|
259
|
+
// Header variations
|
|
260
|
+
".header",
|
|
261
|
+
".top",
|
|
262
|
+
".navbar",
|
|
263
|
+
"#header",
|
|
264
|
+
// Footer variations
|
|
265
|
+
".footer",
|
|
266
|
+
".bottom",
|
|
267
|
+
"#footer",
|
|
268
|
+
// Sidebars
|
|
269
|
+
".sidebar",
|
|
270
|
+
".side",
|
|
271
|
+
".aside",
|
|
272
|
+
"#sidebar",
|
|
273
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
765
274
|
".modal",
|
|
275
|
+
".popup",
|
|
276
|
+
"#modal",
|
|
766
277
|
".overlay",
|
|
767
|
-
|
|
278
|
+
// Ads
|
|
279
|
+
".ad",
|
|
280
|
+
".ads",
|
|
281
|
+
".advert",
|
|
282
|
+
"#ad",
|
|
283
|
+
// Language selectors
|
|
284
|
+
".lang-selector",
|
|
285
|
+
".language",
|
|
286
|
+
"#language-selector",
|
|
287
|
+
// Social
|
|
288
|
+
".social",
|
|
289
|
+
".social-media",
|
|
290
|
+
".social-links",
|
|
291
|
+
"#social",
|
|
292
|
+
// Navigation/menus
|
|
293
|
+
".menu",
|
|
294
|
+
".navigation",
|
|
295
|
+
"#nav",
|
|
768
296
|
// Breadcrumbs
|
|
769
|
-
".breadcrumb",
|
|
770
297
|
".breadcrumbs",
|
|
771
|
-
"
|
|
298
|
+
"#breadcrumbs",
|
|
299
|
+
// Share buttons
|
|
300
|
+
".share",
|
|
301
|
+
"#share",
|
|
302
|
+
// Widgets
|
|
303
|
+
".widget",
|
|
304
|
+
"#widget",
|
|
305
|
+
// Cookie notices (backup)
|
|
306
|
+
".cookie",
|
|
307
|
+
"#cookie"
|
|
308
|
+
];
|
|
309
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
310
|
+
// IDs
|
|
311
|
+
"#main",
|
|
312
|
+
"#content",
|
|
313
|
+
"#main-content",
|
|
314
|
+
"#article",
|
|
315
|
+
"#post",
|
|
316
|
+
"#page-content",
|
|
317
|
+
// Semantic elements
|
|
318
|
+
"main",
|
|
319
|
+
"article",
|
|
320
|
+
"[role='main']",
|
|
321
|
+
// Classes
|
|
322
|
+
".main-content",
|
|
323
|
+
".content",
|
|
324
|
+
".post-content",
|
|
325
|
+
".article-content",
|
|
326
|
+
".entry-content",
|
|
327
|
+
".page-content",
|
|
328
|
+
".article-body",
|
|
329
|
+
".post-body",
|
|
330
|
+
".story-content",
|
|
331
|
+
".blog-content"
|
|
772
332
|
];
|
|
773
333
|
var AD_SELECTORS = [
|
|
774
|
-
//
|
|
775
|
-
".
|
|
776
|
-
".ads",
|
|
777
|
-
".advertisement",
|
|
778
|
-
".promotion",
|
|
779
|
-
".sponsored",
|
|
780
|
-
"[class*='ad-']",
|
|
781
|
-
"[id*='ad-']",
|
|
782
|
-
"[class*='advert']",
|
|
783
|
-
"[id*='advert']",
|
|
784
|
-
"[class*='banner']",
|
|
785
|
-
"[id*='banner']",
|
|
334
|
+
// Google ads
|
|
335
|
+
"ins.adsbygoogle",
|
|
786
336
|
".google-ad",
|
|
787
337
|
".adsense",
|
|
338
|
+
// Generic ad containers
|
|
788
339
|
"[data-ad]",
|
|
789
340
|
"[data-ads]",
|
|
790
|
-
"
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
"
|
|
794
|
-
"
|
|
795
|
-
"
|
|
341
|
+
"[data-ad-slot]",
|
|
342
|
+
"[data-ad-client]",
|
|
343
|
+
// Common ad class patterns
|
|
344
|
+
".ad-container",
|
|
345
|
+
".ad-wrapper",
|
|
346
|
+
".advertisement",
|
|
347
|
+
".sponsored-content",
|
|
348
|
+
// Tracking pixels
|
|
349
|
+
"img[width='1'][height='1']",
|
|
350
|
+
"img[src*='pixel']",
|
|
351
|
+
"img[src*='tracking']",
|
|
352
|
+
"img[src*='analytics']"
|
|
796
353
|
];
|
|
797
|
-
function
|
|
798
|
-
const
|
|
799
|
-
const
|
|
800
|
-
|
|
354
|
+
function getLinkDensity(element) {
|
|
355
|
+
const text = element.textContent || "";
|
|
356
|
+
const textLength = text.trim().length;
|
|
357
|
+
if (textLength === 0) return 1;
|
|
358
|
+
let linkLength = 0;
|
|
359
|
+
element.querySelectorAll("a").forEach((link) => {
|
|
360
|
+
linkLength += (link.textContent || "").trim().length;
|
|
361
|
+
});
|
|
362
|
+
return linkLength / textLength;
|
|
363
|
+
}
|
|
364
|
+
function getContentScore(element) {
|
|
365
|
+
let score = 0;
|
|
366
|
+
const text = element.textContent || "";
|
|
367
|
+
const textLength = text.trim().length;
|
|
368
|
+
score += Math.min(textLength / 100, 50);
|
|
369
|
+
score += element.querySelectorAll("p").length * 3;
|
|
370
|
+
score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
|
|
371
|
+
score += element.querySelectorAll("img").length * 1;
|
|
372
|
+
score -= element.querySelectorAll("a").length * 0.5;
|
|
373
|
+
score -= element.querySelectorAll("li").length * 0.2;
|
|
374
|
+
const linkDensity = getLinkDensity(element);
|
|
375
|
+
if (linkDensity > 0.5) score -= 30;
|
|
376
|
+
else if (linkDensity > 0.3) score -= 15;
|
|
377
|
+
const classAndId = (element.className || "") + " " + (element.id || "");
|
|
378
|
+
if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
|
|
379
|
+
if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
|
|
380
|
+
return score;
|
|
381
|
+
}
|
|
382
|
+
function looksLikeNavigation(element) {
|
|
383
|
+
const linkDensity = getLinkDensity(element);
|
|
384
|
+
if (linkDensity > 0.5) return true;
|
|
385
|
+
const listItems = element.querySelectorAll("li");
|
|
386
|
+
const links = element.querySelectorAll("a");
|
|
387
|
+
if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
|
|
388
|
+
return false;
|
|
389
|
+
}
|
|
390
|
+
function removeElements(document, selectors) {
|
|
391
|
+
for (const selector of selectors) {
|
|
801
392
|
try {
|
|
802
393
|
document.querySelectorAll(selector).forEach((el) => el.remove());
|
|
803
394
|
} catch {
|
|
804
395
|
}
|
|
805
396
|
}
|
|
397
|
+
}
|
|
398
|
+
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
|
|
399
|
+
for (const selector of selectorsToRemove) {
|
|
400
|
+
try {
|
|
401
|
+
document.querySelectorAll(selector).forEach((element) => {
|
|
402
|
+
const isProtected = protectedSelectors.some((ps) => {
|
|
403
|
+
try {
|
|
404
|
+
return element.matches(ps);
|
|
405
|
+
} catch {
|
|
406
|
+
return false;
|
|
407
|
+
}
|
|
408
|
+
});
|
|
409
|
+
if (isProtected) return;
|
|
410
|
+
const containsProtected = protectedSelectors.some((ps) => {
|
|
411
|
+
try {
|
|
412
|
+
return element.querySelector(ps) !== null;
|
|
413
|
+
} catch {
|
|
414
|
+
return false;
|
|
415
|
+
}
|
|
416
|
+
});
|
|
417
|
+
if (containsProtected) return;
|
|
418
|
+
element.remove();
|
|
419
|
+
});
|
|
420
|
+
} catch {
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
function findMainContent(document) {
|
|
425
|
+
const isValidContent = (el) => {
|
|
426
|
+
if (!el) return false;
|
|
427
|
+
const text = el.textContent || "";
|
|
428
|
+
if (text.trim().length < 100) return false;
|
|
429
|
+
if (looksLikeNavigation(el)) return false;
|
|
430
|
+
return true;
|
|
431
|
+
};
|
|
432
|
+
const main = document.querySelector("main");
|
|
433
|
+
if (isValidContent(main) && getLinkDensity(main) < 0.4) {
|
|
434
|
+
return main;
|
|
435
|
+
}
|
|
436
|
+
const roleMain = document.querySelector('[role="main"]');
|
|
437
|
+
if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
|
|
438
|
+
return roleMain;
|
|
439
|
+
}
|
|
440
|
+
const articles = document.querySelectorAll("article");
|
|
441
|
+
if (articles.length === 1 && isValidContent(articles[0])) {
|
|
442
|
+
return articles[0];
|
|
443
|
+
}
|
|
444
|
+
const contentSelectors = [
|
|
445
|
+
"#content",
|
|
446
|
+
"#main-content",
|
|
447
|
+
"#main",
|
|
448
|
+
".content",
|
|
449
|
+
".main-content",
|
|
450
|
+
".post-content",
|
|
451
|
+
".article-content",
|
|
452
|
+
".entry-content",
|
|
453
|
+
".page-content",
|
|
454
|
+
".article-body",
|
|
455
|
+
".post-body",
|
|
456
|
+
".story-content",
|
|
457
|
+
".blog-content"
|
|
458
|
+
];
|
|
459
|
+
for (const selector of contentSelectors) {
|
|
460
|
+
try {
|
|
461
|
+
const el = document.querySelector(selector);
|
|
462
|
+
if (isValidContent(el) && getLinkDensity(el) < 0.4) {
|
|
463
|
+
return el;
|
|
464
|
+
}
|
|
465
|
+
} catch {
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
const candidates = [];
|
|
469
|
+
const containers = document.querySelectorAll("div, section, article");
|
|
470
|
+
containers.forEach((el) => {
|
|
471
|
+
const text = el.textContent || "";
|
|
472
|
+
if (text.trim().length < 200) return;
|
|
473
|
+
const score = getContentScore(el);
|
|
474
|
+
if (score > 0) {
|
|
475
|
+
candidates.push({ el, score });
|
|
476
|
+
}
|
|
477
|
+
});
|
|
478
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
479
|
+
if (candidates.length > 0 && candidates[0].score > 20) {
|
|
480
|
+
return candidates[0].el;
|
|
481
|
+
}
|
|
482
|
+
return null;
|
|
483
|
+
}
|
|
484
|
+
function cleanHtml(html, baseUrl, options = {}) {
|
|
485
|
+
const {
|
|
486
|
+
removeAds = true,
|
|
487
|
+
removeBase64Images = true,
|
|
488
|
+
onlyMainContent = true,
|
|
489
|
+
includeTags,
|
|
490
|
+
excludeTags
|
|
491
|
+
} = options;
|
|
492
|
+
const { document } = parseHTML(html);
|
|
493
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
494
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
806
495
|
if (removeAds) {
|
|
807
|
-
|
|
496
|
+
removeElements(document, AD_SELECTORS);
|
|
497
|
+
}
|
|
498
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
499
|
+
removeElements(document, excludeTags);
|
|
500
|
+
}
|
|
501
|
+
if (onlyMainContent) {
|
|
502
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
503
|
+
const mainContent = findMainContent(document);
|
|
504
|
+
if (mainContent) {
|
|
505
|
+
const body = document.body;
|
|
506
|
+
if (body) {
|
|
507
|
+
const clone = mainContent.cloneNode(true);
|
|
508
|
+
body.innerHTML = "";
|
|
509
|
+
body.appendChild(clone);
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
if (includeTags && includeTags.length > 0) {
|
|
514
|
+
const matchedElements = [];
|
|
515
|
+
for (const selector of includeTags) {
|
|
808
516
|
try {
|
|
809
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
517
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
518
|
+
matchedElements.push(el.cloneNode(true));
|
|
519
|
+
});
|
|
810
520
|
} catch {
|
|
811
521
|
}
|
|
812
522
|
}
|
|
523
|
+
if (matchedElements.length > 0) {
|
|
524
|
+
const body = document.body;
|
|
525
|
+
if (body) {
|
|
526
|
+
body.innerHTML = "";
|
|
527
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
528
|
+
}
|
|
529
|
+
}
|
|
813
530
|
}
|
|
814
531
|
if (removeBase64Images) {
|
|
815
532
|
removeBase64ImagesFromDocument(document);
|
|
@@ -834,7 +551,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
834
551
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
835
552
|
const style = el.getAttribute("style");
|
|
836
553
|
if (style) {
|
|
837
|
-
const cleanedStyle = style.replace(
|
|
554
|
+
const cleanedStyle = style.replace(
|
|
555
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
556
|
+
""
|
|
557
|
+
);
|
|
838
558
|
if (cleanedStyle.trim()) {
|
|
839
559
|
el.setAttribute("style", cleanedStyle);
|
|
840
560
|
} else {
|
|
@@ -871,7 +591,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
871
591
|
}
|
|
872
592
|
|
|
873
593
|
// src/utils/metadata-extractor.ts
|
|
874
|
-
import { parseHTML as
|
|
594
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
875
595
|
|
|
876
596
|
// src/utils/url-helpers.ts
|
|
877
597
|
import { URL as URL2 } from "url";
|
|
@@ -944,8 +664,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
944
664
|
function getUrlKey(url) {
|
|
945
665
|
try {
|
|
946
666
|
const parsedUrl = new URL2(url);
|
|
667
|
+
parsedUrl.hash = "";
|
|
947
668
|
parsedUrl.search = "";
|
|
948
|
-
|
|
669
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
670
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
671
|
+
}
|
|
672
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
673
|
+
parsedUrl.port = "";
|
|
674
|
+
}
|
|
675
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
676
|
+
for (const indexFile of indexFiles) {
|
|
677
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
678
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
679
|
+
break;
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
683
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
684
|
+
normalized = normalized.slice(0, -1);
|
|
685
|
+
}
|
|
686
|
+
return normalized;
|
|
949
687
|
} catch {
|
|
950
688
|
return url.toLowerCase();
|
|
951
689
|
}
|
|
@@ -1012,7 +750,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1012
750
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1013
751
|
}
|
|
1014
752
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1015
|
-
const { document } =
|
|
753
|
+
const { document } = parseHTML2(html);
|
|
1016
754
|
const metadata = {
|
|
1017
755
|
title: null,
|
|
1018
756
|
description: null,
|
|
@@ -1294,13 +1032,15 @@ function isUrlAllowed(url, rules) {
|
|
|
1294
1032
|
var DEFAULT_OPTIONS = {
|
|
1295
1033
|
urls: [],
|
|
1296
1034
|
formats: ["markdown"],
|
|
1297
|
-
includeMetadata: true,
|
|
1298
1035
|
timeoutMs: 3e4,
|
|
1299
1036
|
includePatterns: [],
|
|
1300
1037
|
excludePatterns: [],
|
|
1301
1038
|
// Content cleaning defaults
|
|
1302
1039
|
removeAds: true,
|
|
1303
1040
|
removeBase64Images: true,
|
|
1041
|
+
onlyMainContent: true,
|
|
1042
|
+
includeTags: [],
|
|
1043
|
+
excludeTags: [],
|
|
1304
1044
|
skipTLSVerification: true,
|
|
1305
1045
|
// Batch defaults
|
|
1306
1046
|
batchConcurrency: 1,
|
|
@@ -1457,14 +1197,9 @@ var Scraper = class {
|
|
|
1457
1197
|
} catch {
|
|
1458
1198
|
}
|
|
1459
1199
|
await hero.waitForPaintingStable();
|
|
1460
|
-
let hadChallenge = false;
|
|
1461
|
-
let challengeType = "none";
|
|
1462
|
-
let waitTimeMs = 0;
|
|
1463
1200
|
const initialUrl = await hero.url;
|
|
1464
1201
|
const detection = await detectChallenge(hero);
|
|
1465
1202
|
if (detection.isChallenge) {
|
|
1466
|
-
hadChallenge = true;
|
|
1467
|
-
challengeType = detection.type;
|
|
1468
1203
|
if (this.options.verbose) {
|
|
1469
1204
|
this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
|
|
1470
1205
|
}
|
|
@@ -1474,12 +1209,11 @@ var Scraper = class {
|
|
|
1474
1209
|
verbose: this.options.verbose,
|
|
1475
1210
|
initialUrl
|
|
1476
1211
|
});
|
|
1477
|
-
waitTimeMs = result2.waitedMs;
|
|
1478
1212
|
if (!result2.resolved) {
|
|
1479
1213
|
throw new Error(`Challenge not resolved: ${detection.type}`);
|
|
1480
1214
|
}
|
|
1481
1215
|
if (this.options.verbose) {
|
|
1482
|
-
this.logger.info(`Challenge resolved via ${result2.method} in ${
|
|
1216
|
+
this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
|
|
1483
1217
|
}
|
|
1484
1218
|
}
|
|
1485
1219
|
await this.waitForFinalPage(hero, url, this.options.verbose);
|
|
@@ -1492,45 +1226,18 @@ var Scraper = class {
|
|
|
1492
1226
|
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1493
1227
|
}
|
|
1494
1228
|
}
|
|
1495
|
-
const pageTitle = await hero.document.title;
|
|
1496
1229
|
const html = await hero.document.documentElement.outerHTML;
|
|
1497
1230
|
const cleanedHtml = cleanContent(html, url, {
|
|
1498
1231
|
removeAds: this.options.removeAds,
|
|
1499
|
-
removeBase64Images: this.options.removeBase64Images
|
|
1232
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1233
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1234
|
+
includeTags: this.options.includeTags,
|
|
1235
|
+
excludeTags: this.options.excludeTags
|
|
1500
1236
|
});
|
|
1501
1237
|
const websiteMetadata = extractMetadata(cleanedHtml, url);
|
|
1502
1238
|
const duration = Date.now() - startTime;
|
|
1503
|
-
const
|
|
1504
|
-
const
|
|
1505
|
-
url,
|
|
1506
|
-
title: pageTitle,
|
|
1507
|
-
markdown: "",
|
|
1508
|
-
// Will be set by formatter
|
|
1509
|
-
html: cleanedHtml,
|
|
1510
|
-
fetchedAt: scrapedAt,
|
|
1511
|
-
depth: 0,
|
|
1512
|
-
hadChallenge,
|
|
1513
|
-
challengeType,
|
|
1514
|
-
waitTimeMs
|
|
1515
|
-
};
|
|
1516
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1517
|
-
[page],
|
|
1518
|
-
url,
|
|
1519
|
-
scrapedAt,
|
|
1520
|
-
duration,
|
|
1521
|
-
websiteMetadata,
|
|
1522
|
-
this.options.includeMetadata
|
|
1523
|
-
) : void 0;
|
|
1524
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1525
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1526
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1527
|
-
[page],
|
|
1528
|
-
url,
|
|
1529
|
-
scrapedAt,
|
|
1530
|
-
duration,
|
|
1531
|
-
websiteMetadata,
|
|
1532
|
-
this.options.includeMetadata
|
|
1533
|
-
) : void 0;
|
|
1239
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1240
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1534
1241
|
if (this.options.onProgress) {
|
|
1535
1242
|
this.options.onProgress({
|
|
1536
1243
|
completed: index + 1,
|
|
@@ -1562,8 +1269,6 @@ var Scraper = class {
|
|
|
1562
1269
|
const result = {
|
|
1563
1270
|
markdown,
|
|
1564
1271
|
html: htmlOutput,
|
|
1565
|
-
json,
|
|
1566
|
-
text,
|
|
1567
1272
|
metadata: {
|
|
1568
1273
|
baseUrl: url,
|
|
1569
1274
|
totalPages: 1,
|
|
@@ -1618,7 +1323,7 @@ async function scrape(options) {
|
|
|
1618
1323
|
}
|
|
1619
1324
|
|
|
1620
1325
|
// src/crawler.ts
|
|
1621
|
-
import { parseHTML as
|
|
1326
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1622
1327
|
|
|
1623
1328
|
// src/utils/rate-limiter.ts
|
|
1624
1329
|
import pLimit2 from "p-limit";
|
|
@@ -1767,12 +1472,26 @@ var Crawler = class {
|
|
|
1767
1472
|
*/
|
|
1768
1473
|
extractLinks(html, baseUrl, depth) {
|
|
1769
1474
|
const links = [];
|
|
1770
|
-
const { document } =
|
|
1475
|
+
const { document } = parseHTML3(html);
|
|
1771
1476
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1772
|
-
const
|
|
1477
|
+
const rawHref = anchor.getAttribute("href");
|
|
1478
|
+
if (!rawHref) return;
|
|
1479
|
+
const href = rawHref.trim();
|
|
1773
1480
|
if (!href) return;
|
|
1774
|
-
|
|
1481
|
+
if (href.startsWith("#")) return;
|
|
1482
|
+
const lowerHref = href.toLowerCase();
|
|
1483
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
1484
|
+
return;
|
|
1485
|
+
}
|
|
1486
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
1775
1487
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
1488
|
+
try {
|
|
1489
|
+
const parsed = new URL(resolved);
|
|
1490
|
+
parsed.hash = "";
|
|
1491
|
+
resolved = parsed.toString();
|
|
1492
|
+
} catch {
|
|
1493
|
+
return;
|
|
1494
|
+
}
|
|
1776
1495
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
1777
1496
|
if (!isContentUrl(resolved)) return;
|
|
1778
1497
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -2883,9 +2602,9 @@ program.command("status").description("Check daemon status").option("-p, --port
|
|
|
2883
2602
|
});
|
|
2884
2603
|
program.command("scrape <urls...>").description("Scrape one or more URLs").option(
|
|
2885
2604
|
"-f, --format <formats>",
|
|
2886
|
-
"
|
|
2605
|
+
"Content formats to include (comma-separated: markdown,html)",
|
|
2887
2606
|
"markdown"
|
|
2888
|
-
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--
|
|
2607
|
+
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
|
|
2889
2608
|
const port = parseInt(options.port, 10);
|
|
2890
2609
|
const useStandalone = options.standalone || false;
|
|
2891
2610
|
let useDaemon = false;
|
|
@@ -2902,7 +2621,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2902
2621
|
}) : null;
|
|
2903
2622
|
try {
|
|
2904
2623
|
const formats = options.format.split(",").map((f) => f.trim());
|
|
2905
|
-
const validFormats = ["markdown", "html"
|
|
2624
|
+
const validFormats = ["markdown", "html"];
|
|
2906
2625
|
for (const format of formats) {
|
|
2907
2626
|
if (!validFormats.includes(format)) {
|
|
2908
2627
|
console.error(
|
|
@@ -2915,6 +2634,8 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2915
2634
|
console.error(`Scraping ${urls.length} URL(s)...`);
|
|
2916
2635
|
console.error(`Formats: ${formats.join(", ")}`);
|
|
2917
2636
|
}
|
|
2637
|
+
const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
|
|
2638
|
+
const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
|
|
2918
2639
|
const scrapeOptions = {
|
|
2919
2640
|
urls,
|
|
2920
2641
|
formats,
|
|
@@ -2923,33 +2644,26 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2923
2644
|
batchTimeoutMs: parseInt(options.batchTimeout, 10),
|
|
2924
2645
|
proxy: options.proxy ? { url: options.proxy } : void 0,
|
|
2925
2646
|
userAgent: options.userAgent,
|
|
2926
|
-
includeMetadata: options.metadata !== false,
|
|
2927
2647
|
verbose: options.verbose || false,
|
|
2928
2648
|
showChrome: options.showChrome || false,
|
|
2649
|
+
// Content cleaning options
|
|
2650
|
+
onlyMainContent: options.mainContent !== false,
|
|
2651
|
+
// --no-main-content sets this to false
|
|
2652
|
+
includeTags,
|
|
2653
|
+
excludeTags,
|
|
2929
2654
|
onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
|
|
2930
2655
|
console.error(`[${completed}/${total}] ${currentUrl}`);
|
|
2931
2656
|
} : void 0
|
|
2932
2657
|
};
|
|
2933
2658
|
const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
|
|
2934
|
-
|
|
2935
|
-
for (const site of result.data) {
|
|
2936
|
-
if (formats.includes("markdown") && site.markdown) {
|
|
2937
|
-
output += site.markdown + "\n\n";
|
|
2938
|
-
} else if (formats.includes("text") && site.text) {
|
|
2939
|
-
output += site.text + "\n\n";
|
|
2940
|
-
} else if (formats.includes("html") && site.html) {
|
|
2941
|
-
output += site.html + "\n\n";
|
|
2942
|
-
} else if (formats.includes("json") && site.json) {
|
|
2943
|
-
output += site.json + "\n\n";
|
|
2944
|
-
}
|
|
2945
|
-
}
|
|
2659
|
+
const output = JSON.stringify(result, null, 2);
|
|
2946
2660
|
if (options.output) {
|
|
2947
|
-
writeFileSync(options.output, output
|
|
2661
|
+
writeFileSync(options.output, output);
|
|
2948
2662
|
if (options.verbose) {
|
|
2949
2663
|
console.error(`Output written to ${options.output}`);
|
|
2950
2664
|
}
|
|
2951
2665
|
} else {
|
|
2952
|
-
console.log(output
|
|
2666
|
+
console.log(output);
|
|
2953
2667
|
}
|
|
2954
2668
|
if (options.verbose) {
|
|
2955
2669
|
console.error(`
|
|
@@ -2972,7 +2686,7 @@ Summary:`);
|
|
|
2972
2686
|
}
|
|
2973
2687
|
}
|
|
2974
2688
|
});
|
|
2975
|
-
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "
|
|
2689
|
+
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Content formats when scraping (comma-separated: markdown,html)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
|
|
2976
2690
|
const port = parseInt(options.port, 10);
|
|
2977
2691
|
const useStandalone = options.standalone || false;
|
|
2978
2692
|
let useDaemon = false;
|
|
@@ -3008,38 +2722,20 @@ program.command("crawl <url>").description("Crawl a website to discover and opti
|
|
|
3008
2722
|
verbose: options.verbose || false,
|
|
3009
2723
|
showChrome: options.showChrome || false
|
|
3010
2724
|
};
|
|
3011
|
-
const
|
|
3012
|
-
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
} else if (formats.includes("text") && site.text) {
|
|
3019
|
-
output += site.text + "\n\n";
|
|
3020
|
-
} else if (formats.includes("html") && site.html) {
|
|
3021
|
-
output += site.html + "\n\n";
|
|
3022
|
-
} else if (formats.includes("json") && site.json) {
|
|
3023
|
-
output += site.json + "\n\n";
|
|
3024
|
-
}
|
|
3025
|
-
}
|
|
3026
|
-
} else {
|
|
3027
|
-
output = JSON.stringify(
|
|
3028
|
-
{
|
|
3029
|
-
urls: result.urls,
|
|
3030
|
-
metadata: result.metadata
|
|
3031
|
-
},
|
|
3032
|
-
null,
|
|
3033
|
-
2
|
|
3034
|
-
);
|
|
3035
|
-
}
|
|
2725
|
+
const formats = options.format.split(",").map((f) => f.trim());
|
|
2726
|
+
const crawlOptionsWithFormats = {
|
|
2727
|
+
...crawlOptions,
|
|
2728
|
+
formats
|
|
2729
|
+
};
|
|
2730
|
+
const result = useDaemon ? await daemonClient.crawl(crawlOptionsWithFormats) : await standaloneClient.crawl(crawlOptionsWithFormats);
|
|
2731
|
+
const output = JSON.stringify(result, null, 2);
|
|
3036
2732
|
if (options.output) {
|
|
3037
|
-
writeFileSync(options.output, output
|
|
2733
|
+
writeFileSync(options.output, output);
|
|
3038
2734
|
if (options.verbose) {
|
|
3039
2735
|
console.error(`Output written to ${options.output}`);
|
|
3040
2736
|
}
|
|
3041
2737
|
} else {
|
|
3042
|
-
console.log(output
|
|
2738
|
+
console.log(output);
|
|
3043
2739
|
}
|
|
3044
2740
|
if (options.verbose) {
|
|
3045
2741
|
console.error(`
|