@vakra-dev/reader 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -26
- package/dist/cli/index.js +445 -734
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +663 -715
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli/index.js
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
3
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
4
|
+
}) : x)(function(x) {
|
|
5
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
6
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
7
|
+
});
|
|
2
8
|
|
|
3
9
|
// src/cli/index.ts
|
|
4
10
|
import { Command } from "commander";
|
|
@@ -12,27 +18,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
12
18
|
import pLimit from "p-limit";
|
|
13
19
|
|
|
14
20
|
// src/cloudflare/detector.ts
|
|
15
|
-
var
|
|
21
|
+
var CLOUDFLARE_CHALLENGE_SELECTORS = [
|
|
16
22
|
"#challenge-running",
|
|
17
23
|
"#challenge-stage",
|
|
18
24
|
"#challenge-form",
|
|
19
|
-
".cf-browser-verification"
|
|
25
|
+
".cf-browser-verification",
|
|
26
|
+
"#cf-wrapper",
|
|
27
|
+
"#cf-hcaptcha-container",
|
|
28
|
+
"#turnstile-wrapper"
|
|
20
29
|
];
|
|
21
|
-
var
|
|
22
|
-
"verifying you are human",
|
|
30
|
+
var CLOUDFLARE_TEXT_PATTERNS = [
|
|
23
31
|
"checking if the site connection is secure",
|
|
24
|
-
"this process is automatic. your browser will redirect"
|
|
32
|
+
"this process is automatic. your browser will redirect",
|
|
33
|
+
"ray id:",
|
|
34
|
+
"performance & security by cloudflare"
|
|
35
|
+
];
|
|
36
|
+
var CLOUDFLARE_INFRA_PATTERNS = [
|
|
37
|
+
"/cdn-cgi/",
|
|
38
|
+
"cloudflare",
|
|
39
|
+
"__cf_bm",
|
|
40
|
+
"cf-ray"
|
|
25
41
|
];
|
|
26
|
-
var
|
|
27
|
-
"you have been blocked",
|
|
28
|
-
"access to this page has been denied",
|
|
42
|
+
var CLOUDFLARE_BLOCKED_PATTERNS = [
|
|
29
43
|
"sorry, you have been blocked",
|
|
30
|
-
"
|
|
31
|
-
"403 forbidden"
|
|
44
|
+
"ray id:"
|
|
32
45
|
];
|
|
33
46
|
async function detectChallenge(hero) {
|
|
34
47
|
const signals = [];
|
|
35
48
|
let type = "none";
|
|
49
|
+
let hasCloudflareInfra = false;
|
|
50
|
+
let hasChallengeIndicator = false;
|
|
36
51
|
try {
|
|
37
52
|
if (!hero.document) {
|
|
38
53
|
return {
|
|
@@ -44,30 +59,51 @@ async function detectChallenge(hero) {
|
|
|
44
59
|
}
|
|
45
60
|
const html = await hero.document.documentElement.outerHTML;
|
|
46
61
|
const htmlLower = html.toLowerCase();
|
|
47
|
-
for (const
|
|
48
|
-
if (htmlLower.includes(
|
|
49
|
-
|
|
50
|
-
|
|
62
|
+
for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
|
|
63
|
+
if (htmlLower.includes(pattern)) {
|
|
64
|
+
hasCloudflareInfra = true;
|
|
65
|
+
signals.push(`Cloudflare infra: "${pattern}"`);
|
|
66
|
+
break;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
if (!hasCloudflareInfra) {
|
|
70
|
+
return {
|
|
71
|
+
isChallenge: false,
|
|
72
|
+
type: "none",
|
|
73
|
+
confidence: 0,
|
|
74
|
+
signals: ["No Cloudflare infrastructure detected"]
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
|
|
78
|
+
try {
|
|
79
|
+
const element = await hero.document.querySelector(selector);
|
|
80
|
+
if (element) {
|
|
81
|
+
hasChallengeIndicator = true;
|
|
82
|
+
signals.push(`Challenge element: ${selector}`);
|
|
83
|
+
type = "js_challenge";
|
|
84
|
+
}
|
|
85
|
+
} catch {
|
|
51
86
|
}
|
|
52
87
|
}
|
|
53
|
-
for (const pattern of
|
|
88
|
+
for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
|
|
54
89
|
if (htmlLower.includes(pattern)) {
|
|
90
|
+
hasChallengeIndicator = true;
|
|
55
91
|
signals.push(`Challenge text: "${pattern}"`);
|
|
56
92
|
type = type === "none" ? "js_challenge" : type;
|
|
57
93
|
}
|
|
58
94
|
}
|
|
59
95
|
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
96
|
+
hasChallengeIndicator = true;
|
|
60
97
|
signals.push('Challenge text: "waiting for...to respond"');
|
|
61
98
|
type = type === "none" ? "js_challenge" : type;
|
|
62
99
|
}
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
}
|
|
100
|
+
const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
|
|
101
|
+
if (hasBlocked) {
|
|
102
|
+
hasChallengeIndicator = true;
|
|
103
|
+
signals.push("Cloudflare block page detected");
|
|
104
|
+
type = "blocked";
|
|
69
105
|
}
|
|
70
|
-
const isChallenge =
|
|
106
|
+
const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
|
|
71
107
|
const confidence = isChallenge ? 100 : 0;
|
|
72
108
|
return {
|
|
73
109
|
isChallenge,
|
|
@@ -150,84 +186,6 @@ var turndownService = new TurndownService({
|
|
|
150
186
|
linkStyle: "inlined",
|
|
151
187
|
linkReferenceStyle: "full"
|
|
152
188
|
});
|
|
153
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
154
|
-
const sections = [];
|
|
155
|
-
if (includeMetadata) {
|
|
156
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
157
|
-
}
|
|
158
|
-
if (pages.length > 1) {
|
|
159
|
-
sections.push(createMarkdownTOC(pages));
|
|
160
|
-
}
|
|
161
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
162
|
-
return sections.join("\n\n");
|
|
163
|
-
}
|
|
164
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
165
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
166
|
-
const description = website.description || "";
|
|
167
|
-
let header = `# Website Scrape: ${title}
|
|
168
|
-
|
|
169
|
-
`;
|
|
170
|
-
header += `**Base URL:** ${baseUrl}
|
|
171
|
-
`;
|
|
172
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
173
|
-
`;
|
|
174
|
-
header += `**Duration:** ${duration}ms
|
|
175
|
-
`;
|
|
176
|
-
header += `**Total pages:** ${totalPages}
|
|
177
|
-
`;
|
|
178
|
-
if (description) {
|
|
179
|
-
header += `**Description:** ${description}
|
|
180
|
-
`;
|
|
181
|
-
}
|
|
182
|
-
if (website.author) {
|
|
183
|
-
header += `**Author:** ${website.author}
|
|
184
|
-
`;
|
|
185
|
-
}
|
|
186
|
-
if (website.language) {
|
|
187
|
-
header += `**Language:** ${website.language}
|
|
188
|
-
`;
|
|
189
|
-
}
|
|
190
|
-
return header;
|
|
191
|
-
}
|
|
192
|
-
function createMarkdownTOC(pages) {
|
|
193
|
-
let toc = "## Table of Contents\n\n";
|
|
194
|
-
pages.forEach((page, index) => {
|
|
195
|
-
const depth = " ".repeat(page.depth);
|
|
196
|
-
const pageNumber = index + 1;
|
|
197
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
198
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
199
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
200
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
201
|
-
`;
|
|
202
|
-
});
|
|
203
|
-
return toc;
|
|
204
|
-
}
|
|
205
|
-
function createMarkdownPage(page, pageNumber) {
|
|
206
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
207
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
208
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
209
|
-
let pageContent = `---
|
|
210
|
-
|
|
211
|
-
`;
|
|
212
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
213
|
-
|
|
214
|
-
`;
|
|
215
|
-
pageContent += `**URL:** ${page.url}
|
|
216
|
-
`;
|
|
217
|
-
pageContent += `**Title:** ${page.title}
|
|
218
|
-
`;
|
|
219
|
-
pageContent += `**Depth:** ${page.depth}
|
|
220
|
-
`;
|
|
221
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
222
|
-
|
|
223
|
-
`;
|
|
224
|
-
pageContent += `---
|
|
225
|
-
|
|
226
|
-
`;
|
|
227
|
-
const markdown = htmlToMarkdown(page.html);
|
|
228
|
-
pageContent += markdown;
|
|
229
|
-
return pageContent;
|
|
230
|
-
}
|
|
231
189
|
function htmlToMarkdown(html) {
|
|
232
190
|
try {
|
|
233
191
|
return turndownService.turndown(html);
|
|
@@ -236,574 +194,339 @@ function htmlToMarkdown(html) {
|
|
|
236
194
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
237
195
|
}
|
|
238
196
|
}
|
|
239
|
-
function extractDomainFromUrl(url) {
|
|
240
|
-
try {
|
|
241
|
-
return new URL(url).hostname;
|
|
242
|
-
} catch {
|
|
243
|
-
return "Unknown";
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
// src/formatters/html.ts
|
|
248
|
-
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
|
|
249
|
-
const html = `<!DOCTYPE html>
|
|
250
|
-
<html lang="${website.language || "en"}">
|
|
251
|
-
<head>
|
|
252
|
-
<meta charset="${website.charset || "UTF-8"}">
|
|
253
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
254
|
-
<title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
|
|
255
|
-
${generateMetaTags(website)}
|
|
256
|
-
<style>
|
|
257
|
-
${generateCSS()}
|
|
258
|
-
</style>
|
|
259
|
-
</head>
|
|
260
|
-
<body>
|
|
261
|
-
<header class="header">
|
|
262
|
-
<h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
|
|
263
|
-
<div class="meta-info">
|
|
264
|
-
<p><strong>Base URL:</strong> <a href="${escapeHtml(
|
|
265
|
-
baseUrl
|
|
266
|
-
)}" target="_blank">${escapeHtml(baseUrl)}</a></p>
|
|
267
|
-
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
|
|
268
|
-
<p><strong>Duration:</strong> ${duration}ms</p>
|
|
269
|
-
<p><strong>Total pages:</strong> ${pages.length}</p>
|
|
270
|
-
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
|
|
271
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
272
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
273
|
-
</div>
|
|
274
|
-
</header>
|
|
275
|
-
|
|
276
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
277
|
-
|
|
278
|
-
<main class="content">
|
|
279
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
280
|
-
</main>
|
|
281
|
-
|
|
282
|
-
<footer class="footer">
|
|
283
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
284
|
-
</footer>
|
|
285
|
-
|
|
286
|
-
<script>
|
|
287
|
-
${generateJavaScript()}
|
|
288
|
-
</script>
|
|
289
|
-
</body>
|
|
290
|
-
</html>`;
|
|
291
|
-
return html;
|
|
292
|
-
}
|
|
293
|
-
function generateMetaTags(website) {
|
|
294
|
-
const tags = [];
|
|
295
|
-
if (website.description) {
|
|
296
|
-
tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
|
|
297
|
-
}
|
|
298
|
-
if (website.author) {
|
|
299
|
-
tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
|
|
300
|
-
}
|
|
301
|
-
if (website.keywords) {
|
|
302
|
-
tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
|
|
303
|
-
}
|
|
304
|
-
if (website.robots) {
|
|
305
|
-
tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
|
|
306
|
-
}
|
|
307
|
-
if (website.themeColor) {
|
|
308
|
-
tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
|
|
309
|
-
}
|
|
310
|
-
if (website.favicon) {
|
|
311
|
-
tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
|
|
312
|
-
}
|
|
313
|
-
if (website.canonical) {
|
|
314
|
-
tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
|
|
315
|
-
}
|
|
316
|
-
if (website.openGraph) {
|
|
317
|
-
const og = website.openGraph;
|
|
318
|
-
if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
|
|
319
|
-
if (og.description)
|
|
320
|
-
tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
|
|
321
|
-
if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
|
|
322
|
-
if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
|
|
323
|
-
if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
|
|
324
|
-
if (og.siteName)
|
|
325
|
-
tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
|
|
326
|
-
if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
|
|
327
|
-
}
|
|
328
|
-
if (website.twitter) {
|
|
329
|
-
const twitter = website.twitter;
|
|
330
|
-
if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
|
|
331
|
-
if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
|
|
332
|
-
if (twitter.creator)
|
|
333
|
-
tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
|
|
334
|
-
if (twitter.title)
|
|
335
|
-
tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
|
|
336
|
-
if (twitter.description)
|
|
337
|
-
tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
|
|
338
|
-
if (twitter.image)
|
|
339
|
-
tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
|
|
340
|
-
}
|
|
341
|
-
return tags.join("\n ");
|
|
342
|
-
}
|
|
343
|
-
function generateCSS() {
|
|
344
|
-
return `
|
|
345
|
-
* {
|
|
346
|
-
margin: 0;
|
|
347
|
-
padding: 0;
|
|
348
|
-
box-sizing: border-box;
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
body {
|
|
352
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
353
|
-
line-height: 1.6;
|
|
354
|
-
color: #333;
|
|
355
|
-
background-color: #f8f9fa;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
.header {
|
|
359
|
-
background: white;
|
|
360
|
-
padding: 2rem;
|
|
361
|
-
border-bottom: 1px solid #e9ecef;
|
|
362
|
-
margin-bottom: 2rem;
|
|
363
|
-
}
|
|
364
|
-
|
|
365
|
-
.header h1 {
|
|
366
|
-
color: #2c3e50;
|
|
367
|
-
margin-bottom: 1rem;
|
|
368
|
-
font-size: 2rem;
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
.meta-info {
|
|
372
|
-
display: grid;
|
|
373
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
374
|
-
gap: 0.5rem;
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
.meta-info p {
|
|
378
|
-
margin: 0.25rem 0;
|
|
379
|
-
font-size: 0.9rem;
|
|
380
|
-
color: #6c757d;
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
.toc {
|
|
384
|
-
background: white;
|
|
385
|
-
padding: 1.5rem;
|
|
386
|
-
margin: 2rem 0;
|
|
387
|
-
border-radius: 8px;
|
|
388
|
-
border: 1px solid #e9ecef;
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
.toc h2 {
|
|
392
|
-
color: #2c3e50;
|
|
393
|
-
margin-bottom: 1rem;
|
|
394
|
-
font-size: 1.25rem;
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
.toc ul {
|
|
398
|
-
list-style: none;
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
.toc li {
|
|
402
|
-
margin: 0.5rem 0;
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
.toc a {
|
|
406
|
-
color: #007bff;
|
|
407
|
-
text-decoration: none;
|
|
408
|
-
transition: color 0.2s;
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
.toc a:hover {
|
|
412
|
-
color: #0056b3;
|
|
413
|
-
text-decoration: underline;
|
|
414
|
-
}
|
|
415
|
-
|
|
416
|
-
.content {
|
|
417
|
-
max-width: 800px;
|
|
418
|
-
margin: 0 auto;
|
|
419
|
-
padding: 0 1rem;
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
.page {
|
|
423
|
-
background: white;
|
|
424
|
-
margin: 2rem 0;
|
|
425
|
-
padding: 2rem;
|
|
426
|
-
border-radius: 8px;
|
|
427
|
-
border: 1px solid #e9ecef;
|
|
428
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
.page-header {
|
|
432
|
-
border-bottom: 2px solid #e9ecef;
|
|
433
|
-
padding-bottom: 1rem;
|
|
434
|
-
margin-bottom: 2rem;
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
.page-header h2 {
|
|
438
|
-
color: #2c3e50;
|
|
439
|
-
margin-bottom: 0.5rem;
|
|
440
|
-
font-size: 1.5rem;
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
.page-meta {
|
|
444
|
-
display: flex;
|
|
445
|
-
flex-wrap: wrap;
|
|
446
|
-
gap: 1rem;
|
|
447
|
-
font-size: 0.9rem;
|
|
448
|
-
color: #6c757d;
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
.page-content {
|
|
452
|
-
line-height: 1.8;
|
|
453
|
-
}
|
|
454
|
-
|
|
455
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
456
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
457
|
-
color: #2c3e50;
|
|
458
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
.page-content p {
|
|
462
|
-
margin: 1rem 0;
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
.page-content a {
|
|
466
|
-
color: #007bff;
|
|
467
|
-
text-decoration: none;
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
.page-content a:hover {
|
|
471
|
-
text-decoration: underline;
|
|
472
|
-
}
|
|
473
|
-
|
|
474
|
-
.page-content code {
|
|
475
|
-
background: #f8f9fa;
|
|
476
|
-
padding: 0.2rem 0.4rem;
|
|
477
|
-
border-radius: 4px;
|
|
478
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
479
|
-
font-size: 0.9em;
|
|
480
|
-
}
|
|
481
|
-
|
|
482
|
-
.page-content pre {
|
|
483
|
-
background: #f8f9fa;
|
|
484
|
-
padding: 1rem;
|
|
485
|
-
border-radius: 4px;
|
|
486
|
-
overflow-x: auto;
|
|
487
|
-
margin: 1rem 0;
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
.page-content blockquote {
|
|
491
|
-
border-left: 4px solid #007bff;
|
|
492
|
-
padding-left: 1rem;
|
|
493
|
-
margin: 1rem 0;
|
|
494
|
-
color: #6c757d;
|
|
495
|
-
}
|
|
496
|
-
|
|
497
|
-
.footer {
|
|
498
|
-
text-align: center;
|
|
499
|
-
padding: 2rem;
|
|
500
|
-
margin-top: 3rem;
|
|
501
|
-
border-top: 1px solid #e9ecef;
|
|
502
|
-
color: #6c757d;
|
|
503
|
-
font-size: 0.9rem;
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
@media (max-width: 768px) {
|
|
507
|
-
.header {
|
|
508
|
-
padding: 1rem;
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
.header h1 {
|
|
512
|
-
font-size: 1.5rem;
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
.page {
|
|
516
|
-
padding: 1rem;
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
.page-meta {
|
|
520
|
-
flex-direction: column;
|
|
521
|
-
gap: 0.5rem;
|
|
522
|
-
}
|
|
523
|
-
}
|
|
524
|
-
`.trim();
|
|
525
|
-
}
|
|
526
|
-
function generateTOC(pages) {
|
|
527
|
-
const tocItems = pages.map((page, index) => {
|
|
528
|
-
const pageNumber = index + 1;
|
|
529
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
530
|
-
const id = `page-${pageNumber}`;
|
|
531
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
532
|
-
}).join("\n");
|
|
533
|
-
return `
|
|
534
|
-
<nav class="toc">
|
|
535
|
-
<h2>Table of Contents</h2>
|
|
536
|
-
<ul>
|
|
537
|
-
${tocItems}
|
|
538
|
-
</ul>
|
|
539
|
-
</nav>`;
|
|
540
|
-
}
|
|
541
|
-
function generatePageHTML(page, pageNumber) {
|
|
542
|
-
const id = `page-${pageNumber}`;
|
|
543
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
544
|
-
return `
|
|
545
|
-
<article class="page" id="${id}">
|
|
546
|
-
<div class="page-header">
|
|
547
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
548
|
-
<div class="page-meta">
|
|
549
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
550
|
-
page.url
|
|
551
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
552
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
553
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
554
|
-
</div>
|
|
555
|
-
</div>
|
|
556
|
-
<div class="page-content">
|
|
557
|
-
${page.html}
|
|
558
|
-
</div>
|
|
559
|
-
</article>`;
|
|
560
|
-
}
|
|
561
|
-
function generateJavaScript() {
|
|
562
|
-
return `
|
|
563
|
-
// Smooth scrolling for TOC links
|
|
564
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
565
|
-
anchor.addEventListener('click', function (e) {
|
|
566
|
-
e.preventDefault();
|
|
567
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
568
|
-
if (target) {
|
|
569
|
-
target.scrollIntoView({
|
|
570
|
-
behavior: 'smooth',
|
|
571
|
-
block: 'start'
|
|
572
|
-
});
|
|
573
|
-
}
|
|
574
|
-
});
|
|
575
|
-
});
|
|
576
|
-
|
|
577
|
-
// Highlight current section in TOC
|
|
578
|
-
window.addEventListener('scroll', function() {
|
|
579
|
-
const pages = document.querySelectorAll('.page');
|
|
580
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
581
|
-
|
|
582
|
-
let currentPage = null;
|
|
583
|
-
pages.forEach(page => {
|
|
584
|
-
const rect = page.getBoundingClientRect();
|
|
585
|
-
if (rect.top <= 100) {
|
|
586
|
-
currentPage = page;
|
|
587
|
-
}
|
|
588
|
-
});
|
|
589
|
-
|
|
590
|
-
tocLinks.forEach(link => {
|
|
591
|
-
link.style.fontWeight = 'normal';
|
|
592
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
593
|
-
if (target === currentPage) {
|
|
594
|
-
link.style.fontWeight = 'bold';
|
|
595
|
-
}
|
|
596
|
-
});
|
|
597
|
-
});
|
|
598
|
-
`;
|
|
599
|
-
}
|
|
600
|
-
function escapeHtml(text) {
|
|
601
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#x27;").replace(/\//g, "&#x2F;");
|
|
602
|
-
}
|
|
603
|
-
function extractDomainFromUrl2(url) {
|
|
604
|
-
try {
|
|
605
|
-
return new URL(url).hostname;
|
|
606
|
-
} catch {
|
|
607
|
-
return "Unknown";
|
|
608
|
-
}
|
|
609
|
-
}
|
|
610
|
-
|
|
611
|
-
// src/formatters/json.ts
|
|
612
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
613
|
-
const jsonResult = {
|
|
614
|
-
metadata: {
|
|
615
|
-
baseUrl,
|
|
616
|
-
totalPages: pages.length,
|
|
617
|
-
scrapedAt,
|
|
618
|
-
duration,
|
|
619
|
-
website
|
|
620
|
-
},
|
|
621
|
-
pages: pages.map((page, index) => ({
|
|
622
|
-
index: index + 1,
|
|
623
|
-
url: page.url,
|
|
624
|
-
title: page.title,
|
|
625
|
-
markdown: page.markdown,
|
|
626
|
-
html: page.html,
|
|
627
|
-
fetchedAt: page.fetchedAt,
|
|
628
|
-
depth: page.depth,
|
|
629
|
-
wordCount: countWords(page.markdown),
|
|
630
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
631
|
-
}))
|
|
632
|
-
};
|
|
633
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
634
|
-
}
|
|
635
|
-
function countWords(markdown) {
|
|
636
|
-
const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
637
|
-
return plainText.split(/\s+/).filter((word) => word.length > 0).length;
|
|
638
|
-
}
|
|
639
|
-
function estimateReadingTime(markdown) {
|
|
640
|
-
const wordCount = countWords(markdown);
|
|
641
|
-
return Math.ceil(wordCount / 200);
|
|
642
|
-
}
|
|
643
|
-
|
|
644
|
-
// src/formatters/text.ts
|
|
645
|
-
import { parseHTML } from "linkedom";
|
|
646
|
-
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
647
|
-
const sections = [];
|
|
648
|
-
if (includeMetadata) {
|
|
649
|
-
sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
650
|
-
}
|
|
651
|
-
sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
|
|
652
|
-
return sections.join("\n\n");
|
|
653
|
-
}
|
|
654
|
-
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
655
|
-
const title = website.title || extractDomainFromUrl3(baseUrl);
|
|
656
|
-
const lines = [];
|
|
657
|
-
lines.push(`=== ${title} ===`);
|
|
658
|
-
lines.push("");
|
|
659
|
-
lines.push(`URL: ${baseUrl}`);
|
|
660
|
-
lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
|
|
661
|
-
lines.push(`Duration: ${duration}ms`);
|
|
662
|
-
lines.push(`Pages: ${totalPages}`);
|
|
663
|
-
if (website.description) {
|
|
664
|
-
lines.push(`Description: ${website.description}`);
|
|
665
|
-
}
|
|
666
|
-
if (website.author) {
|
|
667
|
-
lines.push(`Author: ${website.author}`);
|
|
668
|
-
}
|
|
669
|
-
if (website.language) {
|
|
670
|
-
lines.push(`Language: ${website.language}`);
|
|
671
|
-
}
|
|
672
|
-
return lines.join("\n");
|
|
673
|
-
}
|
|
674
|
-
function createTextPage(page, pageNumber, showSeparator) {
|
|
675
|
-
const lines = [];
|
|
676
|
-
if (showSeparator) {
|
|
677
|
-
lines.push("\u2500".repeat(60));
|
|
678
|
-
lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
|
|
679
|
-
lines.push(`URL: ${page.url}`);
|
|
680
|
-
lines.push("\u2500".repeat(60));
|
|
681
|
-
}
|
|
682
|
-
const plainText = htmlToPlainText(page.html);
|
|
683
|
-
lines.push(plainText);
|
|
684
|
-
return lines.join("\n");
|
|
685
|
-
}
|
|
686
|
-
function htmlToPlainText(html) {
|
|
687
|
-
const { document } = parseHTML(html);
|
|
688
|
-
const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
|
|
689
|
-
elementsToRemove.forEach((tag) => {
|
|
690
|
-
document.querySelectorAll(tag).forEach((el) => el.remove());
|
|
691
|
-
});
|
|
692
|
-
let text = document.body?.textContent || document.documentElement?.textContent || "";
|
|
693
|
-
text = text.replace(/[ \t]+/g, " ");
|
|
694
|
-
text = text.replace(/\n[ \t]+/g, "\n");
|
|
695
|
-
text = text.replace(/[ \t]+\n/g, "\n");
|
|
696
|
-
text = text.replace(/\n{3,}/g, "\n\n");
|
|
697
|
-
text = text.trim();
|
|
698
|
-
return text;
|
|
699
|
-
}
|
|
700
|
-
function extractDomainFromUrl3(url) {
|
|
701
|
-
try {
|
|
702
|
-
return new URL(url).hostname;
|
|
703
|
-
} catch {
|
|
704
|
-
return "Unknown";
|
|
705
|
-
}
|
|
706
|
-
}
|
|
707
197
|
|
|
708
198
|
// src/utils/content-cleaner.ts
|
|
709
|
-
import { parseHTML
|
|
199
|
+
import { parseHTML } from "linkedom";
|
|
710
200
|
var ALWAYS_REMOVE_SELECTORS = [
|
|
711
|
-
// Navigation and menus
|
|
712
|
-
"nav",
|
|
713
|
-
"header nav",
|
|
714
|
-
"footer nav",
|
|
715
|
-
".nav",
|
|
716
|
-
".navigation",
|
|
717
|
-
".menu",
|
|
718
|
-
".navbar",
|
|
719
|
-
".sidebar",
|
|
720
|
-
".aside",
|
|
721
|
-
// Header and footer elements
|
|
722
|
-
"header",
|
|
723
|
-
"footer",
|
|
724
|
-
".site-header",
|
|
725
|
-
".page-header",
|
|
726
|
-
".site-footer",
|
|
727
|
-
".page-footer",
|
|
728
|
-
// Social media and sharing
|
|
729
|
-
".social",
|
|
730
|
-
".share",
|
|
731
|
-
".sharing",
|
|
732
|
-
".twitter",
|
|
733
|
-
".facebook",
|
|
734
|
-
".linkedin",
|
|
735
|
-
".instagram",
|
|
736
|
-
// Comments and discussions
|
|
737
|
-
".comments",
|
|
738
|
-
".comment",
|
|
739
|
-
".discussion",
|
|
740
|
-
".disqus",
|
|
741
|
-
// Forms and interactive elements
|
|
742
|
-
"form",
|
|
743
|
-
"input",
|
|
744
|
-
"button:not([type='submit'])",
|
|
745
|
-
"select",
|
|
746
|
-
"textarea",
|
|
747
201
|
// Scripts and styles
|
|
748
202
|
"script",
|
|
749
203
|
"style",
|
|
750
204
|
"noscript",
|
|
205
|
+
"link[rel='stylesheet']",
|
|
751
206
|
// Hidden elements
|
|
752
207
|
"[hidden]",
|
|
208
|
+
"[aria-hidden='true']",
|
|
753
209
|
"[style*='display: none']",
|
|
754
210
|
"[style*='display:none']",
|
|
755
|
-
|
|
756
|
-
"
|
|
757
|
-
|
|
758
|
-
"
|
|
211
|
+
"[style*='visibility: hidden']",
|
|
212
|
+
"[style*='visibility:hidden']",
|
|
213
|
+
// SVG icons and decorative elements
|
|
214
|
+
"svg[aria-hidden='true']",
|
|
215
|
+
"svg.icon",
|
|
216
|
+
"svg[class*='icon']",
|
|
217
|
+
// Template and metadata
|
|
218
|
+
"template",
|
|
219
|
+
"meta",
|
|
220
|
+
// Embeds that don't convert to text
|
|
221
|
+
"iframe",
|
|
222
|
+
"canvas",
|
|
223
|
+
"object",
|
|
224
|
+
"embed",
|
|
225
|
+
// Forms (usually not main content)
|
|
226
|
+
"form",
|
|
227
|
+
"input",
|
|
228
|
+
"select",
|
|
229
|
+
"textarea",
|
|
230
|
+
"button"
|
|
231
|
+
];
|
|
232
|
+
var OVERLAY_SELECTORS = [
|
|
233
|
+
"[class*='modal']",
|
|
234
|
+
"[class*='popup']",
|
|
235
|
+
"[class*='overlay']",
|
|
236
|
+
"[class*='dialog']",
|
|
237
|
+
"[role='dialog']",
|
|
238
|
+
"[role='alertdialog']",
|
|
239
|
+
"[class*='cookie']",
|
|
240
|
+
"[class*='consent']",
|
|
241
|
+
"[class*='gdpr']",
|
|
242
|
+
"[class*='privacy-banner']",
|
|
243
|
+
"[class*='notification-bar']",
|
|
244
|
+
"[id*='cookie']",
|
|
245
|
+
"[id*='consent']",
|
|
246
|
+
"[id*='gdpr']",
|
|
247
|
+
// Fixed/sticky positioned elements
|
|
248
|
+
"[style*='position: fixed']",
|
|
249
|
+
"[style*='position:fixed']",
|
|
250
|
+
"[style*='position: sticky']",
|
|
251
|
+
"[style*='position:sticky']"
|
|
252
|
+
];
|
|
253
|
+
var NAVIGATION_SELECTORS = [
|
|
254
|
+
// Semantic elements
|
|
255
|
+
"header",
|
|
256
|
+
"footer",
|
|
257
|
+
"nav",
|
|
258
|
+
"aside",
|
|
259
|
+
// Header variations
|
|
260
|
+
".header",
|
|
261
|
+
".top",
|
|
262
|
+
".navbar",
|
|
263
|
+
"#header",
|
|
264
|
+
// Footer variations
|
|
265
|
+
".footer",
|
|
266
|
+
".bottom",
|
|
267
|
+
"#footer",
|
|
268
|
+
// Sidebars
|
|
269
|
+
".sidebar",
|
|
270
|
+
".side",
|
|
271
|
+
".aside",
|
|
272
|
+
"#sidebar",
|
|
273
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
759
274
|
".modal",
|
|
275
|
+
".popup",
|
|
276
|
+
"#modal",
|
|
760
277
|
".overlay",
|
|
761
|
-
|
|
278
|
+
// Ads
|
|
279
|
+
".ad",
|
|
280
|
+
".ads",
|
|
281
|
+
".advert",
|
|
282
|
+
"#ad",
|
|
283
|
+
// Language selectors
|
|
284
|
+
".lang-selector",
|
|
285
|
+
".language",
|
|
286
|
+
"#language-selector",
|
|
287
|
+
// Social
|
|
288
|
+
".social",
|
|
289
|
+
".social-media",
|
|
290
|
+
".social-links",
|
|
291
|
+
"#social",
|
|
292
|
+
// Navigation/menus
|
|
293
|
+
".menu",
|
|
294
|
+
".navigation",
|
|
295
|
+
"#nav",
|
|
762
296
|
// Breadcrumbs
|
|
763
|
-
".breadcrumb",
|
|
764
297
|
".breadcrumbs",
|
|
765
|
-
"
|
|
298
|
+
"#breadcrumbs",
|
|
299
|
+
// Share buttons
|
|
300
|
+
".share",
|
|
301
|
+
"#share",
|
|
302
|
+
// Widgets
|
|
303
|
+
".widget",
|
|
304
|
+
"#widget",
|
|
305
|
+
// Cookie notices (backup)
|
|
306
|
+
".cookie",
|
|
307
|
+
"#cookie"
|
|
308
|
+
];
|
|
309
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
310
|
+
// IDs
|
|
311
|
+
"#main",
|
|
312
|
+
"#content",
|
|
313
|
+
"#main-content",
|
|
314
|
+
"#article",
|
|
315
|
+
"#post",
|
|
316
|
+
"#page-content",
|
|
317
|
+
// Semantic elements
|
|
318
|
+
"main",
|
|
319
|
+
"article",
|
|
320
|
+
"[role='main']",
|
|
321
|
+
// Classes
|
|
322
|
+
".main-content",
|
|
323
|
+
".content",
|
|
324
|
+
".post-content",
|
|
325
|
+
".article-content",
|
|
326
|
+
".entry-content",
|
|
327
|
+
".page-content",
|
|
328
|
+
".article-body",
|
|
329
|
+
".post-body",
|
|
330
|
+
".story-content",
|
|
331
|
+
".blog-content"
|
|
766
332
|
];
|
|
767
333
|
var AD_SELECTORS = [
|
|
768
|
-
//
|
|
769
|
-
".
|
|
770
|
-
".ads",
|
|
771
|
-
".advertisement",
|
|
772
|
-
".promotion",
|
|
773
|
-
".sponsored",
|
|
774
|
-
"[class*='ad-']",
|
|
775
|
-
"[id*='ad-']",
|
|
776
|
-
"[class*='advert']",
|
|
777
|
-
"[id*='advert']",
|
|
778
|
-
"[class*='banner']",
|
|
779
|
-
"[id*='banner']",
|
|
334
|
+
// Google ads
|
|
335
|
+
"ins.adsbygoogle",
|
|
780
336
|
".google-ad",
|
|
781
337
|
".adsense",
|
|
338
|
+
// Generic ad containers
|
|
782
339
|
"[data-ad]",
|
|
783
340
|
"[data-ads]",
|
|
784
|
-
"
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
"
|
|
788
|
-
"
|
|
789
|
-
"
|
|
341
|
+
"[data-ad-slot]",
|
|
342
|
+
"[data-ad-client]",
|
|
343
|
+
// Common ad class patterns
|
|
344
|
+
".ad-container",
|
|
345
|
+
".ad-wrapper",
|
|
346
|
+
".advertisement",
|
|
347
|
+
".sponsored-content",
|
|
348
|
+
// Tracking pixels
|
|
349
|
+
"img[width='1'][height='1']",
|
|
350
|
+
"img[src*='pixel']",
|
|
351
|
+
"img[src*='tracking']",
|
|
352
|
+
"img[src*='analytics']"
|
|
790
353
|
];
|
|
791
|
-
function
|
|
792
|
-
const
|
|
793
|
-
const
|
|
794
|
-
|
|
354
|
+
/**
 * Computes the share of an element's text that sits inside `<a>` descendants.
 *
 * @param element - Node exposing `textContent` and `querySelectorAll`.
 * @returns Anchor text length divided by total trimmed text length, or 1 when
 *   the element has no text at all.
 */
function getLinkDensity(element) {
  const totalChars = (element.textContent || "").trim().length;
  // A text-less element is reported as fully link-dense.
  if (totalChars === 0) {
    return 1;
  }
  let anchorChars = 0;
  element.querySelectorAll("a").forEach((anchor) => {
    anchorChars += (anchor.textContent || "").trim().length;
  });
  return anchorChars / totalChars;
}
|
|
364
|
+
/**
 * Heuristic score estimating how likely an element is to hold the page's main
 * content. Higher is better; the value may be negative.
 *
 * Signals used:
 *  - text length (1 point per 100 trimmed characters, capped at 50)
 *  - paragraphs (+3 each), headings (+2 each), images (+1 each)
 *  - anchors (-0.5 each) and list items (-0.2 each)
 *  - link density (-30 above 0.5, -15 above 0.3)
 *  - class/id naming hints (+25 content-like, -25 boilerplate-like)
 *
 * @param element - Node exposing `textContent`, `querySelectorAll`,
 *   `className` and `id`.
 * @returns The numeric content score.
 */
function getContentScore(element) {
  const count = (selector) => element.querySelectorAll(selector).length;
  const textLength = (element.textContent || "").trim().length;

  let score = Math.min(textLength / 100, 50);
  score += count("p") * 3;
  score += count("h1, h2, h3, h4, h5, h6") * 2;
  score += count("img") * 1;
  score -= count("a") * 0.5;
  score -= count("li") * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) {
    score -= 30;
  } else if (linkDensity > 0.3) {
    score -= 15;
  }

  const classAndId = (element.className || "") + " " + (element.id || "");
  if (/article|content|post|body|main|entry/i.test(classAndId)) {
    score += 25;
  }
  // The ad hint is matched only as a delimited token ("ad", "ads", "advert…").
  // A bare /ad/ substring also hits unrelated names such as "thread",
  // "loading" or "download" and would wrongly penalize real content.
  const boilerplateName = /comment|sidebar|footer|nav|menu|header|widget/i;
  const adToken = /(?:^|[\s_-])(?:ads?|advert[a-z]*)(?=$|[\s_-])/i;
  if (boilerplateName.test(classAndId) || adToken.test(classAndId)) {
    score -= 25;
  }
  return score;
}
|
|
382
|
+
/**
 * Decides whether an element resembles a navigation block.
 *
 * An element counts as navigation when more than half of its text is link
 * text, or when it holds more than five list items and the number of anchors
 * exceeds 80% of the number of list items.
 *
 * @param element - Node exposing `querySelectorAll` (and whatever
 *   `getLinkDensity` reads).
 * @returns `true` when the element looks like navigation, otherwise `false`.
 */
function looksLikeNavigation(element) {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }
  const itemCount = element.querySelectorAll("li").length;
  const anchorCount = element.querySelectorAll("a").length;
  return itemCount > 5 && anchorCount > itemCount * 0.8;
}
|
|
390
|
+
/**
 * Detaches every element matching any of the given CSS selectors.
 *
 * @param document - Document (or root) exposing `querySelectorAll`.
 * @param selectors - Iterable of CSS selector strings.
 */
function removeElements(document, selectors) {
  const detach = (node) => node.remove();
  for (const cssSelector of selectors) {
    try {
      const matches = document.querySelectorAll(cssSelector);
      matches.forEach(detach);
    } catch {
      // Best effort: a selector that throws (e.g. invalid syntax) is skipped
      // so the remaining selectors are still processed.
    }
  }
}
|
|
398
|
+
/**
 * Removes elements matching `selectorsToRemove`, except those that are
 * themselves protected or that contain a protected descendant.
 *
 * @param document - Document (or root) exposing `querySelectorAll`.
 * @param selectorsToRemove - CSS selectors whose matches should be removed.
 * @param protectedSelectors - Array of CSS selectors marking elements to keep.
 */
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
  // True when the predicate holds for at least one protected selector.
  // A selector for which the predicate throws is treated as "no match".
  const anyProtected = (predicate) =>
    protectedSelectors.some((protectedSelector) => {
      try {
        return predicate(protectedSelector);
      } catch {
        return false;
      }
    });

  const removeUnlessProtected = (node) => {
    // Keep the node if it is a protected element itself...
    if (anyProtected((sel) => node.matches(sel))) {
      return;
    }
    // ...or if removing it would take a protected descendant with it.
    if (anyProtected((sel) => node.querySelector(sel) !== null)) {
      return;
    }
    node.remove();
  };

  for (const removalSelector of selectorsToRemove) {
    try {
      document.querySelectorAll(removalSelector).forEach(removeUnlessProtected);
    } catch {
      // Best effort: skip selectors that cannot be processed.
    }
  }
}
|
|
424
|
+
/**
 * Locates the element most likely to hold the page's main content.
 *
 * Strategies are tried in order; the first hit wins:
 *  1. `<main>`, then `[role="main"]` (valid content, link density < 0.4)
 *  2. the `<article>` element, when the page has exactly one and it is valid
 *  3. a list of common content ids/classes (valid content, link density < 0.4)
 *  4. the best-scoring `div`/`section`/`article` with at least 200 characters
 *     of text, provided its score exceeds 20
 *
 * "Valid content" means at least 100 characters of trimmed text and not
 * navigation-like.
 *
 * @param document - Document exposing `querySelector` / `querySelectorAll`.
 * @returns The chosen element, or `null` when nothing qualifies.
 */
function findMainContent(document) {
  const trimmedLength = (node) => (node.textContent || "").trim().length;
  const isValidContent = (node) => {
    if (!node) {
      return false;
    }
    return trimmedLength(node) >= 100 && !looksLikeNavigation(node);
  };
  const isLowLinkContent = (node) => isValidContent(node) && getLinkDensity(node) < 0.4;

  // 1. Semantic landmarks.
  for (const landmark of ["main", '[role="main"]']) {
    const node = document.querySelector(landmark);
    if (isLowLinkContent(node)) {
      return node;
    }
  }

  // 2. A lone <article>.
  const articles = document.querySelectorAll("article");
  if (articles.length === 1 && isValidContent(articles[0])) {
    return articles[0];
  }

  // 3. Conventional content containers.
  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content"
  ];
  for (const selector of contentSelectors) {
    try {
      const node = document.querySelector(selector);
      if (isLowLinkContent(node)) {
        return node;
      }
    } catch {
      // Ignore a selector that fails and move on to the next one.
    }
  }

  // 4. Score generic containers and keep the best one. Only a strictly higher
  //    score replaces the current best, so the earliest element wins ties.
  let best = null;
  document.querySelectorAll("div, section, article").forEach((node) => {
    if (trimmedLength(node) < 200) {
      return;
    }
    const score = getContentScore(node);
    if (score > 0 && (best === null || score > best.score)) {
      best = { el: node, score };
    }
  });
  if (best !== null && best.score > 20) {
    return best.el;
  }
  return null;
}
|
|
484
|
+
function cleanHtml(html, baseUrl, options = {}) {
|
|
485
|
+
const {
|
|
486
|
+
removeAds = true,
|
|
487
|
+
removeBase64Images = true,
|
|
488
|
+
onlyMainContent = true,
|
|
489
|
+
includeTags,
|
|
490
|
+
excludeTags
|
|
491
|
+
} = options;
|
|
492
|
+
const { document } = parseHTML(html);
|
|
493
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
494
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
800
495
|
if (removeAds) {
|
|
801
|
-
|
|
496
|
+
removeElements(document, AD_SELECTORS);
|
|
497
|
+
}
|
|
498
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
499
|
+
removeElements(document, excludeTags);
|
|
500
|
+
}
|
|
501
|
+
if (onlyMainContent) {
|
|
502
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
503
|
+
const mainContent = findMainContent(document);
|
|
504
|
+
if (mainContent) {
|
|
505
|
+
const body = document.body;
|
|
506
|
+
if (body) {
|
|
507
|
+
const clone = mainContent.cloneNode(true);
|
|
508
|
+
body.innerHTML = "";
|
|
509
|
+
body.appendChild(clone);
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
if (includeTags && includeTags.length > 0) {
|
|
514
|
+
const matchedElements = [];
|
|
515
|
+
for (const selector of includeTags) {
|
|
802
516
|
try {
|
|
803
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
517
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
518
|
+
matchedElements.push(el.cloneNode(true));
|
|
519
|
+
});
|
|
804
520
|
} catch {
|
|
805
521
|
}
|
|
806
522
|
}
|
|
523
|
+
if (matchedElements.length > 0) {
|
|
524
|
+
const body = document.body;
|
|
525
|
+
if (body) {
|
|
526
|
+
body.innerHTML = "";
|
|
527
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
528
|
+
}
|
|
529
|
+
}
|
|
807
530
|
}
|
|
808
531
|
if (removeBase64Images) {
|
|
809
532
|
removeBase64ImagesFromDocument(document);
|
|
@@ -828,7 +551,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
828
551
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
829
552
|
const style = el.getAttribute("style");
|
|
830
553
|
if (style) {
|
|
831
|
-
const cleanedStyle = style.replace(
|
|
554
|
+
const cleanedStyle = style.replace(
|
|
555
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
556
|
+
""
|
|
557
|
+
);
|
|
832
558
|
if (cleanedStyle.trim()) {
|
|
833
559
|
el.setAttribute("style", cleanedStyle);
|
|
834
560
|
} else {
|
|
@@ -865,7 +591,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
865
591
|
}
|
|
866
592
|
|
|
867
593
|
// src/utils/metadata-extractor.ts
|
|
868
|
-
import { parseHTML as
|
|
594
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
869
595
|
|
|
870
596
|
// src/utils/url-helpers.ts
|
|
871
597
|
import { URL as URL2 } from "url";
|
|
@@ -938,8 +664,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
938
664
|
function getUrlKey(url) {
|
|
939
665
|
try {
|
|
940
666
|
const parsedUrl = new URL2(url);
|
|
667
|
+
parsedUrl.hash = "";
|
|
941
668
|
parsedUrl.search = "";
|
|
942
|
-
|
|
669
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
670
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
671
|
+
}
|
|
672
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
673
|
+
parsedUrl.port = "";
|
|
674
|
+
}
|
|
675
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
676
|
+
for (const indexFile of indexFiles) {
|
|
677
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
678
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
679
|
+
break;
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
683
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
684
|
+
normalized = normalized.slice(0, -1);
|
|
685
|
+
}
|
|
686
|
+
return normalized;
|
|
943
687
|
} catch {
|
|
944
688
|
return url.toLowerCase();
|
|
945
689
|
}
|
|
@@ -1006,7 +750,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1006
750
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1007
751
|
}
|
|
1008
752
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1009
|
-
const { document } =
|
|
753
|
+
const { document } = parseHTML2(html);
|
|
1010
754
|
const metadata = {
|
|
1011
755
|
title: null,
|
|
1012
756
|
description: null,
|
|
@@ -1161,11 +905,20 @@ function extractTwitterCard(document) {
|
|
|
1161
905
|
|
|
1162
906
|
// src/utils/logger.ts
|
|
1163
907
|
import pino from "pino";
|
|
908
|
+
/**
 * Reports whether the optional `pino-pretty` package can be resolved.
 *
 * @returns `true` when `__require.resolve("pino-pretty")` succeeds, `false`
 *   when it throws.
 */
function hasPinoPretty() {
  let available = true;
  try {
    __require.resolve("pino-pretty");
  } catch {
    available = false;
  }
  return available;
}
|
|
1164
916
|
function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
|
|
917
|
+
const usePretty = process.env.NODE_ENV !== "production" && hasPinoPretty();
|
|
1165
918
|
return pino({
|
|
1166
919
|
name,
|
|
1167
920
|
level,
|
|
1168
|
-
transport:
|
|
921
|
+
transport: usePretty ? {
|
|
1169
922
|
target: "pino-pretty",
|
|
1170
923
|
options: {
|
|
1171
924
|
colorize: true,
|
|
@@ -1279,13 +1032,15 @@ function isUrlAllowed(url, rules) {
|
|
|
1279
1032
|
var DEFAULT_OPTIONS = {
|
|
1280
1033
|
urls: [],
|
|
1281
1034
|
formats: ["markdown"],
|
|
1282
|
-
includeMetadata: true,
|
|
1283
1035
|
timeoutMs: 3e4,
|
|
1284
1036
|
includePatterns: [],
|
|
1285
1037
|
excludePatterns: [],
|
|
1286
1038
|
// Content cleaning defaults
|
|
1287
1039
|
removeAds: true,
|
|
1288
1040
|
removeBase64Images: true,
|
|
1041
|
+
onlyMainContent: true,
|
|
1042
|
+
includeTags: [],
|
|
1043
|
+
excludeTags: [],
|
|
1289
1044
|
skipTLSVerification: true,
|
|
1290
1045
|
// Batch defaults
|
|
1291
1046
|
batchConcurrency: 1,
|
|
@@ -1442,14 +1197,9 @@ var Scraper = class {
|
|
|
1442
1197
|
} catch {
|
|
1443
1198
|
}
|
|
1444
1199
|
await hero.waitForPaintingStable();
|
|
1445
|
-
let hadChallenge = false;
|
|
1446
|
-
let challengeType = "none";
|
|
1447
|
-
let waitTimeMs = 0;
|
|
1448
1200
|
const initialUrl = await hero.url;
|
|
1449
1201
|
const detection = await detectChallenge(hero);
|
|
1450
1202
|
if (detection.isChallenge) {
|
|
1451
|
-
hadChallenge = true;
|
|
1452
|
-
challengeType = detection.type;
|
|
1453
1203
|
if (this.options.verbose) {
|
|
1454
1204
|
this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
|
|
1455
1205
|
}
|
|
@@ -1459,12 +1209,11 @@ var Scraper = class {
|
|
|
1459
1209
|
verbose: this.options.verbose,
|
|
1460
1210
|
initialUrl
|
|
1461
1211
|
});
|
|
1462
|
-
waitTimeMs = result2.waitedMs;
|
|
1463
1212
|
if (!result2.resolved) {
|
|
1464
1213
|
throw new Error(`Challenge not resolved: ${detection.type}`);
|
|
1465
1214
|
}
|
|
1466
1215
|
if (this.options.verbose) {
|
|
1467
|
-
this.logger.info(`Challenge resolved via ${result2.method} in ${
|
|
1216
|
+
this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
|
|
1468
1217
|
}
|
|
1469
1218
|
}
|
|
1470
1219
|
await this.waitForFinalPage(hero, url, this.options.verbose);
|
|
@@ -1477,45 +1226,18 @@ var Scraper = class {
|
|
|
1477
1226
|
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1478
1227
|
}
|
|
1479
1228
|
}
|
|
1480
|
-
const pageTitle = await hero.document.title;
|
|
1481
1229
|
const html = await hero.document.documentElement.outerHTML;
|
|
1482
1230
|
const cleanedHtml = cleanContent(html, url, {
|
|
1483
1231
|
removeAds: this.options.removeAds,
|
|
1484
|
-
removeBase64Images: this.options.removeBase64Images
|
|
1232
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1233
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1234
|
+
includeTags: this.options.includeTags,
|
|
1235
|
+
excludeTags: this.options.excludeTags
|
|
1485
1236
|
});
|
|
1486
1237
|
const websiteMetadata = extractMetadata(cleanedHtml, url);
|
|
1487
1238
|
const duration = Date.now() - startTime;
|
|
1488
|
-
const
|
|
1489
|
-
const
|
|
1490
|
-
url,
|
|
1491
|
-
title: pageTitle,
|
|
1492
|
-
markdown: "",
|
|
1493
|
-
// Will be set by formatter
|
|
1494
|
-
html: cleanedHtml,
|
|
1495
|
-
fetchedAt: scrapedAt,
|
|
1496
|
-
depth: 0,
|
|
1497
|
-
hadChallenge,
|
|
1498
|
-
challengeType,
|
|
1499
|
-
waitTimeMs
|
|
1500
|
-
};
|
|
1501
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1502
|
-
[page],
|
|
1503
|
-
url,
|
|
1504
|
-
scrapedAt,
|
|
1505
|
-
duration,
|
|
1506
|
-
websiteMetadata,
|
|
1507
|
-
this.options.includeMetadata
|
|
1508
|
-
) : void 0;
|
|
1509
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1510
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1511
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1512
|
-
[page],
|
|
1513
|
-
url,
|
|
1514
|
-
scrapedAt,
|
|
1515
|
-
duration,
|
|
1516
|
-
websiteMetadata,
|
|
1517
|
-
this.options.includeMetadata
|
|
1518
|
-
) : void 0;
|
|
1239
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1240
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1519
1241
|
if (this.options.onProgress) {
|
|
1520
1242
|
this.options.onProgress({
|
|
1521
1243
|
completed: index + 1,
|
|
@@ -1547,8 +1269,6 @@ var Scraper = class {
|
|
|
1547
1269
|
const result = {
|
|
1548
1270
|
markdown,
|
|
1549
1271
|
html: htmlOutput,
|
|
1550
|
-
json,
|
|
1551
|
-
text,
|
|
1552
1272
|
metadata: {
|
|
1553
1273
|
baseUrl: url,
|
|
1554
1274
|
totalPages: 1,
|
|
@@ -1603,7 +1323,7 @@ async function scrape(options) {
|
|
|
1603
1323
|
}
|
|
1604
1324
|
|
|
1605
1325
|
// src/crawler.ts
|
|
1606
|
-
import { parseHTML as
|
|
1326
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1607
1327
|
|
|
1608
1328
|
// src/utils/rate-limiter.ts
|
|
1609
1329
|
import pLimit2 from "p-limit";
|
|
@@ -1752,12 +1472,26 @@ var Crawler = class {
|
|
|
1752
1472
|
*/
|
|
1753
1473
|
extractLinks(html, baseUrl, depth) {
|
|
1754
1474
|
const links = [];
|
|
1755
|
-
const { document } =
|
|
1475
|
+
const { document } = parseHTML3(html);
|
|
1756
1476
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1757
|
-
const
|
|
1477
|
+
const rawHref = anchor.getAttribute("href");
|
|
1478
|
+
if (!rawHref) return;
|
|
1479
|
+
const href = rawHref.trim();
|
|
1758
1480
|
if (!href) return;
|
|
1759
|
-
|
|
1481
|
+
if (href.startsWith("#")) return;
|
|
1482
|
+
const lowerHref = href.toLowerCase();
|
|
1483
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
1484
|
+
return;
|
|
1485
|
+
}
|
|
1486
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
1760
1487
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
1488
|
+
try {
|
|
1489
|
+
const parsed = new URL(resolved);
|
|
1490
|
+
parsed.hash = "";
|
|
1491
|
+
resolved = parsed.toString();
|
|
1492
|
+
} catch {
|
|
1493
|
+
return;
|
|
1494
|
+
}
|
|
1761
1495
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
1762
1496
|
if (!isContentUrl(resolved)) return;
|
|
1763
1497
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -2868,9 +2602,9 @@ program.command("status").description("Check daemon status").option("-p, --port
|
|
|
2868
2602
|
});
|
|
2869
2603
|
program.command("scrape <urls...>").description("Scrape one or more URLs").option(
|
|
2870
2604
|
"-f, --format <formats>",
|
|
2871
|
-
"
|
|
2605
|
+
"Content formats to include (comma-separated: markdown,html)",
|
|
2872
2606
|
"markdown"
|
|
2873
|
-
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--
|
|
2607
|
+
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
|
|
2874
2608
|
const port = parseInt(options.port, 10);
|
|
2875
2609
|
const useStandalone = options.standalone || false;
|
|
2876
2610
|
let useDaemon = false;
|
|
@@ -2887,7 +2621,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2887
2621
|
}) : null;
|
|
2888
2622
|
try {
|
|
2889
2623
|
const formats = options.format.split(",").map((f) => f.trim());
|
|
2890
|
-
const validFormats = ["markdown", "html"
|
|
2624
|
+
const validFormats = ["markdown", "html"];
|
|
2891
2625
|
for (const format of formats) {
|
|
2892
2626
|
if (!validFormats.includes(format)) {
|
|
2893
2627
|
console.error(
|
|
@@ -2900,6 +2634,8 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2900
2634
|
console.error(`Scraping ${urls.length} URL(s)...`);
|
|
2901
2635
|
console.error(`Formats: ${formats.join(", ")}`);
|
|
2902
2636
|
}
|
|
2637
|
+
const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
|
|
2638
|
+
const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
|
|
2903
2639
|
const scrapeOptions = {
|
|
2904
2640
|
urls,
|
|
2905
2641
|
formats,
|
|
@@ -2908,33 +2644,26 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2908
2644
|
batchTimeoutMs: parseInt(options.batchTimeout, 10),
|
|
2909
2645
|
proxy: options.proxy ? { url: options.proxy } : void 0,
|
|
2910
2646
|
userAgent: options.userAgent,
|
|
2911
|
-
includeMetadata: options.metadata !== false,
|
|
2912
2647
|
verbose: options.verbose || false,
|
|
2913
2648
|
showChrome: options.showChrome || false,
|
|
2649
|
+
// Content cleaning options
|
|
2650
|
+
onlyMainContent: options.mainContent !== false,
|
|
2651
|
+
// --no-main-content sets this to false
|
|
2652
|
+
includeTags,
|
|
2653
|
+
excludeTags,
|
|
2914
2654
|
onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
|
|
2915
2655
|
console.error(`[${completed}/${total}] ${currentUrl}`);
|
|
2916
2656
|
} : void 0
|
|
2917
2657
|
};
|
|
2918
2658
|
const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
|
|
2919
|
-
|
|
2920
|
-
for (const site of result.data) {
|
|
2921
|
-
if (formats.includes("markdown") && site.markdown) {
|
|
2922
|
-
output += site.markdown + "\n\n";
|
|
2923
|
-
} else if (formats.includes("text") && site.text) {
|
|
2924
|
-
output += site.text + "\n\n";
|
|
2925
|
-
} else if (formats.includes("html") && site.html) {
|
|
2926
|
-
output += site.html + "\n\n";
|
|
2927
|
-
} else if (formats.includes("json") && site.json) {
|
|
2928
|
-
output += site.json + "\n\n";
|
|
2929
|
-
}
|
|
2930
|
-
}
|
|
2659
|
+
const output = JSON.stringify(result, null, 2);
|
|
2931
2660
|
if (options.output) {
|
|
2932
|
-
writeFileSync(options.output, output
|
|
2661
|
+
writeFileSync(options.output, output);
|
|
2933
2662
|
if (options.verbose) {
|
|
2934
2663
|
console.error(`Output written to ${options.output}`);
|
|
2935
2664
|
}
|
|
2936
2665
|
} else {
|
|
2937
|
-
console.log(output
|
|
2666
|
+
console.log(output);
|
|
2938
2667
|
}
|
|
2939
2668
|
if (options.verbose) {
|
|
2940
2669
|
console.error(`
|
|
@@ -2957,7 +2686,7 @@ Summary:`);
|
|
|
2957
2686
|
}
|
|
2958
2687
|
}
|
|
2959
2688
|
});
|
|
2960
|
-
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "
|
|
2689
|
+
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Content formats when scraping (comma-separated: markdown,html)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
|
|
2961
2690
|
const port = parseInt(options.port, 10);
|
|
2962
2691
|
const useStandalone = options.standalone || false;
|
|
2963
2692
|
let useDaemon = false;
|
|
@@ -2993,38 +2722,20 @@ program.command("crawl <url>").description("Crawl a website to discover and opti
|
|
|
2993
2722
|
verbose: options.verbose || false,
|
|
2994
2723
|
showChrome: options.showChrome || false
|
|
2995
2724
|
};
|
|
2996
|
-
const
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
} else if (formats.includes("text") && site.text) {
|
|
3004
|
-
output += site.text + "\n\n";
|
|
3005
|
-
} else if (formats.includes("html") && site.html) {
|
|
3006
|
-
output += site.html + "\n\n";
|
|
3007
|
-
} else if (formats.includes("json") && site.json) {
|
|
3008
|
-
output += site.json + "\n\n";
|
|
3009
|
-
}
|
|
3010
|
-
}
|
|
3011
|
-
} else {
|
|
3012
|
-
output = JSON.stringify(
|
|
3013
|
-
{
|
|
3014
|
-
urls: result.urls,
|
|
3015
|
-
metadata: result.metadata
|
|
3016
|
-
},
|
|
3017
|
-
null,
|
|
3018
|
-
2
|
|
3019
|
-
);
|
|
3020
|
-
}
|
|
2725
|
+
const formats = options.format.split(",").map((f) => f.trim());
|
|
2726
|
+
const crawlOptionsWithFormats = {
|
|
2727
|
+
...crawlOptions,
|
|
2728
|
+
formats
|
|
2729
|
+
};
|
|
2730
|
+
const result = useDaemon ? await daemonClient.crawl(crawlOptionsWithFormats) : await standaloneClient.crawl(crawlOptionsWithFormats);
|
|
2731
|
+
const output = JSON.stringify(result, null, 2);
|
|
3021
2732
|
if (options.output) {
|
|
3022
|
-
writeFileSync(options.output, output
|
|
2733
|
+
writeFileSync(options.output, output);
|
|
3023
2734
|
if (options.verbose) {
|
|
3024
2735
|
console.error(`Output written to ${options.output}`);
|
|
3025
2736
|
}
|
|
3026
2737
|
} else {
|
|
3027
|
-
console.log(output
|
|
2738
|
+
console.log(output);
|
|
3028
2739
|
}
|
|
3029
2740
|
if (options.verbose) {
|
|
3030
2741
|
console.error(`
|