@vakra-dev/reader 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -18,27 +18,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
18
18
  import pLimit from "p-limit";
19
19
 
20
20
  // src/cloudflare/detector.ts
21
- var CHALLENGE_DOM_SELECTORS = [
21
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
22
22
  "#challenge-running",
23
23
  "#challenge-stage",
24
24
  "#challenge-form",
25
- ".cf-browser-verification"
25
+ ".cf-browser-verification",
26
+ "#cf-wrapper",
27
+ "#cf-hcaptcha-container",
28
+ "#turnstile-wrapper"
26
29
  ];
27
- var CHALLENGE_TEXT_PATTERNS = [
28
- "verifying you are human",
30
+ var CLOUDFLARE_TEXT_PATTERNS = [
29
31
  "checking if the site connection is secure",
30
- "this process is automatic. your browser will redirect"
32
+ "this process is automatic. your browser will redirect",
33
+ "ray id:",
34
+ "performance & security by cloudflare"
31
35
  ];
32
- var BLOCKED_SIGNALS = [
33
- "you have been blocked",
34
- "access to this page has been denied",
36
+ var CLOUDFLARE_INFRA_PATTERNS = [
37
+ "/cdn-cgi/",
38
+ "cloudflare",
39
+ "__cf_bm",
40
+ "cf-ray"
41
+ ];
42
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
35
43
  "sorry, you have been blocked",
36
- "access denied",
37
- "403 forbidden"
44
+ "ray id:"
38
45
  ];
39
46
  async function detectChallenge(hero) {
40
47
  const signals = [];
41
48
  let type = "none";
49
+ let hasCloudflareInfra = false;
50
+ let hasChallengeIndicator = false;
42
51
  try {
43
52
  if (!hero.document) {
44
53
  return {
@@ -50,30 +59,51 @@ async function detectChallenge(hero) {
50
59
  }
51
60
  const html = await hero.document.documentElement.outerHTML;
52
61
  const htmlLower = html.toLowerCase();
53
- for (const selector of CHALLENGE_DOM_SELECTORS) {
54
- if (htmlLower.includes(selector.toLowerCase())) {
55
- signals.push(`Challenge element: ${selector}`);
56
- type = "js_challenge";
62
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
63
+ if (htmlLower.includes(pattern)) {
64
+ hasCloudflareInfra = true;
65
+ signals.push(`Cloudflare infra: "${pattern}"`);
66
+ break;
67
+ }
68
+ }
69
+ if (!hasCloudflareInfra) {
70
+ return {
71
+ isChallenge: false,
72
+ type: "none",
73
+ confidence: 0,
74
+ signals: ["No Cloudflare infrastructure detected"]
75
+ };
76
+ }
77
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
78
+ try {
79
+ const element = await hero.document.querySelector(selector);
80
+ if (element) {
81
+ hasChallengeIndicator = true;
82
+ signals.push(`Challenge element: ${selector}`);
83
+ type = "js_challenge";
84
+ }
85
+ } catch {
57
86
  }
58
87
  }
59
- for (const pattern of CHALLENGE_TEXT_PATTERNS) {
88
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
60
89
  if (htmlLower.includes(pattern)) {
90
+ hasChallengeIndicator = true;
61
91
  signals.push(`Challenge text: "${pattern}"`);
62
92
  type = type === "none" ? "js_challenge" : type;
63
93
  }
64
94
  }
65
95
  if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
96
+ hasChallengeIndicator = true;
66
97
  signals.push('Challenge text: "waiting for...to respond"');
67
98
  type = type === "none" ? "js_challenge" : type;
68
99
  }
69
- for (const pattern of BLOCKED_SIGNALS) {
70
- if (htmlLower.includes(pattern)) {
71
- signals.push(`Blocked: "${pattern}"`);
72
- type = "blocked";
73
- break;
74
- }
100
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
101
+ if (hasBlocked) {
102
+ hasChallengeIndicator = true;
103
+ signals.push("Cloudflare block page detected");
104
+ type = "blocked";
75
105
  }
76
- const isChallenge = signals.length > 0;
106
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
77
107
  const confidence = isChallenge ? 100 : 0;
78
108
  return {
79
109
  isChallenge,
@@ -156,84 +186,6 @@ var turndownService = new TurndownService({
156
186
  linkStyle: "inlined",
157
187
  linkReferenceStyle: "full"
158
188
  });
159
- function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
160
- const sections = [];
161
- if (includeMetadata) {
162
- sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
163
- }
164
- if (pages.length > 1) {
165
- sections.push(createMarkdownTOC(pages));
166
- }
167
- sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
168
- return sections.join("\n\n");
169
- }
170
- function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
171
- const title = website.title || extractDomainFromUrl(baseUrl);
172
- const description = website.description || "";
173
- let header = `# Website Scrape: ${title}
174
-
175
- `;
176
- header += `**Base URL:** ${baseUrl}
177
- `;
178
- header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
179
- `;
180
- header += `**Duration:** ${duration}ms
181
- `;
182
- header += `**Total pages:** ${totalPages}
183
- `;
184
- if (description) {
185
- header += `**Description:** ${description}
186
- `;
187
- }
188
- if (website.author) {
189
- header += `**Author:** ${website.author}
190
- `;
191
- }
192
- if (website.language) {
193
- header += `**Language:** ${website.language}
194
- `;
195
- }
196
- return header;
197
- }
198
- function createMarkdownTOC(pages) {
199
- let toc = "## Table of Contents\n\n";
200
- pages.forEach((page, index) => {
201
- const depth = " ".repeat(page.depth);
202
- const pageNumber = index + 1;
203
- const title = page.title || `Page ${pageNumber}`;
204
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
205
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
206
- toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
207
- `;
208
- });
209
- return toc;
210
- }
211
- function createMarkdownPage(page, pageNumber) {
212
- const title = page.title || `Page ${pageNumber}`;
213
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
214
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
215
- let pageContent = `---
216
-
217
- `;
218
- pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
219
-
220
- `;
221
- pageContent += `**URL:** ${page.url}
222
- `;
223
- pageContent += `**Title:** ${page.title}
224
- `;
225
- pageContent += `**Depth:** ${page.depth}
226
- `;
227
- pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
228
-
229
- `;
230
- pageContent += `---
231
-
232
- `;
233
- const markdown = htmlToMarkdown(page.html);
234
- pageContent += markdown;
235
- return pageContent;
236
- }
237
189
  function htmlToMarkdown(html) {
238
190
  try {
239
191
  return turndownService.turndown(html);
@@ -242,574 +194,339 @@ function htmlToMarkdown(html) {
242
194
  return html.replace(/<[^>]*>/g, "").trim();
243
195
  }
244
196
  }
245
- function extractDomainFromUrl(url) {
246
- try {
247
- return new URL(url).hostname;
248
- } catch {
249
- return "Unknown";
250
- }
251
- }
252
-
253
- // src/formatters/html.ts
254
- function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
255
- const html = `<!DOCTYPE html>
256
- <html lang="${website.language || "en"}">
257
- <head>
258
- <meta charset="${website.charset || "UTF-8"}">
259
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
260
- <title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
261
- ${generateMetaTags(website)}
262
- <style>
263
- ${generateCSS()}
264
- </style>
265
- </head>
266
- <body>
267
- <header class="header">
268
- <h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
269
- <div class="meta-info">
270
- <p><strong>Base URL:</strong> <a href="${escapeHtml(
271
- baseUrl
272
- )}" target="_blank">${escapeHtml(baseUrl)}</a></p>
273
- <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
274
- <p><strong>Duration:</strong> ${duration}ms</p>
275
- <p><strong>Total pages:</strong> ${pages.length}</p>
276
- ${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
277
- ${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
278
- ${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
279
- </div>
280
- </header>
281
-
282
- ${pages.length > 1 ? generateTOC(pages) : ""}
283
-
284
- <main class="content">
285
- ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
286
- </main>
287
-
288
- <footer class="footer">
289
- <p>Generated by Reader JS/TS SDK</p>
290
- </footer>
291
-
292
- <script>
293
- ${generateJavaScript()}
294
- </script>
295
- </body>
296
- </html>`;
297
- return html;
298
- }
299
- function generateMetaTags(website) {
300
- const tags = [];
301
- if (website.description) {
302
- tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
303
- }
304
- if (website.author) {
305
- tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
306
- }
307
- if (website.keywords) {
308
- tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
309
- }
310
- if (website.robots) {
311
- tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
312
- }
313
- if (website.themeColor) {
314
- tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
315
- }
316
- if (website.favicon) {
317
- tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
318
- }
319
- if (website.canonical) {
320
- tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
321
- }
322
- if (website.openGraph) {
323
- const og = website.openGraph;
324
- if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
325
- if (og.description)
326
- tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
327
- if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
328
- if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
329
- if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
330
- if (og.siteName)
331
- tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
332
- if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
333
- }
334
- if (website.twitter) {
335
- const twitter = website.twitter;
336
- if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
337
- if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
338
- if (twitter.creator)
339
- tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
340
- if (twitter.title)
341
- tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
342
- if (twitter.description)
343
- tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
344
- if (twitter.image)
345
- tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
346
- }
347
- return tags.join("\n ");
348
- }
349
- function generateCSS() {
350
- return `
351
- * {
352
- margin: 0;
353
- padding: 0;
354
- box-sizing: border-box;
355
- }
356
-
357
- body {
358
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
359
- line-height: 1.6;
360
- color: #333;
361
- background-color: #f8f9fa;
362
- }
363
-
364
- .header {
365
- background: white;
366
- padding: 2rem;
367
- border-bottom: 1px solid #e9ecef;
368
- margin-bottom: 2rem;
369
- }
370
-
371
- .header h1 {
372
- color: #2c3e50;
373
- margin-bottom: 1rem;
374
- font-size: 2rem;
375
- }
376
-
377
- .meta-info {
378
- display: grid;
379
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
380
- gap: 0.5rem;
381
- }
382
-
383
- .meta-info p {
384
- margin: 0.25rem 0;
385
- font-size: 0.9rem;
386
- color: #6c757d;
387
- }
388
-
389
- .toc {
390
- background: white;
391
- padding: 1.5rem;
392
- margin: 2rem 0;
393
- border-radius: 8px;
394
- border: 1px solid #e9ecef;
395
- }
396
-
397
- .toc h2 {
398
- color: #2c3e50;
399
- margin-bottom: 1rem;
400
- font-size: 1.25rem;
401
- }
402
-
403
- .toc ul {
404
- list-style: none;
405
- }
406
-
407
- .toc li {
408
- margin: 0.5rem 0;
409
- }
410
-
411
- .toc a {
412
- color: #007bff;
413
- text-decoration: none;
414
- transition: color 0.2s;
415
- }
416
-
417
- .toc a:hover {
418
- color: #0056b3;
419
- text-decoration: underline;
420
- }
421
-
422
- .content {
423
- max-width: 800px;
424
- margin: 0 auto;
425
- padding: 0 1rem;
426
- }
427
-
428
- .page {
429
- background: white;
430
- margin: 2rem 0;
431
- padding: 2rem;
432
- border-radius: 8px;
433
- border: 1px solid #e9ecef;
434
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
435
- }
436
-
437
- .page-header {
438
- border-bottom: 2px solid #e9ecef;
439
- padding-bottom: 1rem;
440
- margin-bottom: 2rem;
441
- }
442
-
443
- .page-header h2 {
444
- color: #2c3e50;
445
- margin-bottom: 0.5rem;
446
- font-size: 1.5rem;
447
- }
448
-
449
- .page-meta {
450
- display: flex;
451
- flex-wrap: wrap;
452
- gap: 1rem;
453
- font-size: 0.9rem;
454
- color: #6c757d;
455
- }
456
-
457
- .page-content {
458
- line-height: 1.8;
459
- }
460
-
461
- .page-content h1, .page-content h2, .page-content h3,
462
- .page-content h4, .page-content h5, .page-content h6 {
463
- color: #2c3e50;
464
- margin: 1.5rem 0 0.5rem 0;
465
- }
466
-
467
- .page-content p {
468
- margin: 1rem 0;
469
- }
470
-
471
- .page-content a {
472
- color: #007bff;
473
- text-decoration: none;
474
- }
475
-
476
- .page-content a:hover {
477
- text-decoration: underline;
478
- }
479
-
480
- .page-content code {
481
- background: #f8f9fa;
482
- padding: 0.2rem 0.4rem;
483
- border-radius: 4px;
484
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
485
- font-size: 0.9em;
486
- }
487
-
488
- .page-content pre {
489
- background: #f8f9fa;
490
- padding: 1rem;
491
- border-radius: 4px;
492
- overflow-x: auto;
493
- margin: 1rem 0;
494
- }
495
-
496
- .page-content blockquote {
497
- border-left: 4px solid #007bff;
498
- padding-left: 1rem;
499
- margin: 1rem 0;
500
- color: #6c757d;
501
- }
502
-
503
- .footer {
504
- text-align: center;
505
- padding: 2rem;
506
- margin-top: 3rem;
507
- border-top: 1px solid #e9ecef;
508
- color: #6c757d;
509
- font-size: 0.9rem;
510
- }
511
-
512
- @media (max-width: 768px) {
513
- .header {
514
- padding: 1rem;
515
- }
516
-
517
- .header h1 {
518
- font-size: 1.5rem;
519
- }
520
-
521
- .page {
522
- padding: 1rem;
523
- }
524
-
525
- .page-meta {
526
- flex-direction: column;
527
- gap: 0.5rem;
528
- }
529
- }
530
- `.trim();
531
- }
532
- function generateTOC(pages) {
533
- const tocItems = pages.map((page, index) => {
534
- const pageNumber = index + 1;
535
- const title = page.title || `Page ${pageNumber}`;
536
- const id = `page-${pageNumber}`;
537
- return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
538
- }).join("\n");
539
- return `
540
- <nav class="toc">
541
- <h2>Table of Contents</h2>
542
- <ul>
543
- ${tocItems}
544
- </ul>
545
- </nav>`;
546
- }
547
- function generatePageHTML(page, pageNumber) {
548
- const id = `page-${pageNumber}`;
549
- const title = page.title || `Page ${pageNumber}`;
550
- return `
551
- <article class="page" id="${id}">
552
- <div class="page-header">
553
- <h2>${pageNumber}. ${escapeHtml(title)}</h2>
554
- <div class="page-meta">
555
- <span><strong>URL:</strong> <a href="${escapeHtml(
556
- page.url
557
- )}" target="_blank">${escapeHtml(page.url)}</a></span>
558
- <span><strong>Depth:</strong> ${page.depth}</span>
559
- <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
560
- </div>
561
- </div>
562
- <div class="page-content">
563
- ${page.html}
564
- </div>
565
- </article>`;
566
- }
567
- function generateJavaScript() {
568
- return `
569
- // Smooth scrolling for TOC links
570
- document.querySelectorAll('a[href^="#"]').forEach(anchor => {
571
- anchor.addEventListener('click', function (e) {
572
- e.preventDefault();
573
- const target = document.querySelector(this.getAttribute('href'));
574
- if (target) {
575
- target.scrollIntoView({
576
- behavior: 'smooth',
577
- block: 'start'
578
- });
579
- }
580
- });
581
- });
582
-
583
- // Highlight current section in TOC
584
- window.addEventListener('scroll', function() {
585
- const pages = document.querySelectorAll('.page');
586
- const tocLinks = document.querySelectorAll('.toc a');
587
-
588
- let currentPage = null;
589
- pages.forEach(page => {
590
- const rect = page.getBoundingClientRect();
591
- if (rect.top <= 100) {
592
- currentPage = page;
593
- }
594
- });
595
-
596
- tocLinks.forEach(link => {
597
- link.style.fontWeight = 'normal';
598
- const target = document.querySelector(link.getAttribute('href'));
599
- if (target === currentPage) {
600
- link.style.fontWeight = 'bold';
601
- }
602
- });
603
- });
604
- `;
605
- }
606
- function escapeHtml(text) {
607
- return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\//g, "&#x2F;");
608
- }
609
- function extractDomainFromUrl2(url) {
610
- try {
611
- return new URL(url).hostname;
612
- } catch {
613
- return "Unknown";
614
- }
615
- }
616
-
617
- // src/formatters/json.ts
618
- function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
619
- const jsonResult = {
620
- metadata: {
621
- baseUrl,
622
- totalPages: pages.length,
623
- scrapedAt,
624
- duration,
625
- website
626
- },
627
- pages: pages.map((page, index) => ({
628
- index: index + 1,
629
- url: page.url,
630
- title: page.title,
631
- markdown: page.markdown,
632
- html: page.html,
633
- fetchedAt: page.fetchedAt,
634
- depth: page.depth,
635
- wordCount: countWords(page.markdown),
636
- readingTime: estimateReadingTime(page.markdown)
637
- }))
638
- };
639
- return JSON.stringify(jsonResult, null, 2);
640
- }
641
- function countWords(markdown) {
642
- const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
643
- return plainText.split(/\s+/).filter((word) => word.length > 0).length;
644
- }
645
- function estimateReadingTime(markdown) {
646
- const wordCount = countWords(markdown);
647
- return Math.ceil(wordCount / 200);
648
- }
649
-
650
- // src/formatters/text.ts
651
- import { parseHTML } from "linkedom";
652
- function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
653
- const sections = [];
654
- if (includeMetadata) {
655
- sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
656
- }
657
- sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
658
- return sections.join("\n\n");
659
- }
660
- function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
661
- const title = website.title || extractDomainFromUrl3(baseUrl);
662
- const lines = [];
663
- lines.push(`=== ${title} ===`);
664
- lines.push("");
665
- lines.push(`URL: ${baseUrl}`);
666
- lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
667
- lines.push(`Duration: ${duration}ms`);
668
- lines.push(`Pages: ${totalPages}`);
669
- if (website.description) {
670
- lines.push(`Description: ${website.description}`);
671
- }
672
- if (website.author) {
673
- lines.push(`Author: ${website.author}`);
674
- }
675
- if (website.language) {
676
- lines.push(`Language: ${website.language}`);
677
- }
678
- return lines.join("\n");
679
- }
680
- function createTextPage(page, pageNumber, showSeparator) {
681
- const lines = [];
682
- if (showSeparator) {
683
- lines.push("\u2500".repeat(60));
684
- lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
685
- lines.push(`URL: ${page.url}`);
686
- lines.push("\u2500".repeat(60));
687
- }
688
- const plainText = htmlToPlainText(page.html);
689
- lines.push(plainText);
690
- return lines.join("\n");
691
- }
692
- function htmlToPlainText(html) {
693
- const { document } = parseHTML(html);
694
- const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
695
- elementsToRemove.forEach((tag) => {
696
- document.querySelectorAll(tag).forEach((el) => el.remove());
697
- });
698
- let text = document.body?.textContent || document.documentElement?.textContent || "";
699
- text = text.replace(/[ \t]+/g, " ");
700
- text = text.replace(/\n[ \t]+/g, "\n");
701
- text = text.replace(/[ \t]+\n/g, "\n");
702
- text = text.replace(/\n{3,}/g, "\n\n");
703
- text = text.trim();
704
- return text;
705
- }
706
- function extractDomainFromUrl3(url) {
707
- try {
708
- return new URL(url).hostname;
709
- } catch {
710
- return "Unknown";
711
- }
712
- }
713
197
 
714
198
  // src/utils/content-cleaner.ts
715
- import { parseHTML as parseHTML2 } from "linkedom";
199
+ import { parseHTML } from "linkedom";
716
200
  var ALWAYS_REMOVE_SELECTORS = [
717
- // Navigation and menus
718
- "nav",
719
- "header nav",
720
- "footer nav",
721
- ".nav",
722
- ".navigation",
723
- ".menu",
724
- ".navbar",
725
- ".sidebar",
726
- ".aside",
727
- // Header and footer elements
728
- "header",
729
- "footer",
730
- ".site-header",
731
- ".page-header",
732
- ".site-footer",
733
- ".page-footer",
734
- // Social media and sharing
735
- ".social",
736
- ".share",
737
- ".sharing",
738
- ".twitter",
739
- ".facebook",
740
- ".linkedin",
741
- ".instagram",
742
- // Comments and discussions
743
- ".comments",
744
- ".comment",
745
- ".discussion",
746
- ".disqus",
747
- // Forms and interactive elements
748
- "form",
749
- "input",
750
- "button:not([type='submit'])",
751
- "select",
752
- "textarea",
753
201
  // Scripts and styles
754
202
  "script",
755
203
  "style",
756
204
  "noscript",
205
+ "link[rel='stylesheet']",
757
206
  // Hidden elements
758
207
  "[hidden]",
208
+ "[aria-hidden='true']",
759
209
  "[style*='display: none']",
760
210
  "[style*='display:none']",
761
- // Common utility classes
762
- ".cookie",
763
- ".cookie-banner",
764
- ".popup",
211
+ "[style*='visibility: hidden']",
212
+ "[style*='visibility:hidden']",
213
+ // SVG icons and decorative elements
214
+ "svg[aria-hidden='true']",
215
+ "svg.icon",
216
+ "svg[class*='icon']",
217
+ // Template and metadata
218
+ "template",
219
+ "meta",
220
+ // Embeds that don't convert to text
221
+ "iframe",
222
+ "canvas",
223
+ "object",
224
+ "embed",
225
+ // Forms (usually not main content)
226
+ "form",
227
+ "input",
228
+ "select",
229
+ "textarea",
230
+ "button"
231
+ ];
232
+ var OVERLAY_SELECTORS = [
233
+ "[class*='modal']",
234
+ "[class*='popup']",
235
+ "[class*='overlay']",
236
+ "[class*='dialog']",
237
+ "[role='dialog']",
238
+ "[role='alertdialog']",
239
+ "[class*='cookie']",
240
+ "[class*='consent']",
241
+ "[class*='gdpr']",
242
+ "[class*='privacy-banner']",
243
+ "[class*='notification-bar']",
244
+ "[id*='cookie']",
245
+ "[id*='consent']",
246
+ "[id*='gdpr']",
247
+ // Fixed/sticky positioned elements
248
+ "[style*='position: fixed']",
249
+ "[style*='position:fixed']",
250
+ "[style*='position: sticky']",
251
+ "[style*='position:sticky']"
252
+ ];
253
+ var NAVIGATION_SELECTORS = [
254
+ // Semantic elements
255
+ "header",
256
+ "footer",
257
+ "nav",
258
+ "aside",
259
+ // Header variations
260
+ ".header",
261
+ ".top",
262
+ ".navbar",
263
+ "#header",
264
+ // Footer variations
265
+ ".footer",
266
+ ".bottom",
267
+ "#footer",
268
+ // Sidebars
269
+ ".sidebar",
270
+ ".side",
271
+ ".aside",
272
+ "#sidebar",
273
+ // Modals/popups (backup if not caught by OVERLAY_SELECTORS)
765
274
  ".modal",
275
+ ".popup",
276
+ "#modal",
766
277
  ".overlay",
767
- ".notification",
278
+ // Ads
279
+ ".ad",
280
+ ".ads",
281
+ ".advert",
282
+ "#ad",
283
+ // Language selectors
284
+ ".lang-selector",
285
+ ".language",
286
+ "#language-selector",
287
+ // Social
288
+ ".social",
289
+ ".social-media",
290
+ ".social-links",
291
+ "#social",
292
+ // Navigation/menus
293
+ ".menu",
294
+ ".navigation",
295
+ "#nav",
768
296
  // Breadcrumbs
769
- ".breadcrumb",
770
297
  ".breadcrumbs",
771
- ".breadcrumb-trail"
298
+ "#breadcrumbs",
299
+ // Share buttons
300
+ ".share",
301
+ "#share",
302
+ // Widgets
303
+ ".widget",
304
+ "#widget",
305
+ // Cookie notices (backup)
306
+ ".cookie",
307
+ "#cookie"
308
+ ];
309
+ var FORCE_INCLUDE_SELECTORS = [
310
+ // IDs
311
+ "#main",
312
+ "#content",
313
+ "#main-content",
314
+ "#article",
315
+ "#post",
316
+ "#page-content",
317
+ // Semantic elements
318
+ "main",
319
+ "article",
320
+ "[role='main']",
321
+ // Classes
322
+ ".main-content",
323
+ ".content",
324
+ ".post-content",
325
+ ".article-content",
326
+ ".entry-content",
327
+ ".page-content",
328
+ ".article-body",
329
+ ".post-body",
330
+ ".story-content",
331
+ ".blog-content"
772
332
  ];
773
333
  var AD_SELECTORS = [
774
- // Ads and promotions
775
- ".ad",
776
- ".ads",
777
- ".advertisement",
778
- ".promotion",
779
- ".sponsored",
780
- "[class*='ad-']",
781
- "[id*='ad-']",
782
- "[class*='advert']",
783
- "[id*='advert']",
784
- "[class*='banner']",
785
- "[id*='banner']",
334
+ // Google ads
335
+ "ins.adsbygoogle",
786
336
  ".google-ad",
787
337
  ".adsense",
338
+ // Generic ad containers
788
339
  "[data-ad]",
789
340
  "[data-ads]",
790
- "ins.adsbygoogle",
791
- // Tracking
792
- "[class*='tracking']",
793
- "[id*='tracking']",
794
- "[class*='analytics']",
795
- "[id*='analytics']"
341
+ "[data-ad-slot]",
342
+ "[data-ad-client]",
343
+ // Common ad class patterns
344
+ ".ad-container",
345
+ ".ad-wrapper",
346
+ ".advertisement",
347
+ ".sponsored-content",
348
+ // Tracking pixels
349
+ "img[width='1'][height='1']",
350
+ "img[src*='pixel']",
351
+ "img[src*='tracking']",
352
+ "img[src*='analytics']"
796
353
  ];
797
- function cleanHtml(html, baseUrl, options = {}) {
798
- const { removeAds = true, removeBase64Images = true } = options;
799
- const { document } = parseHTML2(html);
800
- for (const selector of ALWAYS_REMOVE_SELECTORS) {
354
+ function getLinkDensity(element) {
355
+ const text = element.textContent || "";
356
+ const textLength = text.trim().length;
357
+ if (textLength === 0) return 1;
358
+ let linkLength = 0;
359
+ element.querySelectorAll("a").forEach((link) => {
360
+ linkLength += (link.textContent || "").trim().length;
361
+ });
362
+ return linkLength / textLength;
363
+ }
364
+ function getContentScore(element) {
365
+ let score = 0;
366
+ const text = element.textContent || "";
367
+ const textLength = text.trim().length;
368
+ score += Math.min(textLength / 100, 50);
369
+ score += element.querySelectorAll("p").length * 3;
370
+ score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
371
+ score += element.querySelectorAll("img").length * 1;
372
+ score -= element.querySelectorAll("a").length * 0.5;
373
+ score -= element.querySelectorAll("li").length * 0.2;
374
+ const linkDensity = getLinkDensity(element);
375
+ if (linkDensity > 0.5) score -= 30;
376
+ else if (linkDensity > 0.3) score -= 15;
377
+ const classAndId = (element.className || "") + " " + (element.id || "");
378
+ if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
379
+ if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
380
+ return score;
381
+ }
382
+ function looksLikeNavigation(element) {
383
+ const linkDensity = getLinkDensity(element);
384
+ if (linkDensity > 0.5) return true;
385
+ const listItems = element.querySelectorAll("li");
386
+ const links = element.querySelectorAll("a");
387
+ if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
388
+ return false;
389
+ }
390
+ function removeElements(document, selectors) {
391
+ for (const selector of selectors) {
801
392
  try {
802
393
  document.querySelectorAll(selector).forEach((el) => el.remove());
803
394
  } catch {
804
395
  }
805
396
  }
397
+ }
398
+ function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
399
+ for (const selector of selectorsToRemove) {
400
+ try {
401
+ document.querySelectorAll(selector).forEach((element) => {
402
+ const isProtected = protectedSelectors.some((ps) => {
403
+ try {
404
+ return element.matches(ps);
405
+ } catch {
406
+ return false;
407
+ }
408
+ });
409
+ if (isProtected) return;
410
+ const containsProtected = protectedSelectors.some((ps) => {
411
+ try {
412
+ return element.querySelector(ps) !== null;
413
+ } catch {
414
+ return false;
415
+ }
416
+ });
417
+ if (containsProtected) return;
418
+ element.remove();
419
+ });
420
+ } catch {
421
+ }
422
+ }
423
+ }
424
+ function findMainContent(document) {
425
+ const isValidContent = (el) => {
426
+ if (!el) return false;
427
+ const text = el.textContent || "";
428
+ if (text.trim().length < 100) return false;
429
+ if (looksLikeNavigation(el)) return false;
430
+ return true;
431
+ };
432
+ const main = document.querySelector("main");
433
+ if (isValidContent(main) && getLinkDensity(main) < 0.4) {
434
+ return main;
435
+ }
436
+ const roleMain = document.querySelector('[role="main"]');
437
+ if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
438
+ return roleMain;
439
+ }
440
+ const articles = document.querySelectorAll("article");
441
+ if (articles.length === 1 && isValidContent(articles[0])) {
442
+ return articles[0];
443
+ }
444
+ const contentSelectors = [
445
+ "#content",
446
+ "#main-content",
447
+ "#main",
448
+ ".content",
449
+ ".main-content",
450
+ ".post-content",
451
+ ".article-content",
452
+ ".entry-content",
453
+ ".page-content",
454
+ ".article-body",
455
+ ".post-body",
456
+ ".story-content",
457
+ ".blog-content"
458
+ ];
459
+ for (const selector of contentSelectors) {
460
+ try {
461
+ const el = document.querySelector(selector);
462
+ if (isValidContent(el) && getLinkDensity(el) < 0.4) {
463
+ return el;
464
+ }
465
+ } catch {
466
+ }
467
+ }
468
+ const candidates = [];
469
+ const containers = document.querySelectorAll("div, section, article");
470
+ containers.forEach((el) => {
471
+ const text = el.textContent || "";
472
+ if (text.trim().length < 200) return;
473
+ const score = getContentScore(el);
474
+ if (score > 0) {
475
+ candidates.push({ el, score });
476
+ }
477
+ });
478
+ candidates.sort((a, b) => b.score - a.score);
479
+ if (candidates.length > 0 && candidates[0].score > 20) {
480
+ return candidates[0].el;
481
+ }
482
+ return null;
483
+ }
484
+ function cleanHtml(html, baseUrl, options = {}) {
485
+ const {
486
+ removeAds = true,
487
+ removeBase64Images = true,
488
+ onlyMainContent = true,
489
+ includeTags,
490
+ excludeTags
491
+ } = options;
492
+ const { document } = parseHTML(html);
493
+ removeElements(document, ALWAYS_REMOVE_SELECTORS);
494
+ removeElements(document, OVERLAY_SELECTORS);
806
495
  if (removeAds) {
807
- for (const selector of AD_SELECTORS) {
496
+ removeElements(document, AD_SELECTORS);
497
+ }
498
+ if (excludeTags && excludeTags.length > 0) {
499
+ removeElements(document, excludeTags);
500
+ }
501
+ if (onlyMainContent) {
502
+ removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
503
+ const mainContent = findMainContent(document);
504
+ if (mainContent) {
505
+ const body = document.body;
506
+ if (body) {
507
+ const clone = mainContent.cloneNode(true);
508
+ body.innerHTML = "";
509
+ body.appendChild(clone);
510
+ }
511
+ }
512
+ }
513
+ if (includeTags && includeTags.length > 0) {
514
+ const matchedElements = [];
515
+ for (const selector of includeTags) {
808
516
  try {
809
- document.querySelectorAll(selector).forEach((el) => el.remove());
517
+ document.querySelectorAll(selector).forEach((el) => {
518
+ matchedElements.push(el.cloneNode(true));
519
+ });
810
520
  } catch {
811
521
  }
812
522
  }
523
+ if (matchedElements.length > 0) {
524
+ const body = document.body;
525
+ if (body) {
526
+ body.innerHTML = "";
527
+ matchedElements.forEach((el) => body.appendChild(el));
528
+ }
529
+ }
813
530
  }
814
531
  if (removeBase64Images) {
815
532
  removeBase64ImagesFromDocument(document);
@@ -834,7 +551,10 @@ function removeBase64ImagesFromDocument(document) {
834
551
  document.querySelectorAll("[style*='data:image']").forEach((el) => {
835
552
  const style = el.getAttribute("style");
836
553
  if (style) {
837
- const cleanedStyle = style.replace(/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi, "");
554
+ const cleanedStyle = style.replace(
555
+ /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
556
+ ""
557
+ );
838
558
  if (cleanedStyle.trim()) {
839
559
  el.setAttribute("style", cleanedStyle);
840
560
  } else {
@@ -871,7 +591,7 @@ function cleanContent(html, baseUrl, options = {}) {
871
591
  }
872
592
 
873
593
  // src/utils/metadata-extractor.ts
874
- import { parseHTML as parseHTML3 } from "linkedom";
594
+ import { parseHTML as parseHTML2 } from "linkedom";
875
595
 
876
596
  // src/utils/url-helpers.ts
877
597
  import { URL as URL2 } from "url";
@@ -944,8 +664,26 @@ function isSameDomain(url, baseUrl) {
944
664
  function getUrlKey(url) {
945
665
  try {
946
666
  const parsedUrl = new URL2(url);
667
+ parsedUrl.hash = "";
947
668
  parsedUrl.search = "";
948
- return parsedUrl.toString().toLowerCase();
669
+ if (parsedUrl.hostname.startsWith("www.")) {
670
+ parsedUrl.hostname = parsedUrl.hostname.slice(4);
671
+ }
672
+ if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
673
+ parsedUrl.port = "";
674
+ }
675
+ const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
676
+ for (const indexFile of indexFiles) {
677
+ if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
678
+ parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
679
+ break;
680
+ }
681
+ }
682
+ let normalized = parsedUrl.toString().toLowerCase();
683
+ if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
684
+ normalized = normalized.slice(0, -1);
685
+ }
686
+ return normalized;
949
687
  } catch {
950
688
  return url.toLowerCase();
951
689
  }
@@ -1012,7 +750,7 @@ function extractMetadata(html, baseUrl) {
1012
750
  return extractWebsiteMetadata(html, baseUrl);
1013
751
  }
1014
752
  function extractWebsiteMetadata(html, baseUrl) {
1015
- const { document } = parseHTML3(html);
753
+ const { document } = parseHTML2(html);
1016
754
  const metadata = {
1017
755
  title: null,
1018
756
  description: null,
@@ -1294,13 +1032,15 @@ function isUrlAllowed(url, rules) {
1294
1032
  var DEFAULT_OPTIONS = {
1295
1033
  urls: [],
1296
1034
  formats: ["markdown"],
1297
- includeMetadata: true,
1298
1035
  timeoutMs: 3e4,
1299
1036
  includePatterns: [],
1300
1037
  excludePatterns: [],
1301
1038
  // Content cleaning defaults
1302
1039
  removeAds: true,
1303
1040
  removeBase64Images: true,
1041
+ onlyMainContent: true,
1042
+ includeTags: [],
1043
+ excludeTags: [],
1304
1044
  skipTLSVerification: true,
1305
1045
  // Batch defaults
1306
1046
  batchConcurrency: 1,
@@ -1457,14 +1197,9 @@ var Scraper = class {
1457
1197
  } catch {
1458
1198
  }
1459
1199
  await hero.waitForPaintingStable();
1460
- let hadChallenge = false;
1461
- let challengeType = "none";
1462
- let waitTimeMs = 0;
1463
1200
  const initialUrl = await hero.url;
1464
1201
  const detection = await detectChallenge(hero);
1465
1202
  if (detection.isChallenge) {
1466
- hadChallenge = true;
1467
- challengeType = detection.type;
1468
1203
  if (this.options.verbose) {
1469
1204
  this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1470
1205
  }
@@ -1474,12 +1209,11 @@ var Scraper = class {
1474
1209
  verbose: this.options.verbose,
1475
1210
  initialUrl
1476
1211
  });
1477
- waitTimeMs = result2.waitedMs;
1478
1212
  if (!result2.resolved) {
1479
1213
  throw new Error(`Challenge not resolved: ${detection.type}`);
1480
1214
  }
1481
1215
  if (this.options.verbose) {
1482
- this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
1216
+ this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1483
1217
  }
1484
1218
  }
1485
1219
  await this.waitForFinalPage(hero, url, this.options.verbose);
@@ -1492,45 +1226,18 @@ var Scraper = class {
1492
1226
  this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1493
1227
  }
1494
1228
  }
1495
- const pageTitle = await hero.document.title;
1496
1229
  const html = await hero.document.documentElement.outerHTML;
1497
1230
  const cleanedHtml = cleanContent(html, url, {
1498
1231
  removeAds: this.options.removeAds,
1499
- removeBase64Images: this.options.removeBase64Images
1232
+ removeBase64Images: this.options.removeBase64Images,
1233
+ onlyMainContent: this.options.onlyMainContent,
1234
+ includeTags: this.options.includeTags,
1235
+ excludeTags: this.options.excludeTags
1500
1236
  });
1501
1237
  const websiteMetadata = extractMetadata(cleanedHtml, url);
1502
1238
  const duration = Date.now() - startTime;
1503
- const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
1504
- const page = {
1505
- url,
1506
- title: pageTitle,
1507
- markdown: "",
1508
- // Will be set by formatter
1509
- html: cleanedHtml,
1510
- fetchedAt: scrapedAt,
1511
- depth: 0,
1512
- hadChallenge,
1513
- challengeType,
1514
- waitTimeMs
1515
- };
1516
- const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
1517
- [page],
1518
- url,
1519
- scrapedAt,
1520
- duration,
1521
- websiteMetadata,
1522
- this.options.includeMetadata
1523
- ) : void 0;
1524
- const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1525
- const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1526
- const text = this.options.formats.includes("text") ? formatToText(
1527
- [page],
1528
- url,
1529
- scrapedAt,
1530
- duration,
1531
- websiteMetadata,
1532
- this.options.includeMetadata
1533
- ) : void 0;
1239
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1240
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1534
1241
  if (this.options.onProgress) {
1535
1242
  this.options.onProgress({
1536
1243
  completed: index + 1,
@@ -1562,8 +1269,6 @@ var Scraper = class {
1562
1269
  const result = {
1563
1270
  markdown,
1564
1271
  html: htmlOutput,
1565
- json,
1566
- text,
1567
1272
  metadata: {
1568
1273
  baseUrl: url,
1569
1274
  totalPages: 1,
@@ -1618,7 +1323,7 @@ async function scrape(options) {
1618
1323
  }
1619
1324
 
1620
1325
  // src/crawler.ts
1621
- import { parseHTML as parseHTML4 } from "linkedom";
1326
+ import { parseHTML as parseHTML3 } from "linkedom";
1622
1327
 
1623
1328
  // src/utils/rate-limiter.ts
1624
1329
  import pLimit2 from "p-limit";
@@ -1767,12 +1472,26 @@ var Crawler = class {
1767
1472
  */
1768
1473
  extractLinks(html, baseUrl, depth) {
1769
1474
  const links = [];
1770
- const { document } = parseHTML4(html);
1475
+ const { document } = parseHTML3(html);
1771
1476
  document.querySelectorAll("a[href]").forEach((anchor) => {
1772
- const href = anchor.getAttribute("href");
1477
+ const rawHref = anchor.getAttribute("href");
1478
+ if (!rawHref) return;
1479
+ const href = rawHref.trim();
1773
1480
  if (!href) return;
1774
- const resolved = resolveUrl(href, baseUrl);
1481
+ if (href.startsWith("#")) return;
1482
+ const lowerHref = href.toLowerCase();
1483
+ if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
1484
+ return;
1485
+ }
1486
+ let resolved = resolveUrl(href, baseUrl);
1775
1487
  if (!resolved || !isValidUrl(resolved)) return;
1488
+ try {
1489
+ const parsed = new URL(resolved);
1490
+ parsed.hash = "";
1491
+ resolved = parsed.toString();
1492
+ } catch {
1493
+ return;
1494
+ }
1776
1495
  if (!isSameDomain(resolved, this.options.url)) return;
1777
1496
  if (!isContentUrl(resolved)) return;
1778
1497
  if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
@@ -2883,9 +2602,9 @@ program.command("status").description("Check daemon status").option("-p, --port
2883
2602
  });
2884
2603
  program.command("scrape <urls...>").description("Scrape one or more URLs").option(
2885
2604
  "-f, --format <formats>",
2886
- "Output formats (comma-separated: markdown,html,json,text)",
2605
+ "Content formats to include (comma-separated: markdown,html)",
2887
2606
  "markdown"
2888
- ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--no-metadata", "Exclude metadata from output").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (urls, options) => {
2607
+ ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
2889
2608
  const port = parseInt(options.port, 10);
2890
2609
  const useStandalone = options.standalone || false;
2891
2610
  let useDaemon = false;
@@ -2902,7 +2621,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2902
2621
  }) : null;
2903
2622
  try {
2904
2623
  const formats = options.format.split(",").map((f) => f.trim());
2905
- const validFormats = ["markdown", "html", "json", "text"];
2624
+ const validFormats = ["markdown", "html"];
2906
2625
  for (const format of formats) {
2907
2626
  if (!validFormats.includes(format)) {
2908
2627
  console.error(
@@ -2915,6 +2634,8 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2915
2634
  console.error(`Scraping ${urls.length} URL(s)...`);
2916
2635
  console.error(`Formats: ${formats.join(", ")}`);
2917
2636
  }
2637
+ const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
2638
+ const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
2918
2639
  const scrapeOptions = {
2919
2640
  urls,
2920
2641
  formats,
@@ -2923,33 +2644,26 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2923
2644
  batchTimeoutMs: parseInt(options.batchTimeout, 10),
2924
2645
  proxy: options.proxy ? { url: options.proxy } : void 0,
2925
2646
  userAgent: options.userAgent,
2926
- includeMetadata: options.metadata !== false,
2927
2647
  verbose: options.verbose || false,
2928
2648
  showChrome: options.showChrome || false,
2649
+ // Content cleaning options
2650
+ onlyMainContent: options.mainContent !== false,
2651
+ // --no-main-content sets this to false
2652
+ includeTags,
2653
+ excludeTags,
2929
2654
  onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
2930
2655
  console.error(`[${completed}/${total}] ${currentUrl}`);
2931
2656
  } : void 0
2932
2657
  };
2933
2658
  const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
2934
- let output = "";
2935
- for (const site of result.data) {
2936
- if (formats.includes("markdown") && site.markdown) {
2937
- output += site.markdown + "\n\n";
2938
- } else if (formats.includes("text") && site.text) {
2939
- output += site.text + "\n\n";
2940
- } else if (formats.includes("html") && site.html) {
2941
- output += site.html + "\n\n";
2942
- } else if (formats.includes("json") && site.json) {
2943
- output += site.json + "\n\n";
2944
- }
2945
- }
2659
+ const output = JSON.stringify(result, null, 2);
2946
2660
  if (options.output) {
2947
- writeFileSync(options.output, output.trim());
2661
+ writeFileSync(options.output, output);
2948
2662
  if (options.verbose) {
2949
2663
  console.error(`Output written to ${options.output}`);
2950
2664
  }
2951
2665
  } else {
2952
- console.log(output.trim());
2666
+ console.log(output);
2953
2667
  }
2954
2668
  if (options.verbose) {
2955
2669
  console.error(`
@@ -2972,7 +2686,7 @@ Summary:`);
2972
2686
  }
2973
2687
  }
2974
2688
  });
2975
- program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Output formats when scraping (comma-separated)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
2689
+ program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Content formats when scraping (comma-separated: markdown,html)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
2976
2690
  const port = parseInt(options.port, 10);
2977
2691
  const useStandalone = options.standalone || false;
2978
2692
  let useDaemon = false;
@@ -3008,38 +2722,20 @@ program.command("crawl <url>").description("Crawl a website to discover and opti
3008
2722
  verbose: options.verbose || false,
3009
2723
  showChrome: options.showChrome || false
3010
2724
  };
3011
- const result = useDaemon ? await daemonClient.crawl(crawlOptions) : await standaloneClient.crawl(crawlOptions);
3012
- let output = "";
3013
- if (options.scrape && result.scraped) {
3014
- const formats = options.format.split(",").map((f) => f.trim());
3015
- for (const site of result.scraped.data) {
3016
- if (formats.includes("markdown") && site.markdown) {
3017
- output += site.markdown + "\n\n";
3018
- } else if (formats.includes("text") && site.text) {
3019
- output += site.text + "\n\n";
3020
- } else if (formats.includes("html") && site.html) {
3021
- output += site.html + "\n\n";
3022
- } else if (formats.includes("json") && site.json) {
3023
- output += site.json + "\n\n";
3024
- }
3025
- }
3026
- } else {
3027
- output = JSON.stringify(
3028
- {
3029
- urls: result.urls,
3030
- metadata: result.metadata
3031
- },
3032
- null,
3033
- 2
3034
- );
3035
- }
2725
+ const formats = options.format.split(",").map((f) => f.trim());
2726
+ const crawlOptionsWithFormats = {
2727
+ ...crawlOptions,
2728
+ formats
2729
+ };
2730
+ const result = useDaemon ? await daemonClient.crawl(crawlOptionsWithFormats) : await standaloneClient.crawl(crawlOptionsWithFormats);
2731
+ const output = JSON.stringify(result, null, 2);
3036
2732
  if (options.output) {
3037
- writeFileSync(options.output, output.trim());
2733
+ writeFileSync(options.output, output);
3038
2734
  if (options.verbose) {
3039
2735
  console.error(`Output written to ${options.output}`);
3040
2736
  }
3041
2737
  } else {
3042
- console.log(output.trim());
2738
+ console.log(output);
3043
2739
  }
3044
2740
  if (options.verbose) {
3045
2741
  console.error(`