@vakra-dev/reader 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -1,4 +1,10 @@
1
1
  #!/usr/bin/env node
2
// __require: interop shim giving bundled ESM code a `require`-like callable.
// - If a global `require` exists when this line runs, __require IS that function.
// - Otherwise, if `Proxy` exists, __require is a Proxy around the fallback function
//   below; property reads are forwarded to `require` should it become defined
//   later, else to the fallback itself.
// - Otherwise __require is the bare fallback function.
// The fallback re-checks for `require` on every call and delegates to it with the
// caller's `this` and arguments; when none exists it throws
// 'Dynamic require of "<id>" is not supported'.
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
3
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
4
+ }) : x)(function(x) {
5
+ if (typeof require !== "undefined") return require.apply(this, arguments);
6
+ throw Error('Dynamic require of "' + x + '" is not supported');
7
+ });
2
8
 
3
9
  // src/cli/index.ts
4
10
  import { Command } from "commander";
@@ -12,27 +18,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
12
18
  import pLimit from "p-limit";
13
19
 
14
20
  // src/cloudflare/detector.ts
15
- var CHALLENGE_DOM_SELECTORS = [
21
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
16
22
  "#challenge-running",
17
23
  "#challenge-stage",
18
24
  "#challenge-form",
19
- ".cf-browser-verification"
25
+ ".cf-browser-verification",
26
+ "#cf-wrapper",
27
+ "#cf-hcaptcha-container",
28
+ "#turnstile-wrapper"
20
29
  ];
21
- var CHALLENGE_TEXT_PATTERNS = [
22
- "verifying you are human",
30
+ var CLOUDFLARE_TEXT_PATTERNS = [
23
31
  "checking if the site connection is secure",
24
- "this process is automatic. your browser will redirect"
32
+ "this process is automatic. your browser will redirect",
33
+ "ray id:",
34
+ "performance & security by cloudflare"
35
+ ];
36
// Substrings that detectChallenge searches for in the lower-cased page HTML.
// The first hit marks the page as served through Cloudflare; when none is
// present detectChallenge returns early with isChallenge: false, so the
// challenge/blocked checks only run for pages that contain one of these.
// Entries must stay lower-case because they are compared against lower-cased HTML.
+ var CLOUDFLARE_INFRA_PATTERNS = [
37
+ "/cdn-cgi/",
38
+ "cloudflare",
39
+ "__cf_bm",
40
+ "cf-ray"
25
41
  ];
26
- var BLOCKED_SIGNALS = [
27
- "you have been blocked",
28
- "access to this page has been denied",
42
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
29
43
  "sorry, you have been blocked",
30
- "access denied",
31
- "403 forbidden"
44
+ "ray id:"
32
45
  ];
33
46
  async function detectChallenge(hero) {
34
47
  const signals = [];
35
48
  let type = "none";
49
+ let hasCloudflareInfra = false;
50
+ let hasChallengeIndicator = false;
36
51
  try {
37
52
  if (!hero.document) {
38
53
  return {
@@ -44,30 +59,51 @@ async function detectChallenge(hero) {
44
59
  }
45
60
  const html = await hero.document.documentElement.outerHTML;
46
61
  const htmlLower = html.toLowerCase();
47
- for (const selector of CHALLENGE_DOM_SELECTORS) {
48
- if (htmlLower.includes(selector.toLowerCase())) {
49
- signals.push(`Challenge element: ${selector}`);
50
- type = "js_challenge";
62
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
63
+ if (htmlLower.includes(pattern)) {
64
+ hasCloudflareInfra = true;
65
+ signals.push(`Cloudflare infra: "${pattern}"`);
66
+ break;
67
+ }
68
+ }
69
+ if (!hasCloudflareInfra) {
70
+ return {
71
+ isChallenge: false,
72
+ type: "none",
73
+ confidence: 0,
74
+ signals: ["No Cloudflare infrastructure detected"]
75
+ };
76
+ }
77
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
78
+ try {
79
+ const element = await hero.document.querySelector(selector);
80
+ if (element) {
81
+ hasChallengeIndicator = true;
82
+ signals.push(`Challenge element: ${selector}`);
83
+ type = "js_challenge";
84
+ }
85
+ } catch {
51
86
  }
52
87
  }
53
- for (const pattern of CHALLENGE_TEXT_PATTERNS) {
88
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
54
89
  if (htmlLower.includes(pattern)) {
90
+ hasChallengeIndicator = true;
55
91
  signals.push(`Challenge text: "${pattern}"`);
56
92
  type = type === "none" ? "js_challenge" : type;
57
93
  }
58
94
  }
59
95
  if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
96
+ hasChallengeIndicator = true;
60
97
  signals.push('Challenge text: "waiting for...to respond"');
61
98
  type = type === "none" ? "js_challenge" : type;
62
99
  }
63
- for (const pattern of BLOCKED_SIGNALS) {
64
- if (htmlLower.includes(pattern)) {
65
- signals.push(`Blocked: "${pattern}"`);
66
- type = "blocked";
67
- break;
68
- }
100
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
101
+ if (hasBlocked) {
102
+ hasChallengeIndicator = true;
103
+ signals.push("Cloudflare block page detected");
104
+ type = "blocked";
69
105
  }
70
- const isChallenge = signals.length > 0;
106
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
71
107
  const confidence = isChallenge ? 100 : 0;
72
108
  return {
73
109
  isChallenge,
@@ -150,84 +186,6 @@ var turndownService = new TurndownService({
150
186
  linkStyle: "inlined",
151
187
  linkReferenceStyle: "full"
152
188
  });
153
- function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
154
- const sections = [];
155
- if (includeMetadata) {
156
- sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
157
- }
158
- if (pages.length > 1) {
159
- sections.push(createMarkdownTOC(pages));
160
- }
161
- sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
162
- return sections.join("\n\n");
163
- }
164
- function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
165
- const title = website.title || extractDomainFromUrl(baseUrl);
166
- const description = website.description || "";
167
- let header = `# Website Scrape: ${title}
168
-
169
- `;
170
- header += `**Base URL:** ${baseUrl}
171
- `;
172
- header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
173
- `;
174
- header += `**Duration:** ${duration}ms
175
- `;
176
- header += `**Total pages:** ${totalPages}
177
- `;
178
- if (description) {
179
- header += `**Description:** ${description}
180
- `;
181
- }
182
- if (website.author) {
183
- header += `**Author:** ${website.author}
184
- `;
185
- }
186
- if (website.language) {
187
- header += `**Language:** ${website.language}
188
- `;
189
- }
190
- return header;
191
- }
192
- function createMarkdownTOC(pages) {
193
- let toc = "## Table of Contents\n\n";
194
- pages.forEach((page, index) => {
195
- const depth = " ".repeat(page.depth);
196
- const pageNumber = index + 1;
197
- const title = page.title || `Page ${pageNumber}`;
198
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
199
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
200
- toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
201
- `;
202
- });
203
- return toc;
204
- }
205
- function createMarkdownPage(page, pageNumber) {
206
- const title = page.title || `Page ${pageNumber}`;
207
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
208
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
209
- let pageContent = `---
210
-
211
- `;
212
- pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
213
-
214
- `;
215
- pageContent += `**URL:** ${page.url}
216
- `;
217
- pageContent += `**Title:** ${page.title}
218
- `;
219
- pageContent += `**Depth:** ${page.depth}
220
- `;
221
- pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
222
-
223
- `;
224
- pageContent += `---
225
-
226
- `;
227
- const markdown = htmlToMarkdown(page.html);
228
- pageContent += markdown;
229
- return pageContent;
230
- }
231
189
  function htmlToMarkdown(html) {
232
190
  try {
233
191
  return turndownService.turndown(html);
@@ -236,574 +194,339 @@ function htmlToMarkdown(html) {
236
194
  return html.replace(/<[^>]*>/g, "").trim();
237
195
  }
238
196
  }
239
- function extractDomainFromUrl(url) {
240
- try {
241
- return new URL(url).hostname;
242
- } catch {
243
- return "Unknown";
244
- }
245
- }
246
-
247
- // src/formatters/html.ts
248
- function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
249
- const html = `<!DOCTYPE html>
250
- <html lang="${website.language || "en"}">
251
- <head>
252
- <meta charset="${website.charset || "UTF-8"}">
253
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
254
- <title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
255
- ${generateMetaTags(website)}
256
- <style>
257
- ${generateCSS()}
258
- </style>
259
- </head>
260
- <body>
261
- <header class="header">
262
- <h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
263
- <div class="meta-info">
264
- <p><strong>Base URL:</strong> <a href="${escapeHtml(
265
- baseUrl
266
- )}" target="_blank">${escapeHtml(baseUrl)}</a></p>
267
- <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
268
- <p><strong>Duration:</strong> ${duration}ms</p>
269
- <p><strong>Total pages:</strong> ${pages.length}</p>
270
- ${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
271
- ${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
272
- ${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
273
- </div>
274
- </header>
275
-
276
- ${pages.length > 1 ? generateTOC(pages) : ""}
277
-
278
- <main class="content">
279
- ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
280
- </main>
281
-
282
- <footer class="footer">
283
- <p>Generated by Reader JS/TS SDK</p>
284
- </footer>
285
-
286
- <script>
287
- ${generateJavaScript()}
288
- </script>
289
- </body>
290
- </html>`;
291
- return html;
292
- }
293
- function generateMetaTags(website) {
294
- const tags = [];
295
- if (website.description) {
296
- tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
297
- }
298
- if (website.author) {
299
- tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
300
- }
301
- if (website.keywords) {
302
- tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
303
- }
304
- if (website.robots) {
305
- tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
306
- }
307
- if (website.themeColor) {
308
- tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
309
- }
310
- if (website.favicon) {
311
- tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
312
- }
313
- if (website.canonical) {
314
- tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
315
- }
316
- if (website.openGraph) {
317
- const og = website.openGraph;
318
- if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
319
- if (og.description)
320
- tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
321
- if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
322
- if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
323
- if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
324
- if (og.siteName)
325
- tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
326
- if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
327
- }
328
- if (website.twitter) {
329
- const twitter = website.twitter;
330
- if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
331
- if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
332
- if (twitter.creator)
333
- tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
334
- if (twitter.title)
335
- tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
336
- if (twitter.description)
337
- tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
338
- if (twitter.image)
339
- tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
340
- }
341
- return tags.join("\n ");
342
- }
343
- function generateCSS() {
344
- return `
345
- * {
346
- margin: 0;
347
- padding: 0;
348
- box-sizing: border-box;
349
- }
350
-
351
- body {
352
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
353
- line-height: 1.6;
354
- color: #333;
355
- background-color: #f8f9fa;
356
- }
357
-
358
- .header {
359
- background: white;
360
- padding: 2rem;
361
- border-bottom: 1px solid #e9ecef;
362
- margin-bottom: 2rem;
363
- }
364
-
365
- .header h1 {
366
- color: #2c3e50;
367
- margin-bottom: 1rem;
368
- font-size: 2rem;
369
- }
370
-
371
- .meta-info {
372
- display: grid;
373
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
374
- gap: 0.5rem;
375
- }
376
-
377
- .meta-info p {
378
- margin: 0.25rem 0;
379
- font-size: 0.9rem;
380
- color: #6c757d;
381
- }
382
-
383
- .toc {
384
- background: white;
385
- padding: 1.5rem;
386
- margin: 2rem 0;
387
- border-radius: 8px;
388
- border: 1px solid #e9ecef;
389
- }
390
-
391
- .toc h2 {
392
- color: #2c3e50;
393
- margin-bottom: 1rem;
394
- font-size: 1.25rem;
395
- }
396
-
397
- .toc ul {
398
- list-style: none;
399
- }
400
-
401
- .toc li {
402
- margin: 0.5rem 0;
403
- }
404
-
405
- .toc a {
406
- color: #007bff;
407
- text-decoration: none;
408
- transition: color 0.2s;
409
- }
410
-
411
- .toc a:hover {
412
- color: #0056b3;
413
- text-decoration: underline;
414
- }
415
-
416
- .content {
417
- max-width: 800px;
418
- margin: 0 auto;
419
- padding: 0 1rem;
420
- }
421
-
422
- .page {
423
- background: white;
424
- margin: 2rem 0;
425
- padding: 2rem;
426
- border-radius: 8px;
427
- border: 1px solid #e9ecef;
428
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
429
- }
430
-
431
- .page-header {
432
- border-bottom: 2px solid #e9ecef;
433
- padding-bottom: 1rem;
434
- margin-bottom: 2rem;
435
- }
436
-
437
- .page-header h2 {
438
- color: #2c3e50;
439
- margin-bottom: 0.5rem;
440
- font-size: 1.5rem;
441
- }
442
-
443
- .page-meta {
444
- display: flex;
445
- flex-wrap: wrap;
446
- gap: 1rem;
447
- font-size: 0.9rem;
448
- color: #6c757d;
449
- }
450
-
451
- .page-content {
452
- line-height: 1.8;
453
- }
454
-
455
- .page-content h1, .page-content h2, .page-content h3,
456
- .page-content h4, .page-content h5, .page-content h6 {
457
- color: #2c3e50;
458
- margin: 1.5rem 0 0.5rem 0;
459
- }
460
-
461
- .page-content p {
462
- margin: 1rem 0;
463
- }
464
-
465
- .page-content a {
466
- color: #007bff;
467
- text-decoration: none;
468
- }
469
-
470
- .page-content a:hover {
471
- text-decoration: underline;
472
- }
473
-
474
- .page-content code {
475
- background: #f8f9fa;
476
- padding: 0.2rem 0.4rem;
477
- border-radius: 4px;
478
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
479
- font-size: 0.9em;
480
- }
481
-
482
- .page-content pre {
483
- background: #f8f9fa;
484
- padding: 1rem;
485
- border-radius: 4px;
486
- overflow-x: auto;
487
- margin: 1rem 0;
488
- }
489
-
490
- .page-content blockquote {
491
- border-left: 4px solid #007bff;
492
- padding-left: 1rem;
493
- margin: 1rem 0;
494
- color: #6c757d;
495
- }
496
-
497
- .footer {
498
- text-align: center;
499
- padding: 2rem;
500
- margin-top: 3rem;
501
- border-top: 1px solid #e9ecef;
502
- color: #6c757d;
503
- font-size: 0.9rem;
504
- }
505
-
506
- @media (max-width: 768px) {
507
- .header {
508
- padding: 1rem;
509
- }
510
-
511
- .header h1 {
512
- font-size: 1.5rem;
513
- }
514
-
515
- .page {
516
- padding: 1rem;
517
- }
518
-
519
- .page-meta {
520
- flex-direction: column;
521
- gap: 0.5rem;
522
- }
523
- }
524
- `.trim();
525
- }
526
- function generateTOC(pages) {
527
- const tocItems = pages.map((page, index) => {
528
- const pageNumber = index + 1;
529
- const title = page.title || `Page ${pageNumber}`;
530
- const id = `page-${pageNumber}`;
531
- return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
532
- }).join("\n");
533
- return `
534
- <nav class="toc">
535
- <h2>Table of Contents</h2>
536
- <ul>
537
- ${tocItems}
538
- </ul>
539
- </nav>`;
540
- }
541
- function generatePageHTML(page, pageNumber) {
542
- const id = `page-${pageNumber}`;
543
- const title = page.title || `Page ${pageNumber}`;
544
- return `
545
- <article class="page" id="${id}">
546
- <div class="page-header">
547
- <h2>${pageNumber}. ${escapeHtml(title)}</h2>
548
- <div class="page-meta">
549
- <span><strong>URL:</strong> <a href="${escapeHtml(
550
- page.url
551
- )}" target="_blank">${escapeHtml(page.url)}</a></span>
552
- <span><strong>Depth:</strong> ${page.depth}</span>
553
- <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
554
- </div>
555
- </div>
556
- <div class="page-content">
557
- ${page.html}
558
- </div>
559
- </article>`;
560
- }
561
- function generateJavaScript() {
562
- return `
563
- // Smooth scrolling for TOC links
564
- document.querySelectorAll('a[href^="#"]').forEach(anchor => {
565
- anchor.addEventListener('click', function (e) {
566
- e.preventDefault();
567
- const target = document.querySelector(this.getAttribute('href'));
568
- if (target) {
569
- target.scrollIntoView({
570
- behavior: 'smooth',
571
- block: 'start'
572
- });
573
- }
574
- });
575
- });
576
-
577
- // Highlight current section in TOC
578
- window.addEventListener('scroll', function() {
579
- const pages = document.querySelectorAll('.page');
580
- const tocLinks = document.querySelectorAll('.toc a');
581
-
582
- let currentPage = null;
583
- pages.forEach(page => {
584
- const rect = page.getBoundingClientRect();
585
- if (rect.top <= 100) {
586
- currentPage = page;
587
- }
588
- });
589
-
590
- tocLinks.forEach(link => {
591
- link.style.fontWeight = 'normal';
592
- const target = document.querySelector(link.getAttribute('href'));
593
- if (target === currentPage) {
594
- link.style.fontWeight = 'bold';
595
- }
596
- });
597
- });
598
- `;
599
- }
600
- function escapeHtml(text) {
601
- return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\//g, "&#x2F;");
602
- }
603
- function extractDomainFromUrl2(url) {
604
- try {
605
- return new URL(url).hostname;
606
- } catch {
607
- return "Unknown";
608
- }
609
- }
610
-
611
- // src/formatters/json.ts
612
- function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
613
- const jsonResult = {
614
- metadata: {
615
- baseUrl,
616
- totalPages: pages.length,
617
- scrapedAt,
618
- duration,
619
- website
620
- },
621
- pages: pages.map((page, index) => ({
622
- index: index + 1,
623
- url: page.url,
624
- title: page.title,
625
- markdown: page.markdown,
626
- html: page.html,
627
- fetchedAt: page.fetchedAt,
628
- depth: page.depth,
629
- wordCount: countWords(page.markdown),
630
- readingTime: estimateReadingTime(page.markdown)
631
- }))
632
- };
633
- return JSON.stringify(jsonResult, null, 2);
634
- }
635
- function countWords(markdown) {
636
- const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
637
- return plainText.split(/\s+/).filter((word) => word.length > 0).length;
638
- }
639
- function estimateReadingTime(markdown) {
640
- const wordCount = countWords(markdown);
641
- return Math.ceil(wordCount / 200);
642
- }
643
-
644
- // src/formatters/text.ts
645
- import { parseHTML } from "linkedom";
646
- function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
647
- const sections = [];
648
- if (includeMetadata) {
649
- sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
650
- }
651
- sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
652
- return sections.join("\n\n");
653
- }
654
- function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
655
- const title = website.title || extractDomainFromUrl3(baseUrl);
656
- const lines = [];
657
- lines.push(`=== ${title} ===`);
658
- lines.push("");
659
- lines.push(`URL: ${baseUrl}`);
660
- lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
661
- lines.push(`Duration: ${duration}ms`);
662
- lines.push(`Pages: ${totalPages}`);
663
- if (website.description) {
664
- lines.push(`Description: ${website.description}`);
665
- }
666
- if (website.author) {
667
- lines.push(`Author: ${website.author}`);
668
- }
669
- if (website.language) {
670
- lines.push(`Language: ${website.language}`);
671
- }
672
- return lines.join("\n");
673
- }
674
- function createTextPage(page, pageNumber, showSeparator) {
675
- const lines = [];
676
- if (showSeparator) {
677
- lines.push("\u2500".repeat(60));
678
- lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
679
- lines.push(`URL: ${page.url}`);
680
- lines.push("\u2500".repeat(60));
681
- }
682
- const plainText = htmlToPlainText(page.html);
683
- lines.push(plainText);
684
- return lines.join("\n");
685
- }
686
- function htmlToPlainText(html) {
687
- const { document } = parseHTML(html);
688
- const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
689
- elementsToRemove.forEach((tag) => {
690
- document.querySelectorAll(tag).forEach((el) => el.remove());
691
- });
692
- let text = document.body?.textContent || document.documentElement?.textContent || "";
693
- text = text.replace(/[ \t]+/g, " ");
694
- text = text.replace(/\n[ \t]+/g, "\n");
695
- text = text.replace(/[ \t]+\n/g, "\n");
696
- text = text.replace(/\n{3,}/g, "\n\n");
697
- text = text.trim();
698
- return text;
699
- }
700
- function extractDomainFromUrl3(url) {
701
- try {
702
- return new URL(url).hostname;
703
- } catch {
704
- return "Unknown";
705
- }
706
- }
707
197
 
708
198
  // src/utils/content-cleaner.ts
709
- import { parseHTML as parseHTML2 } from "linkedom";
199
+ import { parseHTML } from "linkedom";
710
200
  var ALWAYS_REMOVE_SELECTORS = [
711
- // Navigation and menus
712
- "nav",
713
- "header nav",
714
- "footer nav",
715
- ".nav",
716
- ".navigation",
717
- ".menu",
718
- ".navbar",
719
- ".sidebar",
720
- ".aside",
721
- // Header and footer elements
722
- "header",
723
- "footer",
724
- ".site-header",
725
- ".page-header",
726
- ".site-footer",
727
- ".page-footer",
728
- // Social media and sharing
729
- ".social",
730
- ".share",
731
- ".sharing",
732
- ".twitter",
733
- ".facebook",
734
- ".linkedin",
735
- ".instagram",
736
- // Comments and discussions
737
- ".comments",
738
- ".comment",
739
- ".discussion",
740
- ".disqus",
741
- // Forms and interactive elements
742
- "form",
743
- "input",
744
- "button:not([type='submit'])",
745
- "select",
746
- "textarea",
747
201
  // Scripts and styles
748
202
  "script",
749
203
  "style",
750
204
  "noscript",
205
+ "link[rel='stylesheet']",
751
206
  // Hidden elements
752
207
  "[hidden]",
208
+ "[aria-hidden='true']",
753
209
  "[style*='display: none']",
754
210
  "[style*='display:none']",
755
- // Common utility classes
756
- ".cookie",
757
- ".cookie-banner",
758
- ".popup",
211
+ "[style*='visibility: hidden']",
212
+ "[style*='visibility:hidden']",
213
+ // SVG icons and decorative elements
214
+ "svg[aria-hidden='true']",
215
+ "svg.icon",
216
+ "svg[class*='icon']",
217
+ // Template and metadata
218
+ "template",
219
+ "meta",
220
+ // Embeds that don't convert to text
221
+ "iframe",
222
+ "canvas",
223
+ "object",
224
+ "embed",
225
+ // Forms (usually not main content)
226
+ "form",
227
+ "input",
228
+ "select",
229
+ "textarea",
230
+ "button"
231
+ ];
232
// CSS selectors for overlay-style elements: modals, popups, dialogs,
// cookie/consent/GDPR banners, and elements with inline fixed/sticky positioning.
// cleanHtml passes this list to removeElements, so every match is removed
// regardless of the onlyMainContent option.
// The [class*=...] / [id*=...] / [style*=...] forms are substring matches on the
// attribute value, so they also hit longer names that merely contain the word.
+ var OVERLAY_SELECTORS = [
233
+ "[class*='modal']",
234
+ "[class*='popup']",
235
+ "[class*='overlay']",
236
+ "[class*='dialog']",
237
+ "[role='dialog']",
238
+ "[role='alertdialog']",
239
+ "[class*='cookie']",
240
+ "[class*='consent']",
241
+ "[class*='gdpr']",
242
+ "[class*='privacy-banner']",
243
+ "[class*='notification-bar']",
244
+ "[id*='cookie']",
245
+ "[id*='consent']",
246
+ "[id*='gdpr']",
247
+ // Fixed/sticky positioned elements
248
+ "[style*='position: fixed']",
249
+ "[style*='position:fixed']",
250
+ "[style*='position: sticky']",
251
+ "[style*='position:sticky']"
252
+ ];
253
+ var NAVIGATION_SELECTORS = [
254
+ // Semantic elements
255
+ "header",
256
+ "footer",
257
+ "nav",
258
+ "aside",
259
+ // Header variations
260
+ ".header",
261
+ ".top",
262
+ ".navbar",
263
+ "#header",
264
+ // Footer variations
265
+ ".footer",
266
+ ".bottom",
267
+ "#footer",
268
+ // Sidebars
269
+ ".sidebar",
270
+ ".side",
271
+ ".aside",
272
+ "#sidebar",
273
+ // Modals/popups (backup if not caught by OVERLAY_SELECTORS)
759
274
  ".modal",
275
+ ".popup",
276
+ "#modal",
760
277
  ".overlay",
761
- ".notification",
278
+ // Ads
279
+ ".ad",
280
+ ".ads",
281
+ ".advert",
282
+ "#ad",
283
+ // Language selectors
284
+ ".lang-selector",
285
+ ".language",
286
+ "#language-selector",
287
+ // Social
288
+ ".social",
289
+ ".social-media",
290
+ ".social-links",
291
+ "#social",
292
+ // Navigation/menus
293
+ ".menu",
294
+ ".navigation",
295
+ "#nav",
762
296
  // Breadcrumbs
763
- ".breadcrumb",
764
297
  ".breadcrumbs",
765
- ".breadcrumb-trail"
298
+ "#breadcrumbs",
299
+ // Share buttons
300
+ ".share",
301
+ "#share",
302
+ // Widgets
303
+ ".widget",
304
+ "#widget",
305
+ // Cookie notices (backup)
306
+ ".cookie",
307
+ "#cookie"
308
+ ];
309
// Selectors for elements treated as likely main-content containers.
// cleanHtml hands this list to removeWithProtection as the protected set:
// an element matched by NAVIGATION_SELECTORS is kept when it matches one of
// these selectors itself or has a descendant that does.
+ var FORCE_INCLUDE_SELECTORS = [
310
+ // IDs
311
+ "#main",
312
+ "#content",
313
+ "#main-content",
314
+ "#article",
315
+ "#post",
316
+ "#page-content",
317
+ // Semantic elements
318
+ "main",
319
+ "article",
320
+ "[role='main']",
321
+ // Classes
322
+ ".main-content",
323
+ ".content",
324
+ ".post-content",
325
+ ".article-content",
326
+ ".entry-content",
327
+ ".page-content",
328
+ ".article-body",
329
+ ".post-body",
330
+ ".story-content",
331
+ ".blog-content"
766
332
  ];
767
333
  var AD_SELECTORS = [
768
- // Ads and promotions
769
- ".ad",
770
- ".ads",
771
- ".advertisement",
772
- ".promotion",
773
- ".sponsored",
774
- "[class*='ad-']",
775
- "[id*='ad-']",
776
- "[class*='advert']",
777
- "[id*='advert']",
778
- "[class*='banner']",
779
- "[id*='banner']",
334
+ // Google ads
335
+ "ins.adsbygoogle",
780
336
  ".google-ad",
781
337
  ".adsense",
338
+ // Generic ad containers
782
339
  "[data-ad]",
783
340
  "[data-ads]",
784
- "ins.adsbygoogle",
785
- // Tracking
786
- "[class*='tracking']",
787
- "[id*='tracking']",
788
- "[class*='analytics']",
789
- "[id*='analytics']"
341
+ "[data-ad-slot]",
342
+ "[data-ad-client]",
343
+ // Common ad class patterns
344
+ ".ad-container",
345
+ ".ad-wrapper",
346
+ ".advertisement",
347
+ ".sponsored-content",
348
+ // Tracking pixels
349
+ "img[width='1'][height='1']",
350
+ "img[src*='pixel']",
351
+ "img[src*='tracking']",
352
+ "img[src*='analytics']"
790
353
  ];
791
- function cleanHtml(html, baseUrl, options = {}) {
792
- const { removeAds = true, removeBase64Images = true } = options;
793
- const { document } = parseHTML2(html);
794
- for (const selector of ALWAYS_REMOVE_SELECTORS) {
354
/**
 * Share of an element's text that sits inside anchors.
 *
 * Lengths are measured on trimmed text: the element's own `textContent` for
 * the denominator, and each descendant `<a>`'s `textContent` (trimmed
 * individually) summed for the numerator.
 *
 * @param {Element} element - Node exposing `textContent` and `querySelectorAll`.
 * @returns {number} Anchor characters divided by total characters; `1` when
 *   the element has no non-whitespace text at all.
 */
function getLinkDensity(element) {
  const totalChars = (element.textContent || "").trim().length;
  if (totalChars === 0) {
    // An empty element is treated as "all links" so callers reject it.
    return 1;
  }

  const anchorChars = Array.from(element.querySelectorAll("a")).reduce(
    (sum, anchor) => sum + (anchor.textContent || "").trim().length,
    0
  );

  return anchorChars / totalChars;
}
364
/**
 * Heuristic score estimating how likely `element` is to hold the page's main
 * content. Higher is better; the value may be negative.
 *
 * Contributions:
 *  - trimmed text length / 100, capped at 50
 *  - +3 per `<p>`, +2 per heading (`h1`-`h6`), +1 per `<img>`
 *  - -0.5 per `<a>`, -0.2 per `<li>`
 *  - -30 when link density > 0.5, otherwise -15 when > 0.3
 *  - +25 when class/id contains a content word
 *    (article, content, post, body, main, entry)
 *  - -25 when class/id contains a boilerplate word
 *    (comment, sidebar, footer, nav, menu, header, widget) or an ad token
 *
 * Ad markers are matched as whole name tokens (`ad`, `ads`, `adsense`,
 * `adsbygoogle`, `advert*`) after splitting class/id on separators and
 * camelCase boundaries. A plain substring test for "ad" also fires on
 * unrelated names such as "heading", "loading" or "thread" and wrongly
 * penalises real content containers.
 *
 * @param {Element} element - Node exposing `textContent`, `querySelectorAll`,
 *   `className` and `id`.
 * @returns {number} The accumulated score.
 */
function getContentScore(element) {
  let score = 0;

  const text = element.textContent || "";
  const textLength = text.trim().length;
  score += Math.min(textLength / 100, 50);

  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length * 1;
  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) {
    score -= 30;
  } else if (linkDensity > 0.3) {
    score -= 15;
  }

  // className is not a string on every node type; only use it when it is.
  const className = typeof element.className === "string" ? element.className : "";
  const classAndId = `${className} ${element.id || ""}`;

  if (/article|content|post|body|main|entry/i.test(classAndId)) {
    score += 25;
  }

  // Break "sidebarAd top_ad-slot" into ["sidebar", "ad", "top", "ad", "slot"].
  const nameTokens = classAndId
    .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter(Boolean);
  const hasAdToken = nameTokens.some((token) =>
    /^(?:ads?|adsense|adsbygoogle|advert[a-z]*)$/.test(token)
  );

  if (hasAdToken || /comment|sidebar|footer|nav|menu|header|widget/i.test(classAndId)) {
    score -= 25;
  }

  return score;
}
382
/**
 * Decides whether an element reads like a navigation block rather than prose.
 *
 * It is considered navigation when either:
 *  - more than half of its text is link text (link density > 0.5), or
 *  - it holds more than 5 `<li>` items and the number of `<a>` elements
 *    exceeds 80% of the number of list items.
 *
 * @param {Element} element - Node exposing `textContent` and `querySelectorAll`.
 * @returns {boolean} `true` when the element looks like navigation.
 */
function looksLikeNavigation(element) {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }

  const itemCount = element.querySelectorAll("li").length;
  const anchorCount = element.querySelectorAll("a").length;

  return itemCount > 5 && anchorCount > itemCount * 0.8;
}
390
/**
 * Detaches every element matched by any of the given selectors.
 *
 * Selectors are processed in order. Removal is best-effort: if querying a
 * selector or removing one of its matches throws (for example an invalid
 * selector), the rest of that selector's work is abandoned and processing
 * moves on to the next selector.
 *
 * @param {Document} document - Document (or node) exposing `querySelectorAll`.
 * @param {Iterable<string>} selectors - CSS selectors to remove.
 * @returns {void}
 */
function removeElements(document, selectors) {
  const detach = (node) => node.remove();

  for (const query of selectors) {
    try {
      document.querySelectorAll(query).forEach(detach);
    } catch {
      // Deliberately ignored: one bad selector must not stop the cleanup.
    }
  }
}
398
/**
 * Removes elements matched by `selectorsToRemove`, sparing anything that is,
 * or contains, a protected element.
 *
 * A matched element is kept when it matches one of `protectedSelectors`
 * itself, or when it has a descendant matching one of them. A protected
 * selector that throws during `matches`/`querySelector` counts as "no match".
 * A removal selector that throws while being queried or processed is skipped.
 *
 * @param {Document} document - Document (or node) exposing `querySelectorAll`.
 * @param {Iterable<string>} selectorsToRemove - Selectors whose matches are
 *   candidates for removal.
 * @param {string[]} protectedSelectors - Selectors that shield an element
 *   (or its ancestors) from removal.
 * @returns {void}
 */
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
  // Evaluate a selector probe, treating any thrown error as "not matched".
  const safely = (probe) => {
    try {
      return probe();
    } catch {
      return false;
    }
  };

  const isShielded = (node) =>
    protectedSelectors.some((guard) => safely(() => node.matches(guard))) ||
    protectedSelectors.some((guard) => safely(() => node.querySelector(guard) !== null));

  for (const query of selectorsToRemove) {
    try {
      document.querySelectorAll(query).forEach((node) => {
        if (!isShielded(node)) {
          node.remove();
        }
      });
    } catch {
      // Deliberately ignored: skip selectors the DOM implementation rejects.
    }
  }
}
424
/**
 * Locates the element most likely to hold the page's main content.
 *
 * Strategies are tried in order and the first hit wins:
 *  1. `<main>`, then `[role="main"]` - must be valid content with a link
 *     density below 0.4.
 *  2. `<article>` - only when the document has exactly one and it is valid
 *     content.
 *  3. A fixed list of common content ids/classes - same test as step 1;
 *     selectors that throw are skipped.
 *  4. Scoring fallback - every `div`, `section` and `article` with at least
 *     200 characters of trimmed text is scored with `getContentScore`; the
 *     highest positive score wins (earliest in document order on ties),
 *     provided it is above 20.
 *
 * "Valid content" means the element exists, has at least 100 characters of
 * trimmed text and does not look like navigation.
 *
 * @param {Document} document - Document exposing `querySelector` and
 *   `querySelectorAll`.
 * @returns {Element|null} The chosen element, or `null` when nothing qualifies.
 */
function findMainContent(document) {
  const isValidContent = (node) => {
    if (!node) {
      return false;
    }
    const trimmed = (node.textContent || "").trim();
    return trimmed.length >= 100 && !looksLikeNavigation(node);
  };
  const isStrongMatch = (node) => isValidContent(node) && getLinkDensity(node) < 0.4;

  // 1. Semantic landmarks.
  for (const landmark of ["main", '[role="main"]']) {
    const node = document.querySelector(landmark);
    if (isStrongMatch(node)) {
      return node;
    }
  }

  // 2. A lone <article>.
  const articles = document.querySelectorAll("article");
  if (articles.length === 1 && isValidContent(articles[0])) {
    return articles[0];
  }

  // 3. Conventional content containers.
  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content"
  ];
  for (const selector of contentSelectors) {
    try {
      const node = document.querySelector(selector);
      if (isStrongMatch(node)) {
        return node;
      }
    } catch {
      // Deliberately ignored: try the next selector.
    }
  }

  // 4. Score generic containers and keep the first one with the top score.
  let best = null;
  document.querySelectorAll("div, section, article").forEach((node) => {
    if ((node.textContent || "").trim().length < 200) {
      return;
    }
    const score = getContentScore(node);
    if (score > 0 && (best === null || score > best.score)) {
      best = { el: node, score };
    }
  });

  return best !== null && best.score > 20 ? best.el : null;
}
484
+ function cleanHtml(html, baseUrl, options = {}) {
485
+ const {
486
+ removeAds = true,
487
+ removeBase64Images = true,
488
+ onlyMainContent = true,
489
+ includeTags,
490
+ excludeTags
491
+ } = options;
492
+ const { document } = parseHTML(html);
493
+ removeElements(document, ALWAYS_REMOVE_SELECTORS);
494
+ removeElements(document, OVERLAY_SELECTORS);
800
495
  if (removeAds) {
801
- for (const selector of AD_SELECTORS) {
496
+ removeElements(document, AD_SELECTORS);
497
+ }
498
+ if (excludeTags && excludeTags.length > 0) {
499
+ removeElements(document, excludeTags);
500
+ }
501
+ if (onlyMainContent) {
502
+ removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
503
+ const mainContent = findMainContent(document);
504
+ if (mainContent) {
505
+ const body = document.body;
506
+ if (body) {
507
+ const clone = mainContent.cloneNode(true);
508
+ body.innerHTML = "";
509
+ body.appendChild(clone);
510
+ }
511
+ }
512
+ }
513
+ if (includeTags && includeTags.length > 0) {
514
+ const matchedElements = [];
515
+ for (const selector of includeTags) {
802
516
  try {
803
- document.querySelectorAll(selector).forEach((el) => el.remove());
517
+ document.querySelectorAll(selector).forEach((el) => {
518
+ matchedElements.push(el.cloneNode(true));
519
+ });
804
520
  } catch {
805
521
  }
806
522
  }
523
+ if (matchedElements.length > 0) {
524
+ const body = document.body;
525
+ if (body) {
526
+ body.innerHTML = "";
527
+ matchedElements.forEach((el) => body.appendChild(el));
528
+ }
529
+ }
807
530
  }
808
531
  if (removeBase64Images) {
809
532
  removeBase64ImagesFromDocument(document);
@@ -828,7 +551,10 @@ function removeBase64ImagesFromDocument(document) {
828
551
  document.querySelectorAll("[style*='data:image']").forEach((el) => {
829
552
  const style = el.getAttribute("style");
830
553
  if (style) {
831
- const cleanedStyle = style.replace(/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi, "");
554
+ const cleanedStyle = style.replace(
555
+ /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
556
+ ""
557
+ );
832
558
  if (cleanedStyle.trim()) {
833
559
  el.setAttribute("style", cleanedStyle);
834
560
  } else {
@@ -865,7 +591,7 @@ function cleanContent(html, baseUrl, options = {}) {
865
591
  }
866
592
 
867
593
  // src/utils/metadata-extractor.ts
868
- import { parseHTML as parseHTML3 } from "linkedom";
594
+ import { parseHTML as parseHTML2 } from "linkedom";
869
595
 
870
596
  // src/utils/url-helpers.ts
871
597
  import { URL as URL2 } from "url";
@@ -938,8 +664,26 @@ function isSameDomain(url, baseUrl) {
938
664
  function getUrlKey(url) {
939
665
  try {
940
666
  const parsedUrl = new URL2(url);
667
+ parsedUrl.hash = "";
941
668
  parsedUrl.search = "";
942
- return parsedUrl.toString().toLowerCase();
669
+ if (parsedUrl.hostname.startsWith("www.")) {
670
+ parsedUrl.hostname = parsedUrl.hostname.slice(4);
671
+ }
672
+ if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
673
+ parsedUrl.port = "";
674
+ }
675
+ const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
676
+ for (const indexFile of indexFiles) {
677
+ if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
678
+ parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
679
+ break;
680
+ }
681
+ }
682
+ let normalized = parsedUrl.toString().toLowerCase();
683
+ if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
684
+ normalized = normalized.slice(0, -1);
685
+ }
686
+ return normalized;
943
687
  } catch {
944
688
  return url.toLowerCase();
945
689
  }
@@ -1006,7 +750,7 @@ function extractMetadata(html, baseUrl) {
1006
750
  return extractWebsiteMetadata(html, baseUrl);
1007
751
  }
1008
752
  function extractWebsiteMetadata(html, baseUrl) {
1009
- const { document } = parseHTML3(html);
753
+ const { document } = parseHTML2(html);
1010
754
  const metadata = {
1011
755
  title: null,
1012
756
  description: null,
@@ -1161,11 +905,20 @@ function extractTwitterCard(document) {
1161
905
 
1162
906
  // src/utils/logger.ts
1163
907
  import pino from "pino";
908
+ function hasPinoPretty() {
909
+ try {
910
+ __require.resolve("pino-pretty");
911
+ return true;
912
+ } catch {
913
+ return false;
914
+ }
915
+ }
1164
916
  function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
917
+ const usePretty = process.env.NODE_ENV !== "production" && hasPinoPretty();
1165
918
  return pino({
1166
919
  name,
1167
920
  level,
1168
- transport: process.env.NODE_ENV !== "production" ? {
921
+ transport: usePretty ? {
1169
922
  target: "pino-pretty",
1170
923
  options: {
1171
924
  colorize: true,
@@ -1279,13 +1032,15 @@ function isUrlAllowed(url, rules) {
1279
1032
  var DEFAULT_OPTIONS = {
1280
1033
  urls: [],
1281
1034
  formats: ["markdown"],
1282
- includeMetadata: true,
1283
1035
  timeoutMs: 3e4,
1284
1036
  includePatterns: [],
1285
1037
  excludePatterns: [],
1286
1038
  // Content cleaning defaults
1287
1039
  removeAds: true,
1288
1040
  removeBase64Images: true,
1041
+ onlyMainContent: true,
1042
+ includeTags: [],
1043
+ excludeTags: [],
1289
1044
  skipTLSVerification: true,
1290
1045
  // Batch defaults
1291
1046
  batchConcurrency: 1,
@@ -1442,14 +1197,9 @@ var Scraper = class {
1442
1197
  } catch {
1443
1198
  }
1444
1199
  await hero.waitForPaintingStable();
1445
- let hadChallenge = false;
1446
- let challengeType = "none";
1447
- let waitTimeMs = 0;
1448
1200
  const initialUrl = await hero.url;
1449
1201
  const detection = await detectChallenge(hero);
1450
1202
  if (detection.isChallenge) {
1451
- hadChallenge = true;
1452
- challengeType = detection.type;
1453
1203
  if (this.options.verbose) {
1454
1204
  this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1455
1205
  }
@@ -1459,12 +1209,11 @@ var Scraper = class {
1459
1209
  verbose: this.options.verbose,
1460
1210
  initialUrl
1461
1211
  });
1462
- waitTimeMs = result2.waitedMs;
1463
1212
  if (!result2.resolved) {
1464
1213
  throw new Error(`Challenge not resolved: ${detection.type}`);
1465
1214
  }
1466
1215
  if (this.options.verbose) {
1467
- this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
1216
+ this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1468
1217
  }
1469
1218
  }
1470
1219
  await this.waitForFinalPage(hero, url, this.options.verbose);
@@ -1477,45 +1226,18 @@ var Scraper = class {
1477
1226
  this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1478
1227
  }
1479
1228
  }
1480
- const pageTitle = await hero.document.title;
1481
1229
  const html = await hero.document.documentElement.outerHTML;
1482
1230
  const cleanedHtml = cleanContent(html, url, {
1483
1231
  removeAds: this.options.removeAds,
1484
- removeBase64Images: this.options.removeBase64Images
1232
+ removeBase64Images: this.options.removeBase64Images,
1233
+ onlyMainContent: this.options.onlyMainContent,
1234
+ includeTags: this.options.includeTags,
1235
+ excludeTags: this.options.excludeTags
1485
1236
  });
1486
1237
  const websiteMetadata = extractMetadata(cleanedHtml, url);
1487
1238
  const duration = Date.now() - startTime;
1488
- const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
1489
- const page = {
1490
- url,
1491
- title: pageTitle,
1492
- markdown: "",
1493
- // Will be set by formatter
1494
- html: cleanedHtml,
1495
- fetchedAt: scrapedAt,
1496
- depth: 0,
1497
- hadChallenge,
1498
- challengeType,
1499
- waitTimeMs
1500
- };
1501
- const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
1502
- [page],
1503
- url,
1504
- scrapedAt,
1505
- duration,
1506
- websiteMetadata,
1507
- this.options.includeMetadata
1508
- ) : void 0;
1509
- const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1510
- const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1511
- const text = this.options.formats.includes("text") ? formatToText(
1512
- [page],
1513
- url,
1514
- scrapedAt,
1515
- duration,
1516
- websiteMetadata,
1517
- this.options.includeMetadata
1518
- ) : void 0;
1239
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1240
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1519
1241
  if (this.options.onProgress) {
1520
1242
  this.options.onProgress({
1521
1243
  completed: index + 1,
@@ -1547,8 +1269,6 @@ var Scraper = class {
1547
1269
  const result = {
1548
1270
  markdown,
1549
1271
  html: htmlOutput,
1550
- json,
1551
- text,
1552
1272
  metadata: {
1553
1273
  baseUrl: url,
1554
1274
  totalPages: 1,
@@ -1603,7 +1323,7 @@ async function scrape(options) {
1603
1323
  }
1604
1324
 
1605
1325
  // src/crawler.ts
1606
- import { parseHTML as parseHTML4 } from "linkedom";
1326
+ import { parseHTML as parseHTML3 } from "linkedom";
1607
1327
 
1608
1328
  // src/utils/rate-limiter.ts
1609
1329
  import pLimit2 from "p-limit";
@@ -1752,12 +1472,26 @@ var Crawler = class {
1752
1472
  */
1753
1473
  extractLinks(html, baseUrl, depth) {
1754
1474
  const links = [];
1755
- const { document } = parseHTML4(html);
1475
+ const { document } = parseHTML3(html);
1756
1476
  document.querySelectorAll("a[href]").forEach((anchor) => {
1757
- const href = anchor.getAttribute("href");
1477
+ const rawHref = anchor.getAttribute("href");
1478
+ if (!rawHref) return;
1479
+ const href = rawHref.trim();
1758
1480
  if (!href) return;
1759
- const resolved = resolveUrl(href, baseUrl);
1481
+ if (href.startsWith("#")) return;
1482
+ const lowerHref = href.toLowerCase();
1483
+ if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
1484
+ return;
1485
+ }
1486
+ let resolved = resolveUrl(href, baseUrl);
1760
1487
  if (!resolved || !isValidUrl(resolved)) return;
1488
+ try {
1489
+ const parsed = new URL(resolved);
1490
+ parsed.hash = "";
1491
+ resolved = parsed.toString();
1492
+ } catch {
1493
+ return;
1494
+ }
1761
1495
  if (!isSameDomain(resolved, this.options.url)) return;
1762
1496
  if (!isContentUrl(resolved)) return;
1763
1497
  if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
@@ -2868,9 +2602,9 @@ program.command("status").description("Check daemon status").option("-p, --port
2868
2602
  });
2869
2603
  program.command("scrape <urls...>").description("Scrape one or more URLs").option(
2870
2604
  "-f, --format <formats>",
2871
- "Output formats (comma-separated: markdown,html,json,text)",
2605
+ "Content formats to include (comma-separated: markdown,html)",
2872
2606
  "markdown"
2873
- ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--no-metadata", "Exclude metadata from output").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (urls, options) => {
2607
+ ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
2874
2608
  const port = parseInt(options.port, 10);
2875
2609
  const useStandalone = options.standalone || false;
2876
2610
  let useDaemon = false;
@@ -2887,7 +2621,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2887
2621
  }) : null;
2888
2622
  try {
2889
2623
  const formats = options.format.split(",").map((f) => f.trim());
2890
- const validFormats = ["markdown", "html", "json", "text"];
2624
+ const validFormats = ["markdown", "html"];
2891
2625
  for (const format of formats) {
2892
2626
  if (!validFormats.includes(format)) {
2893
2627
  console.error(
@@ -2900,6 +2634,8 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2900
2634
  console.error(`Scraping ${urls.length} URL(s)...`);
2901
2635
  console.error(`Formats: ${formats.join(", ")}`);
2902
2636
  }
2637
+ const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
2638
+ const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
2903
2639
  const scrapeOptions = {
2904
2640
  urls,
2905
2641
  formats,
@@ -2908,33 +2644,26 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2908
2644
  batchTimeoutMs: parseInt(options.batchTimeout, 10),
2909
2645
  proxy: options.proxy ? { url: options.proxy } : void 0,
2910
2646
  userAgent: options.userAgent,
2911
- includeMetadata: options.metadata !== false,
2912
2647
  verbose: options.verbose || false,
2913
2648
  showChrome: options.showChrome || false,
2649
+ // Content cleaning options
2650
+ onlyMainContent: options.mainContent !== false,
2651
+ // --no-main-content sets this to false
2652
+ includeTags,
2653
+ excludeTags,
2914
2654
  onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
2915
2655
  console.error(`[${completed}/${total}] ${currentUrl}`);
2916
2656
  } : void 0
2917
2657
  };
2918
2658
  const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
2919
- let output = "";
2920
- for (const site of result.data) {
2921
- if (formats.includes("markdown") && site.markdown) {
2922
- output += site.markdown + "\n\n";
2923
- } else if (formats.includes("text") && site.text) {
2924
- output += site.text + "\n\n";
2925
- } else if (formats.includes("html") && site.html) {
2926
- output += site.html + "\n\n";
2927
- } else if (formats.includes("json") && site.json) {
2928
- output += site.json + "\n\n";
2929
- }
2930
- }
2659
+ const output = JSON.stringify(result, null, 2);
2931
2660
  if (options.output) {
2932
- writeFileSync(options.output, output.trim());
2661
+ writeFileSync(options.output, output);
2933
2662
  if (options.verbose) {
2934
2663
  console.error(`Output written to ${options.output}`);
2935
2664
  }
2936
2665
  } else {
2937
- console.log(output.trim());
2666
+ console.log(output);
2938
2667
  }
2939
2668
  if (options.verbose) {
2940
2669
  console.error(`
@@ -2957,7 +2686,7 @@ Summary:`);
2957
2686
  }
2958
2687
  }
2959
2688
  });
2960
- program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Output formats when scraping (comma-separated)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
2689
+ program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages").option("-d, --depth <n>", "Maximum crawl depth", "1").option("-m, --max-pages <n>", "Maximum pages to discover", "20").option("-s, --scrape", "Also scrape content of discovered pages").option("-f, --format <formats>", "Content formats when scraping (comma-separated: markdown,html)", "markdown").option("-o, --output <file>", "Output file (stdout if omitted)").option("--delay <ms>", "Delay between requests in milliseconds", "1000").option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds").option("--include <patterns>", "URL patterns to include (comma-separated regex)").option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (url, options) => {
2961
2690
  const port = parseInt(options.port, 10);
2962
2691
  const useStandalone = options.standalone || false;
2963
2692
  let useDaemon = false;
@@ -2993,38 +2722,20 @@ program.command("crawl <url>").description("Crawl a website to discover and opti
2993
2722
  verbose: options.verbose || false,
2994
2723
  showChrome: options.showChrome || false
2995
2724
  };
2996
- const result = useDaemon ? await daemonClient.crawl(crawlOptions) : await standaloneClient.crawl(crawlOptions);
2997
- let output = "";
2998
- if (options.scrape && result.scraped) {
2999
- const formats = options.format.split(",").map((f) => f.trim());
3000
- for (const site of result.scraped.data) {
3001
- if (formats.includes("markdown") && site.markdown) {
3002
- output += site.markdown + "\n\n";
3003
- } else if (formats.includes("text") && site.text) {
3004
- output += site.text + "\n\n";
3005
- } else if (formats.includes("html") && site.html) {
3006
- output += site.html + "\n\n";
3007
- } else if (formats.includes("json") && site.json) {
3008
- output += site.json + "\n\n";
3009
- }
3010
- }
3011
- } else {
3012
- output = JSON.stringify(
3013
- {
3014
- urls: result.urls,
3015
- metadata: result.metadata
3016
- },
3017
- null,
3018
- 2
3019
- );
3020
- }
2725
+ const formats = options.format.split(",").map((f) => f.trim());
2726
+ const crawlOptionsWithFormats = {
2727
+ ...crawlOptions,
2728
+ formats
2729
+ };
2730
+ const result = useDaemon ? await daemonClient.crawl(crawlOptionsWithFormats) : await standaloneClient.crawl(crawlOptionsWithFormats);
2731
+ const output = JSON.stringify(result, null, 2);
3021
2732
  if (options.output) {
3022
- writeFileSync(options.output, output.trim());
2733
+ writeFileSync(options.output, output);
3023
2734
  if (options.verbose) {
3024
2735
  console.error(`Output written to ${options.output}`);
3025
2736
  }
3026
2737
  } else {
3027
- console.log(output.trim());
2738
+ console.log(output);
3028
2739
  }
3029
2740
  if (options.verbose) {
3030
2741
  console.error(`