@vakra-dev/reader 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -14,27 +14,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
14
14
  import pLimit from "p-limit";
15
15
 
16
16
  // src/cloudflare/detector.ts
17
- var CHALLENGE_DOM_SELECTORS = [
17
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
18
18
  "#challenge-running",
19
19
  "#challenge-stage",
20
20
  "#challenge-form",
21
- ".cf-browser-verification"
21
+ ".cf-browser-verification",
22
+ "#cf-wrapper",
23
+ "#cf-hcaptcha-container",
24
+ "#turnstile-wrapper"
22
25
  ];
23
- var CHALLENGE_TEXT_PATTERNS = [
24
- "verifying you are human",
26
+ var CLOUDFLARE_TEXT_PATTERNS = [
25
27
  "checking if the site connection is secure",
26
- "this process is automatic. your browser will redirect"
28
+ "this process is automatic. your browser will redirect",
29
+ "ray id:",
30
+ "performance & security by cloudflare"
27
31
  ];
28
- var BLOCKED_SIGNALS = [
29
- "you have been blocked",
30
- "access to this page has been denied",
32
+ var CLOUDFLARE_INFRA_PATTERNS = [
33
+ "/cdn-cgi/",
34
+ "cloudflare",
35
+ "__cf_bm",
36
+ "cf-ray"
37
+ ];
38
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
31
39
  "sorry, you have been blocked",
32
- "access denied",
33
- "403 forbidden"
40
+ "ray id:"
34
41
  ];
35
42
  async function detectChallenge(hero) {
36
43
  const signals = [];
37
44
  let type = "none";
45
+ let hasCloudflareInfra = false;
46
+ let hasChallengeIndicator = false;
38
47
  try {
39
48
  if (!hero.document) {
40
49
  return {
@@ -46,30 +55,51 @@ async function detectChallenge(hero) {
46
55
  }
47
56
  const html = await hero.document.documentElement.outerHTML;
48
57
  const htmlLower = html.toLowerCase();
49
- for (const selector of CHALLENGE_DOM_SELECTORS) {
50
- if (htmlLower.includes(selector.toLowerCase())) {
51
- signals.push(`Challenge element: ${selector}`);
52
- type = "js_challenge";
58
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
59
+ if (htmlLower.includes(pattern)) {
60
+ hasCloudflareInfra = true;
61
+ signals.push(`Cloudflare infra: "${pattern}"`);
62
+ break;
63
+ }
64
+ }
65
+ if (!hasCloudflareInfra) {
66
+ return {
67
+ isChallenge: false,
68
+ type: "none",
69
+ confidence: 0,
70
+ signals: ["No Cloudflare infrastructure detected"]
71
+ };
72
+ }
73
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
74
+ try {
75
+ const element = await hero.document.querySelector(selector);
76
+ if (element) {
77
+ hasChallengeIndicator = true;
78
+ signals.push(`Challenge element: ${selector}`);
79
+ type = "js_challenge";
80
+ }
81
+ } catch {
53
82
  }
54
83
  }
55
- for (const pattern of CHALLENGE_TEXT_PATTERNS) {
84
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
56
85
  if (htmlLower.includes(pattern)) {
86
+ hasChallengeIndicator = true;
57
87
  signals.push(`Challenge text: "${pattern}"`);
58
88
  type = type === "none" ? "js_challenge" : type;
59
89
  }
60
90
  }
61
91
  if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
92
+ hasChallengeIndicator = true;
62
93
  signals.push('Challenge text: "waiting for...to respond"');
63
94
  type = type === "none" ? "js_challenge" : type;
64
95
  }
65
- for (const pattern of BLOCKED_SIGNALS) {
66
- if (htmlLower.includes(pattern)) {
67
- signals.push(`Blocked: "${pattern}"`);
68
- type = "blocked";
69
- break;
70
- }
96
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
97
+ if (hasBlocked) {
98
+ hasChallengeIndicator = true;
99
+ signals.push("Cloudflare block page detected");
100
+ type = "blocked";
71
101
  }
72
- const isChallenge = signals.length > 0;
102
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
73
103
  const confidence = isChallenge ? 100 : 0;
74
104
  return {
75
105
  isChallenge,
@@ -186,84 +216,6 @@ var turndownService = new TurndownService({
186
216
  linkStyle: "inlined",
187
217
  linkReferenceStyle: "full"
188
218
  });
189
- function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
190
- const sections = [];
191
- if (includeMetadata) {
192
- sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
193
- }
194
- if (pages.length > 1) {
195
- sections.push(createMarkdownTOC(pages));
196
- }
197
- sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
198
- return sections.join("\n\n");
199
- }
200
- function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
201
- const title = website.title || extractDomainFromUrl(baseUrl);
202
- const description = website.description || "";
203
- let header = `# Website Scrape: ${title}
204
-
205
- `;
206
- header += `**Base URL:** ${baseUrl}
207
- `;
208
- header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
209
- `;
210
- header += `**Duration:** ${duration}ms
211
- `;
212
- header += `**Total pages:** ${totalPages}
213
- `;
214
- if (description) {
215
- header += `**Description:** ${description}
216
- `;
217
- }
218
- if (website.author) {
219
- header += `**Author:** ${website.author}
220
- `;
221
- }
222
- if (website.language) {
223
- header += `**Language:** ${website.language}
224
- `;
225
- }
226
- return header;
227
- }
228
- function createMarkdownTOC(pages) {
229
- let toc = "## Table of Contents\n\n";
230
- pages.forEach((page, index) => {
231
- const depth = " ".repeat(page.depth);
232
- const pageNumber = index + 1;
233
- const title = page.title || `Page ${pageNumber}`;
234
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
235
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
236
- toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
237
- `;
238
- });
239
- return toc;
240
- }
241
- function createMarkdownPage(page, pageNumber) {
242
- const title = page.title || `Page ${pageNumber}`;
243
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
244
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
245
- let pageContent = `---
246
-
247
- `;
248
- pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
249
-
250
- `;
251
- pageContent += `**URL:** ${page.url}
252
- `;
253
- pageContent += `**Title:** ${page.title}
254
- `;
255
- pageContent += `**Depth:** ${page.depth}
256
- `;
257
- pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
258
-
259
- `;
260
- pageContent += `---
261
-
262
- `;
263
- const markdown = htmlToMarkdown(page.html);
264
- pageContent += markdown;
265
- return pageContent;
266
- }
267
219
  function htmlToMarkdown(html) {
268
220
  try {
269
221
  return turndownService.turndown(html);
@@ -272,596 +224,340 @@ function htmlToMarkdown(html) {
272
224
  return html.replace(/<[^>]*>/g, "").trim();
273
225
  }
274
226
  }
275
- function extractDomainFromUrl(url) {
276
- try {
277
- return new URL(url).hostname;
278
- } catch {
279
- return "Unknown";
280
- }
281
- }
282
-
283
- // src/formatters/html.ts
284
- function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
285
- const html = `<!DOCTYPE html>
286
- <html lang="${website.language || "en"}">
287
- <head>
288
- <meta charset="${website.charset || "UTF-8"}">
289
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
290
- <title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
291
- ${generateMetaTags(website)}
292
- <style>
293
- ${generateCSS()}
294
- </style>
295
- </head>
296
- <body>
297
- <header class="header">
298
- <h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
299
- <div class="meta-info">
300
- <p><strong>Base URL:</strong> <a href="${escapeHtml(
301
- baseUrl
302
- )}" target="_blank">${escapeHtml(baseUrl)}</a></p>
303
- <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
304
- <p><strong>Duration:</strong> ${duration}ms</p>
305
- <p><strong>Total pages:</strong> ${pages.length}</p>
306
- ${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
307
- ${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
308
- ${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
309
- </div>
310
- </header>
311
-
312
- ${pages.length > 1 ? generateTOC(pages) : ""}
313
-
314
- <main class="content">
315
- ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
316
- </main>
317
-
318
- <footer class="footer">
319
- <p>Generated by Reader JS/TS SDK</p>
320
- </footer>
321
-
322
- <script>
323
- ${generateJavaScript()}
324
- </script>
325
- </body>
326
- </html>`;
327
- return html;
328
- }
329
- function generateMetaTags(website) {
330
- const tags = [];
331
- if (website.description) {
332
- tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
333
- }
334
- if (website.author) {
335
- tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
336
- }
337
- if (website.keywords) {
338
- tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
339
- }
340
- if (website.robots) {
341
- tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
342
- }
343
- if (website.themeColor) {
344
- tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
345
- }
346
- if (website.favicon) {
347
- tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
348
- }
349
- if (website.canonical) {
350
- tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
351
- }
352
- if (website.openGraph) {
353
- const og = website.openGraph;
354
- if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
355
- if (og.description)
356
- tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
357
- if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
358
- if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
359
- if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
360
- if (og.siteName)
361
- tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
362
- if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
363
- }
364
- if (website.twitter) {
365
- const twitter = website.twitter;
366
- if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
367
- if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
368
- if (twitter.creator)
369
- tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
370
- if (twitter.title)
371
- tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
372
- if (twitter.description)
373
- tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
374
- if (twitter.image)
375
- tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
376
- }
377
- return tags.join("\n ");
378
- }
379
- function generateCSS() {
380
- return `
381
- * {
382
- margin: 0;
383
- padding: 0;
384
- box-sizing: border-box;
385
- }
386
-
387
- body {
388
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
389
- line-height: 1.6;
390
- color: #333;
391
- background-color: #f8f9fa;
392
- }
393
-
394
- .header {
395
- background: white;
396
- padding: 2rem;
397
- border-bottom: 1px solid #e9ecef;
398
- margin-bottom: 2rem;
399
- }
400
-
401
- .header h1 {
402
- color: #2c3e50;
403
- margin-bottom: 1rem;
404
- font-size: 2rem;
405
- }
406
-
407
- .meta-info {
408
- display: grid;
409
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
410
- gap: 0.5rem;
411
- }
412
-
413
- .meta-info p {
414
- margin: 0.25rem 0;
415
- font-size: 0.9rem;
416
- color: #6c757d;
417
- }
418
-
419
- .toc {
420
- background: white;
421
- padding: 1.5rem;
422
- margin: 2rem 0;
423
- border-radius: 8px;
424
- border: 1px solid #e9ecef;
425
- }
426
-
427
- .toc h2 {
428
- color: #2c3e50;
429
- margin-bottom: 1rem;
430
- font-size: 1.25rem;
431
- }
432
-
433
- .toc ul {
434
- list-style: none;
435
- }
436
-
437
- .toc li {
438
- margin: 0.5rem 0;
439
- }
440
-
441
- .toc a {
442
- color: #007bff;
443
- text-decoration: none;
444
- transition: color 0.2s;
445
- }
446
-
447
- .toc a:hover {
448
- color: #0056b3;
449
- text-decoration: underline;
450
- }
451
-
452
- .content {
453
- max-width: 800px;
454
- margin: 0 auto;
455
- padding: 0 1rem;
456
- }
457
-
458
- .page {
459
- background: white;
460
- margin: 2rem 0;
461
- padding: 2rem;
462
- border-radius: 8px;
463
- border: 1px solid #e9ecef;
464
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
465
- }
466
-
467
- .page-header {
468
- border-bottom: 2px solid #e9ecef;
469
- padding-bottom: 1rem;
470
- margin-bottom: 2rem;
471
- }
472
-
473
- .page-header h2 {
474
- color: #2c3e50;
475
- margin-bottom: 0.5rem;
476
- font-size: 1.5rem;
477
- }
478
-
479
- .page-meta {
480
- display: flex;
481
- flex-wrap: wrap;
482
- gap: 1rem;
483
- font-size: 0.9rem;
484
- color: #6c757d;
485
- }
486
-
487
- .page-content {
488
- line-height: 1.8;
489
- }
490
-
491
- .page-content h1, .page-content h2, .page-content h3,
492
- .page-content h4, .page-content h5, .page-content h6 {
493
- color: #2c3e50;
494
- margin: 1.5rem 0 0.5rem 0;
495
- }
496
-
497
- .page-content p {
498
- margin: 1rem 0;
499
- }
500
-
501
- .page-content a {
502
- color: #007bff;
503
- text-decoration: none;
504
- }
505
-
506
- .page-content a:hover {
507
- text-decoration: underline;
508
- }
509
-
510
- .page-content code {
511
- background: #f8f9fa;
512
- padding: 0.2rem 0.4rem;
513
- border-radius: 4px;
514
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
515
- font-size: 0.9em;
516
- }
517
-
518
- .page-content pre {
519
- background: #f8f9fa;
520
- padding: 1rem;
521
- border-radius: 4px;
522
- overflow-x: auto;
523
- margin: 1rem 0;
524
- }
525
-
526
- .page-content blockquote {
527
- border-left: 4px solid #007bff;
528
- padding-left: 1rem;
529
- margin: 1rem 0;
530
- color: #6c757d;
531
- }
532
-
533
- .footer {
534
- text-align: center;
535
- padding: 2rem;
536
- margin-top: 3rem;
537
- border-top: 1px solid #e9ecef;
538
- color: #6c757d;
539
- font-size: 0.9rem;
540
- }
541
-
542
- @media (max-width: 768px) {
543
- .header {
544
- padding: 1rem;
545
- }
546
-
547
- .header h1 {
548
- font-size: 1.5rem;
549
- }
550
-
551
- .page {
552
- padding: 1rem;
553
- }
554
-
555
- .page-meta {
556
- flex-direction: column;
557
- gap: 0.5rem;
558
- }
559
- }
560
- `.trim();
561
- }
562
- function generateTOC(pages) {
563
- const tocItems = pages.map((page, index) => {
564
- const pageNumber = index + 1;
565
- const title = page.title || `Page ${pageNumber}`;
566
- const id = `page-${pageNumber}`;
567
- return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
568
- }).join("\n");
569
- return `
570
- <nav class="toc">
571
- <h2>Table of Contents</h2>
572
- <ul>
573
- ${tocItems}
574
- </ul>
575
- </nav>`;
576
- }
577
- function generatePageHTML(page, pageNumber) {
578
- const id = `page-${pageNumber}`;
579
- const title = page.title || `Page ${pageNumber}`;
580
- return `
581
- <article class="page" id="${id}">
582
- <div class="page-header">
583
- <h2>${pageNumber}. ${escapeHtml(title)}</h2>
584
- <div class="page-meta">
585
- <span><strong>URL:</strong> <a href="${escapeHtml(
586
- page.url
587
- )}" target="_blank">${escapeHtml(page.url)}</a></span>
588
- <span><strong>Depth:</strong> ${page.depth}</span>
589
- <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
590
- </div>
591
- </div>
592
- <div class="page-content">
593
- ${page.html}
594
- </div>
595
- </article>`;
596
- }
597
- function generateJavaScript() {
598
- return `
599
- // Smooth scrolling for TOC links
600
- document.querySelectorAll('a[href^="#"]').forEach(anchor => {
601
- anchor.addEventListener('click', function (e) {
602
- e.preventDefault();
603
- const target = document.querySelector(this.getAttribute('href'));
604
- if (target) {
605
- target.scrollIntoView({
606
- behavior: 'smooth',
607
- block: 'start'
608
- });
609
- }
610
- });
611
- });
612
-
613
- // Highlight current section in TOC
614
- window.addEventListener('scroll', function() {
615
- const pages = document.querySelectorAll('.page');
616
- const tocLinks = document.querySelectorAll('.toc a');
617
-
618
- let currentPage = null;
619
- pages.forEach(page => {
620
- const rect = page.getBoundingClientRect();
621
- if (rect.top <= 100) {
622
- currentPage = page;
623
- }
624
- });
625
-
626
- tocLinks.forEach(link => {
627
- link.style.fontWeight = 'normal';
628
- const target = document.querySelector(link.getAttribute('href'));
629
- if (target === currentPage) {
630
- link.style.fontWeight = 'bold';
631
- }
632
- });
633
- });
634
- `;
635
- }
636
- function escapeHtml(text) {
637
- return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\//g, "&#x2F;");
638
- }
639
- function extractDomainFromUrl2(url) {
640
- try {
641
- return new URL(url).hostname;
642
- } catch {
643
- return "Unknown";
644
- }
645
- }
646
-
647
- // src/formatters/json.ts
648
- function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
649
- const jsonResult = {
650
- metadata: {
651
- baseUrl,
652
- totalPages: pages.length,
653
- scrapedAt,
654
- duration,
655
- website
656
- },
657
- pages: pages.map((page, index) => ({
658
- index: index + 1,
659
- url: page.url,
660
- title: page.title,
661
- markdown: page.markdown,
662
- html: page.html,
663
- fetchedAt: page.fetchedAt,
664
- depth: page.depth,
665
- wordCount: countWords(page.markdown),
666
- readingTime: estimateReadingTime(page.markdown)
667
- }))
668
- };
669
- return JSON.stringify(jsonResult, null, 2);
670
- }
671
- function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
672
- const jsonResult = {
673
- metadata: {
674
- baseUrl,
675
- totalPages: pages.length,
676
- scrapedAt,
677
- duration,
678
- website
679
- },
680
- pages: pages.map((page, index) => ({
681
- index: index + 1,
682
- url: page.url,
683
- title: page.title,
684
- markdown: page.markdown,
685
- fetchedAt: page.fetchedAt,
686
- depth: page.depth,
687
- wordCount: countWords(page.markdown),
688
- readingTime: estimateReadingTime(page.markdown)
689
- }))
690
- };
691
- return JSON.stringify(jsonResult, null, 2);
692
- }
693
- function countWords(markdown) {
694
- const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
695
- return plainText.split(/\s+/).filter((word) => word.length > 0).length;
696
- }
697
- function estimateReadingTime(markdown) {
698
- const wordCount = countWords(markdown);
699
- return Math.ceil(wordCount / 200);
700
- }
701
-
702
- // src/formatters/text.ts
703
- import { parseHTML } from "linkedom";
704
- function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
705
- const sections = [];
706
- if (includeMetadata) {
707
- sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
708
- }
709
- sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
710
- return sections.join("\n\n");
711
- }
712
- function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
713
- const title = website.title || extractDomainFromUrl3(baseUrl);
714
- const lines = [];
715
- lines.push(`=== ${title} ===`);
716
- lines.push("");
717
- lines.push(`URL: ${baseUrl}`);
718
- lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
719
- lines.push(`Duration: ${duration}ms`);
720
- lines.push(`Pages: ${totalPages}`);
721
- if (website.description) {
722
- lines.push(`Description: ${website.description}`);
723
- }
724
- if (website.author) {
725
- lines.push(`Author: ${website.author}`);
726
- }
727
- if (website.language) {
728
- lines.push(`Language: ${website.language}`);
729
- }
730
- return lines.join("\n");
731
- }
732
- function createTextPage(page, pageNumber, showSeparator) {
733
- const lines = [];
734
- if (showSeparator) {
735
- lines.push("\u2500".repeat(60));
736
- lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
737
- lines.push(`URL: ${page.url}`);
738
- lines.push("\u2500".repeat(60));
739
- }
740
- const plainText = htmlToPlainText(page.html);
741
- lines.push(plainText);
742
- return lines.join("\n");
743
- }
744
- function htmlToPlainText(html) {
745
- const { document } = parseHTML(html);
746
- const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
747
- elementsToRemove.forEach((tag) => {
748
- document.querySelectorAll(tag).forEach((el) => el.remove());
749
- });
750
- let text = document.body?.textContent || document.documentElement?.textContent || "";
751
- text = text.replace(/[ \t]+/g, " ");
752
- text = text.replace(/\n[ \t]+/g, "\n");
753
- text = text.replace(/[ \t]+\n/g, "\n");
754
- text = text.replace(/\n{3,}/g, "\n\n");
755
- text = text.trim();
756
- return text;
757
- }
758
- function extractDomainFromUrl3(url) {
759
- try {
760
- return new URL(url).hostname;
761
- } catch {
762
- return "Unknown";
763
- }
764
- }
227
+ var formatToMarkdown = htmlToMarkdown;
765
228
 
766
229
  // src/utils/content-cleaner.ts
767
- import { parseHTML as parseHTML2 } from "linkedom";
230
+ import { parseHTML } from "linkedom";
768
231
  var ALWAYS_REMOVE_SELECTORS = [
769
- // Navigation and menus
770
- "nav",
771
- "header nav",
772
- "footer nav",
773
- ".nav",
774
- ".navigation",
775
- ".menu",
776
- ".navbar",
777
- ".sidebar",
778
- ".aside",
779
- // Header and footer elements
780
- "header",
781
- "footer",
782
- ".site-header",
783
- ".page-header",
784
- ".site-footer",
785
- ".page-footer",
786
- // Social media and sharing
787
- ".social",
788
- ".share",
789
- ".sharing",
790
- ".twitter",
791
- ".facebook",
792
- ".linkedin",
793
- ".instagram",
794
- // Comments and discussions
795
- ".comments",
796
- ".comment",
797
- ".discussion",
798
- ".disqus",
799
- // Forms and interactive elements
800
- "form",
801
- "input",
802
- "button:not([type='submit'])",
803
- "select",
804
- "textarea",
805
232
  // Scripts and styles
806
233
  "script",
807
234
  "style",
808
235
  "noscript",
236
+ "link[rel='stylesheet']",
809
237
  // Hidden elements
810
238
  "[hidden]",
239
+ "[aria-hidden='true']",
811
240
  "[style*='display: none']",
812
241
  "[style*='display:none']",
813
- // Common utility classes
814
- ".cookie",
815
- ".cookie-banner",
816
- ".popup",
242
+ "[style*='visibility: hidden']",
243
+ "[style*='visibility:hidden']",
244
+ // SVG icons and decorative elements
245
+ "svg[aria-hidden='true']",
246
+ "svg.icon",
247
+ "svg[class*='icon']",
248
+ // Template and metadata
249
+ "template",
250
+ "meta",
251
+ // Embeds that don't convert to text
252
+ "iframe",
253
+ "canvas",
254
+ "object",
255
+ "embed",
256
+ // Forms (usually not main content)
257
+ "form",
258
+ "input",
259
+ "select",
260
+ "textarea",
261
+ "button"
262
+ ];
263
+ var OVERLAY_SELECTORS = [
264
+ "[class*='modal']",
265
+ "[class*='popup']",
266
+ "[class*='overlay']",
267
+ "[class*='dialog']",
268
+ "[role='dialog']",
269
+ "[role='alertdialog']",
270
+ "[class*='cookie']",
271
+ "[class*='consent']",
272
+ "[class*='gdpr']",
273
+ "[class*='privacy-banner']",
274
+ "[class*='notification-bar']",
275
+ "[id*='cookie']",
276
+ "[id*='consent']",
277
+ "[id*='gdpr']",
278
+ // Fixed/sticky positioned elements
279
+ "[style*='position: fixed']",
280
+ "[style*='position:fixed']",
281
+ "[style*='position: sticky']",
282
+ "[style*='position:sticky']"
283
+ ];
284
+ var NAVIGATION_SELECTORS = [
285
+ // Semantic elements
286
+ "header",
287
+ "footer",
288
+ "nav",
289
+ "aside",
290
+ // Header variations
291
+ ".header",
292
+ ".top",
293
+ ".navbar",
294
+ "#header",
295
+ // Footer variations
296
+ ".footer",
297
+ ".bottom",
298
+ "#footer",
299
+ // Sidebars
300
+ ".sidebar",
301
+ ".side",
302
+ ".aside",
303
+ "#sidebar",
304
+ // Modals/popups (backup if not caught by OVERLAY_SELECTORS)
817
305
  ".modal",
306
+ ".popup",
307
+ "#modal",
818
308
  ".overlay",
819
- ".notification",
309
+ // Ads
310
+ ".ad",
311
+ ".ads",
312
+ ".advert",
313
+ "#ad",
314
+ // Language selectors
315
+ ".lang-selector",
316
+ ".language",
317
+ "#language-selector",
318
+ // Social
319
+ ".social",
320
+ ".social-media",
321
+ ".social-links",
322
+ "#social",
323
+ // Navigation/menus
324
+ ".menu",
325
+ ".navigation",
326
+ "#nav",
820
327
  // Breadcrumbs
821
- ".breadcrumb",
822
328
  ".breadcrumbs",
823
- ".breadcrumb-trail"
329
+ "#breadcrumbs",
330
+ // Share buttons
331
+ ".share",
332
+ "#share",
333
+ // Widgets
334
+ ".widget",
335
+ "#widget",
336
+ // Cookie notices (backup)
337
+ ".cookie",
338
+ "#cookie"
339
+ ];
340
+ var FORCE_INCLUDE_SELECTORS = [
341
+ // IDs
342
+ "#main",
343
+ "#content",
344
+ "#main-content",
345
+ "#article",
346
+ "#post",
347
+ "#page-content",
348
+ // Semantic elements
349
+ "main",
350
+ "article",
351
+ "[role='main']",
352
+ // Classes
353
+ ".main-content",
354
+ ".content",
355
+ ".post-content",
356
+ ".article-content",
357
+ ".entry-content",
358
+ ".page-content",
359
+ ".article-body",
360
+ ".post-body",
361
+ ".story-content",
362
+ ".blog-content"
824
363
  ];
825
364
  var AD_SELECTORS = [
826
- // Ads and promotions
827
- ".ad",
828
- ".ads",
829
- ".advertisement",
830
- ".promotion",
831
- ".sponsored",
832
- "[class*='ad-']",
833
- "[id*='ad-']",
834
- "[class*='advert']",
835
- "[id*='advert']",
836
- "[class*='banner']",
837
- "[id*='banner']",
365
+ // Google ads
366
+ "ins.adsbygoogle",
838
367
  ".google-ad",
839
368
  ".adsense",
369
+ // Generic ad containers
840
370
  "[data-ad]",
841
371
  "[data-ads]",
842
- "ins.adsbygoogle",
843
- // Tracking
844
- "[class*='tracking']",
845
- "[id*='tracking']",
846
- "[class*='analytics']",
847
- "[id*='analytics']"
372
+ "[data-ad-slot]",
373
+ "[data-ad-client]",
374
+ // Common ad class patterns
375
+ ".ad-container",
376
+ ".ad-wrapper",
377
+ ".advertisement",
378
+ ".sponsored-content",
379
+ // Tracking pixels
380
+ "img[width='1'][height='1']",
381
+ "img[src*='pixel']",
382
+ "img[src*='tracking']",
383
+ "img[src*='analytics']"
848
384
  ];
849
- function cleanHtml(html, baseUrl, options = {}) {
850
- const { removeAds = true, removeBase64Images = true } = options;
851
- const { document } = parseHTML2(html);
852
- for (const selector of ALWAYS_REMOVE_SELECTORS) {
385
+ function getLinkDensity(element) {
386
+ const text = element.textContent || "";
387
+ const textLength = text.trim().length;
388
+ if (textLength === 0) return 1;
389
+ let linkLength = 0;
390
+ element.querySelectorAll("a").forEach((link) => {
391
+ linkLength += (link.textContent || "").trim().length;
392
+ });
393
+ return linkLength / textLength;
394
+ }
395
+ function getContentScore(element) {
396
+ let score = 0;
397
+ const text = element.textContent || "";
398
+ const textLength = text.trim().length;
399
+ score += Math.min(textLength / 100, 50);
400
+ score += element.querySelectorAll("p").length * 3;
401
+ score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
402
+ score += element.querySelectorAll("img").length * 1;
403
+ score -= element.querySelectorAll("a").length * 0.5;
404
+ score -= element.querySelectorAll("li").length * 0.2;
405
+ const linkDensity = getLinkDensity(element);
406
+ if (linkDensity > 0.5) score -= 30;
407
+ else if (linkDensity > 0.3) score -= 15;
408
+ const classAndId = (element.className || "") + " " + (element.id || "");
409
+ if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
410
+ if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
411
+ return score;
412
+ }
413
+ function looksLikeNavigation(element) {
414
+ const linkDensity = getLinkDensity(element);
415
+ if (linkDensity > 0.5) return true;
416
+ const listItems = element.querySelectorAll("li");
417
+ const links = element.querySelectorAll("a");
418
+ if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
419
+ return false;
420
+ }
421
+ function removeElements(document, selectors) {
422
+ for (const selector of selectors) {
853
423
  try {
854
424
  document.querySelectorAll(selector).forEach((el) => el.remove());
855
425
  } catch {
856
426
  }
857
427
  }
428
+ }
429
+ function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
430
+ for (const selector of selectorsToRemove) {
431
+ try {
432
+ document.querySelectorAll(selector).forEach((element) => {
433
+ const isProtected = protectedSelectors.some((ps) => {
434
+ try {
435
+ return element.matches(ps);
436
+ } catch {
437
+ return false;
438
+ }
439
+ });
440
+ if (isProtected) return;
441
+ const containsProtected = protectedSelectors.some((ps) => {
442
+ try {
443
+ return element.querySelector(ps) !== null;
444
+ } catch {
445
+ return false;
446
+ }
447
+ });
448
+ if (containsProtected) return;
449
+ element.remove();
450
+ });
451
+ } catch {
452
+ }
453
+ }
454
+ }
455
+ function findMainContent(document) {
456
+ const isValidContent = (el) => {
457
+ if (!el) return false;
458
+ const text = el.textContent || "";
459
+ if (text.trim().length < 100) return false;
460
+ if (looksLikeNavigation(el)) return false;
461
+ return true;
462
+ };
463
+ const main = document.querySelector("main");
464
+ if (isValidContent(main) && getLinkDensity(main) < 0.4) {
465
+ return main;
466
+ }
467
+ const roleMain = document.querySelector('[role="main"]');
468
+ if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
469
+ return roleMain;
470
+ }
471
+ const articles = document.querySelectorAll("article");
472
+ if (articles.length === 1 && isValidContent(articles[0])) {
473
+ return articles[0];
474
+ }
475
+ const contentSelectors = [
476
+ "#content",
477
+ "#main-content",
478
+ "#main",
479
+ ".content",
480
+ ".main-content",
481
+ ".post-content",
482
+ ".article-content",
483
+ ".entry-content",
484
+ ".page-content",
485
+ ".article-body",
486
+ ".post-body",
487
+ ".story-content",
488
+ ".blog-content"
489
+ ];
490
+ for (const selector of contentSelectors) {
491
+ try {
492
+ const el = document.querySelector(selector);
493
+ if (isValidContent(el) && getLinkDensity(el) < 0.4) {
494
+ return el;
495
+ }
496
+ } catch {
497
+ }
498
+ }
499
+ const candidates = [];
500
+ const containers = document.querySelectorAll("div, section, article");
501
+ containers.forEach((el) => {
502
+ const text = el.textContent || "";
503
+ if (text.trim().length < 200) return;
504
+ const score = getContentScore(el);
505
+ if (score > 0) {
506
+ candidates.push({ el, score });
507
+ }
508
+ });
509
+ candidates.sort((a, b) => b.score - a.score);
510
+ if (candidates.length > 0 && candidates[0].score > 20) {
511
+ return candidates[0].el;
512
+ }
513
+ return null;
514
+ }
515
+ function cleanHtml(html, baseUrl, options = {}) {
516
+ const {
517
+ removeAds = true,
518
+ removeBase64Images = true,
519
+ onlyMainContent = true,
520
+ includeTags,
521
+ excludeTags
522
+ } = options;
523
+ const { document } = parseHTML(html);
524
+ removeElements(document, ALWAYS_REMOVE_SELECTORS);
525
+ removeElements(document, OVERLAY_SELECTORS);
858
526
  if (removeAds) {
859
- for (const selector of AD_SELECTORS) {
527
+ removeElements(document, AD_SELECTORS);
528
+ }
529
+ if (excludeTags && excludeTags.length > 0) {
530
+ removeElements(document, excludeTags);
531
+ }
532
+ if (onlyMainContent) {
533
+ removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
534
+ const mainContent = findMainContent(document);
535
+ if (mainContent) {
536
+ const body = document.body;
537
+ if (body) {
538
+ const clone = mainContent.cloneNode(true);
539
+ body.innerHTML = "";
540
+ body.appendChild(clone);
541
+ }
542
+ }
543
+ }
544
+ if (includeTags && includeTags.length > 0) {
545
+ const matchedElements = [];
546
+ for (const selector of includeTags) {
860
547
  try {
861
- document.querySelectorAll(selector).forEach((el) => el.remove());
548
+ document.querySelectorAll(selector).forEach((el) => {
549
+ matchedElements.push(el.cloneNode(true));
550
+ });
862
551
  } catch {
863
552
  }
864
553
  }
554
+ if (matchedElements.length > 0) {
555
+ const body = document.body;
556
+ if (body) {
557
+ body.innerHTML = "";
558
+ matchedElements.forEach((el) => body.appendChild(el));
559
+ }
560
+ }
865
561
  }
866
562
  if (removeBase64Images) {
867
563
  removeBase64ImagesFromDocument(document);
@@ -886,7 +582,10 @@ function removeBase64ImagesFromDocument(document) {
886
582
  document.querySelectorAll("[style*='data:image']").forEach((el) => {
887
583
  const style = el.getAttribute("style");
888
584
  if (style) {
889
- const cleanedStyle = style.replace(/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi, "");
585
+ const cleanedStyle = style.replace(
586
+ /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
587
+ ""
588
+ );
890
589
  if (cleanedStyle.trim()) {
891
590
  el.setAttribute("style", cleanedStyle);
892
591
  } else {
@@ -923,7 +622,7 @@ function cleanContent(html, baseUrl, options = {}) {
923
622
  }
924
623
 
925
624
  // src/utils/metadata-extractor.ts
926
- import { parseHTML as parseHTML3 } from "linkedom";
625
+ import { parseHTML as parseHTML2 } from "linkedom";
927
626
 
928
627
  // src/utils/url-helpers.ts
929
628
  import { URL as URL2 } from "url";
@@ -996,8 +695,26 @@ function isSameDomain(url, baseUrl) {
996
695
  function getUrlKey(url) {
997
696
  try {
998
697
  const parsedUrl = new URL2(url);
698
+ parsedUrl.hash = "";
999
699
  parsedUrl.search = "";
1000
- return parsedUrl.toString().toLowerCase();
700
+ if (parsedUrl.hostname.startsWith("www.")) {
701
+ parsedUrl.hostname = parsedUrl.hostname.slice(4);
702
+ }
703
+ if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
704
+ parsedUrl.port = "";
705
+ }
706
+ const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
707
+ for (const indexFile of indexFiles) {
708
+ if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
709
+ parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
710
+ break;
711
+ }
712
+ }
713
+ let normalized = parsedUrl.toString().toLowerCase();
714
+ if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
715
+ normalized = normalized.slice(0, -1);
716
+ }
717
+ return normalized;
1001
718
  } catch {
1002
719
  return url.toLowerCase();
1003
720
  }
@@ -1232,7 +949,7 @@ function extractMetadata(html, baseUrl) {
1232
949
  return extractWebsiteMetadata(html, baseUrl);
1233
950
  }
1234
951
  function extractWebsiteMetadata(html, baseUrl) {
1235
- const { document } = parseHTML3(html);
952
+ const { document } = parseHTML2(html);
1236
953
  const metadata = {
1237
954
  title: null,
1238
955
  description: null,
@@ -1514,13 +1231,15 @@ function isUrlAllowed(url, rules) {
1514
1231
  var DEFAULT_OPTIONS = {
1515
1232
  urls: [],
1516
1233
  formats: ["markdown"],
1517
- includeMetadata: true,
1518
1234
  timeoutMs: 3e4,
1519
1235
  includePatterns: [],
1520
1236
  excludePatterns: [],
1521
1237
  // Content cleaning defaults
1522
1238
  removeAds: true,
1523
1239
  removeBase64Images: true,
1240
+ onlyMainContent: true,
1241
+ includeTags: [],
1242
+ excludeTags: [],
1524
1243
  skipTLSVerification: true,
1525
1244
  // Batch defaults
1526
1245
  batchConcurrency: 1,
@@ -1534,7 +1253,7 @@ var DEFAULT_OPTIONS = {
1534
1253
  showChrome: false
1535
1254
  };
1536
1255
  function isValidFormat(format) {
1537
- return format === "markdown" || format === "html" || format === "json" || format === "text";
1256
+ return format === "markdown" || format === "html";
1538
1257
  }
1539
1258
  function shouldCrawlUrl2(url, baseDomain) {
1540
1259
  return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
@@ -1683,14 +1402,9 @@ var Scraper = class {
1683
1402
  } catch {
1684
1403
  }
1685
1404
  await hero.waitForPaintingStable();
1686
- let hadChallenge = false;
1687
- let challengeType = "none";
1688
- let waitTimeMs = 0;
1689
1405
  const initialUrl = await hero.url;
1690
1406
  const detection = await detectChallenge(hero);
1691
1407
  if (detection.isChallenge) {
1692
- hadChallenge = true;
1693
- challengeType = detection.type;
1694
1408
  if (this.options.verbose) {
1695
1409
  this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1696
1410
  }
@@ -1700,12 +1414,11 @@ var Scraper = class {
1700
1414
  verbose: this.options.verbose,
1701
1415
  initialUrl
1702
1416
  });
1703
- waitTimeMs = result2.waitedMs;
1704
1417
  if (!result2.resolved) {
1705
1418
  throw new Error(`Challenge not resolved: ${detection.type}`);
1706
1419
  }
1707
1420
  if (this.options.verbose) {
1708
- this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
1421
+ this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1709
1422
  }
1710
1423
  }
1711
1424
  await this.waitForFinalPage(hero, url, this.options.verbose);
@@ -1718,45 +1431,18 @@ var Scraper = class {
1718
1431
  this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1719
1432
  }
1720
1433
  }
1721
- const pageTitle = await hero.document.title;
1722
1434
  const html = await hero.document.documentElement.outerHTML;
1723
1435
  const cleanedHtml = cleanContent(html, url, {
1724
1436
  removeAds: this.options.removeAds,
1725
- removeBase64Images: this.options.removeBase64Images
1437
+ removeBase64Images: this.options.removeBase64Images,
1438
+ onlyMainContent: this.options.onlyMainContent,
1439
+ includeTags: this.options.includeTags,
1440
+ excludeTags: this.options.excludeTags
1726
1441
  });
1727
1442
  const websiteMetadata = extractMetadata(cleanedHtml, url);
1728
1443
  const duration = Date.now() - startTime;
1729
- const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
1730
- const page = {
1731
- url,
1732
- title: pageTitle,
1733
- markdown: "",
1734
- // Will be set by formatter
1735
- html: cleanedHtml,
1736
- fetchedAt: scrapedAt,
1737
- depth: 0,
1738
- hadChallenge,
1739
- challengeType,
1740
- waitTimeMs
1741
- };
1742
- const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
1743
- [page],
1744
- url,
1745
- scrapedAt,
1746
- duration,
1747
- websiteMetadata,
1748
- this.options.includeMetadata
1749
- ) : void 0;
1750
- const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1751
- const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1752
- const text = this.options.formats.includes("text") ? formatToText(
1753
- [page],
1754
- url,
1755
- scrapedAt,
1756
- duration,
1757
- websiteMetadata,
1758
- this.options.includeMetadata
1759
- ) : void 0;
1444
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1445
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1760
1446
  if (this.options.onProgress) {
1761
1447
  this.options.onProgress({
1762
1448
  completed: index + 1,
@@ -1788,8 +1474,6 @@ var Scraper = class {
1788
1474
  const result = {
1789
1475
  markdown,
1790
1476
  html: htmlOutput,
1791
- json,
1792
- text,
1793
1477
  metadata: {
1794
1478
  baseUrl: url,
1795
1479
  totalPages: 1,
@@ -1844,7 +1528,7 @@ async function scrape(options) {
1844
1528
  }
1845
1529
 
1846
1530
  // src/crawler.ts
1847
- import { parseHTML as parseHTML4 } from "linkedom";
1531
+ import { parseHTML as parseHTML3 } from "linkedom";
1848
1532
 
1849
1533
  // src/utils/rate-limiter.ts
1850
1534
  import pLimit2 from "p-limit";
@@ -1993,12 +1677,26 @@ var Crawler = class {
1993
1677
  */
1994
1678
  extractLinks(html, baseUrl, depth) {
1995
1679
  const links = [];
1996
- const { document } = parseHTML4(html);
1680
+ const { document } = parseHTML3(html);
1997
1681
  document.querySelectorAll("a[href]").forEach((anchor) => {
1998
- const href = anchor.getAttribute("href");
1682
+ const rawHref = anchor.getAttribute("href");
1683
+ if (!rawHref) return;
1684
+ const href = rawHref.trim();
1999
1685
  if (!href) return;
2000
- const resolved = resolveUrl(href, baseUrl);
1686
+ if (href.startsWith("#")) return;
1687
+ const lowerHref = href.toLowerCase();
1688
+ if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
1689
+ return;
1690
+ }
1691
+ let resolved = resolveUrl(href, baseUrl);
2001
1692
  if (!resolved || !isValidUrl(resolved)) return;
1693
+ try {
1694
+ const parsed = new URL(resolved);
1695
+ parsed.hash = "";
1696
+ resolved = parsed.toString();
1697
+ } catch {
1698
+ return;
1699
+ }
2002
1700
  if (!isSameDomain(resolved, this.options.url)) return;
2003
1701
  if (!isContentUrl(resolved)) return;
2004
1702
  if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
@@ -3046,16 +2744,251 @@ async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
3046
2744
  const client = new DaemonClient({ port, timeoutMs: 5e3 });
3047
2745
  return client.isRunning();
3048
2746
  }
2747
+
2748
+ // src/formatters/html.ts
2749
+ function formatToHTML(html) {
2750
+ return html;
2751
+ }
2752
+
2753
+ // src/errors.ts
2754
+ var ReaderErrorCode = /* @__PURE__ */ ((ReaderErrorCode2) => {
2755
+ ReaderErrorCode2["NETWORK_ERROR"] = "NETWORK_ERROR";
2756
+ ReaderErrorCode2["TIMEOUT"] = "TIMEOUT";
2757
+ ReaderErrorCode2["CONNECTION_REFUSED"] = "CONNECTION_REFUSED";
2758
+ ReaderErrorCode2["CLOUDFLARE_CHALLENGE"] = "CLOUDFLARE_CHALLENGE";
2759
+ ReaderErrorCode2["BOT_DETECTED"] = "BOT_DETECTED";
2760
+ ReaderErrorCode2["ACCESS_DENIED"] = "ACCESS_DENIED";
2761
+ ReaderErrorCode2["CONTENT_EXTRACTION_FAILED"] = "CONTENT_EXTRACTION_FAILED";
2762
+ ReaderErrorCode2["EMPTY_CONTENT"] = "EMPTY_CONTENT";
2763
+ ReaderErrorCode2["INVALID_URL"] = "INVALID_URL";
2764
+ ReaderErrorCode2["INVALID_OPTIONS"] = "INVALID_OPTIONS";
2765
+ ReaderErrorCode2["ROBOTS_BLOCKED"] = "ROBOTS_BLOCKED";
2766
+ ReaderErrorCode2["BROWSER_ERROR"] = "BROWSER_ERROR";
2767
+ ReaderErrorCode2["POOL_EXHAUSTED"] = "POOL_EXHAUSTED";
2768
+ ReaderErrorCode2["CLIENT_CLOSED"] = "CLIENT_CLOSED";
2769
+ ReaderErrorCode2["NOT_INITIALIZED"] = "NOT_INITIALIZED";
2770
+ ReaderErrorCode2["UNKNOWN"] = "UNKNOWN";
2771
+ return ReaderErrorCode2;
2772
+ })(ReaderErrorCode || {});
2773
+ var ReaderError = class extends Error {
2774
+ code;
2775
+ url;
2776
+ cause;
2777
+ timestamp;
2778
+ retryable;
2779
+ constructor(message, code, options) {
2780
+ super(message);
2781
+ this.name = "ReaderError";
2782
+ this.code = code;
2783
+ this.url = options?.url;
2784
+ this.cause = options?.cause;
2785
+ this.timestamp = (/* @__PURE__ */ new Date()).toISOString();
2786
+ this.retryable = options?.retryable ?? false;
2787
+ if (Error.captureStackTrace) {
2788
+ Error.captureStackTrace(this, this.constructor);
2789
+ }
2790
+ }
2791
+ /**
2792
+ * Convert to a plain object for serialization
2793
+ */
2794
+ toJSON() {
2795
+ return {
2796
+ name: this.name,
2797
+ code: this.code,
2798
+ message: this.message,
2799
+ url: this.url,
2800
+ timestamp: this.timestamp,
2801
+ retryable: this.retryable,
2802
+ cause: this.cause?.message,
2803
+ stack: this.stack
2804
+ };
2805
+ }
2806
+ };
2807
+ var NetworkError = class extends ReaderError {
2808
+ constructor(message, options) {
2809
+ super(message, "NETWORK_ERROR" /* NETWORK_ERROR */, {
2810
+ ...options,
2811
+ retryable: true
2812
+ });
2813
+ this.name = "NetworkError";
2814
+ }
2815
+ };
2816
+ var TimeoutError = class extends ReaderError {
2817
+ timeoutMs;
2818
+ constructor(message, timeoutMs, options) {
2819
+ super(message, "TIMEOUT" /* TIMEOUT */, {
2820
+ ...options,
2821
+ retryable: true
2822
+ });
2823
+ this.name = "TimeoutError";
2824
+ this.timeoutMs = timeoutMs;
2825
+ }
2826
+ toJSON() {
2827
+ return {
2828
+ ...super.toJSON(),
2829
+ timeoutMs: this.timeoutMs
2830
+ };
2831
+ }
2832
+ };
2833
+ var CloudflareError = class extends ReaderError {
2834
+ challengeType;
2835
+ constructor(challengeType, options) {
2836
+ super(
2837
+ `Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`,
2838
+ "CLOUDFLARE_CHALLENGE" /* CLOUDFLARE_CHALLENGE */,
2839
+ {
2840
+ ...options,
2841
+ retryable: true
2842
+ }
2843
+ );
2844
+ this.name = "CloudflareError";
2845
+ this.challengeType = challengeType;
2846
+ }
2847
+ toJSON() {
2848
+ return {
2849
+ ...super.toJSON(),
2850
+ challengeType: this.challengeType
2851
+ };
2852
+ }
2853
+ };
2854
+ var AccessDeniedError = class extends ReaderError {
2855
+ statusCode;
2856
+ constructor(message, options) {
2857
+ super(message, "ACCESS_DENIED" /* ACCESS_DENIED */, {
2858
+ ...options,
2859
+ retryable: false
2860
+ });
2861
+ this.name = "AccessDeniedError";
2862
+ this.statusCode = options?.statusCode;
2863
+ }
2864
+ toJSON() {
2865
+ return {
2866
+ ...super.toJSON(),
2867
+ statusCode: this.statusCode
2868
+ };
2869
+ }
2870
+ };
2871
+ var ContentExtractionError = class extends ReaderError {
2872
+ constructor(message, options) {
2873
+ super(message, "CONTENT_EXTRACTION_FAILED" /* CONTENT_EXTRACTION_FAILED */, {
2874
+ ...options,
2875
+ retryable: false
2876
+ });
2877
+ this.name = "ContentExtractionError";
2878
+ }
2879
+ };
2880
+ var ValidationError = class extends ReaderError {
2881
+ field;
2882
+ constructor(message, options) {
2883
+ super(message, "INVALID_OPTIONS" /* INVALID_OPTIONS */, {
2884
+ url: options?.url,
2885
+ retryable: false
2886
+ });
2887
+ this.name = "ValidationError";
2888
+ this.field = options?.field;
2889
+ }
2890
+ toJSON() {
2891
+ return {
2892
+ ...super.toJSON(),
2893
+ field: this.field
2894
+ };
2895
+ }
2896
+ };
2897
+ var InvalidUrlError = class extends ReaderError {
2898
+ constructor(url, reason) {
2899
+ super(reason ? `Invalid URL "${url}": ${reason}` : `Invalid URL: ${url}`, "INVALID_URL" /* INVALID_URL */, {
2900
+ url,
2901
+ retryable: false
2902
+ });
2903
+ this.name = "InvalidUrlError";
2904
+ }
2905
+ };
2906
+ var RobotsBlockedError = class extends ReaderError {
2907
+ constructor(url) {
2908
+ super(`URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`, "ROBOTS_BLOCKED" /* ROBOTS_BLOCKED */, {
2909
+ url,
2910
+ retryable: false
2911
+ });
2912
+ this.name = "RobotsBlockedError";
2913
+ }
2914
+ };
2915
+ var BrowserPoolError = class extends ReaderError {
2916
+ constructor(message, options) {
2917
+ super(message, "BROWSER_ERROR" /* BROWSER_ERROR */, {
2918
+ ...options,
2919
+ retryable: true
2920
+ });
2921
+ this.name = "BrowserPoolError";
2922
+ }
2923
+ };
2924
+ var ClientClosedError = class extends ReaderError {
2925
+ constructor() {
2926
+ super("ReaderClient has been closed. Create a new instance to continue.", "CLIENT_CLOSED" /* CLIENT_CLOSED */, {
2927
+ retryable: false
2928
+ });
2929
+ this.name = "ClientClosedError";
2930
+ }
2931
+ };
2932
+ var NotInitializedError = class extends ReaderError {
2933
+ constructor(component) {
2934
+ super(`${component} not initialized. This should not happen - please report this bug.`, "NOT_INITIALIZED" /* NOT_INITIALIZED */, {
2935
+ retryable: false
2936
+ });
2937
+ this.name = "NotInitializedError";
2938
+ }
2939
+ };
2940
+ function wrapError(error, url) {
2941
+ if (error instanceof ReaderError) {
2942
+ return error;
2943
+ }
2944
+ if (error instanceof Error) {
2945
+ const message = error.message.toLowerCase();
2946
+ if (message.includes("timeout") || message.includes("timed out")) {
2947
+ return new TimeoutError(error.message, 3e4, { url, cause: error });
2948
+ }
2949
+ if (message.includes("econnrefused") || message.includes("connection refused")) {
2950
+ return new NetworkError(`Connection refused: ${error.message}`, { url, cause: error });
2951
+ }
2952
+ if (message.includes("enotfound") || message.includes("dns")) {
2953
+ return new NetworkError(`DNS lookup failed: ${error.message}`, { url, cause: error });
2954
+ }
2955
+ if (message.includes("cloudflare") || message.includes("challenge")) {
2956
+ return new CloudflareError("unknown", { url, cause: error });
2957
+ }
2958
+ return new ReaderError(error.message, "UNKNOWN" /* UNKNOWN */, {
2959
+ url,
2960
+ cause: error,
2961
+ retryable: false
2962
+ });
2963
+ }
2964
+ return new ReaderError(String(error), "UNKNOWN" /* UNKNOWN */, {
2965
+ url,
2966
+ retryable: false
2967
+ });
2968
+ }
3049
2969
  export {
2970
+ AccessDeniedError,
3050
2971
  BrowserPool,
2972
+ BrowserPoolError,
2973
+ ClientClosedError,
2974
+ CloudflareError,
2975
+ ContentExtractionError,
3051
2976
  Crawler,
3052
2977
  DEFAULT_DAEMON_PORT,
3053
2978
  DEFAULT_OPTIONS,
3054
2979
  DaemonClient,
3055
2980
  DaemonServer,
3056
2981
  BrowserPool as HeroBrowserPool,
2982
+ InvalidUrlError,
2983
+ NetworkError,
2984
+ NotInitializedError,
3057
2985
  ReaderClient,
2986
+ ReaderError,
2987
+ ReaderErrorCode,
2988
+ RobotsBlockedError,
3058
2989
  Scraper,
2990
+ TimeoutError,
2991
+ ValidationError,
3059
2992
  cleanContent,
3060
2993
  crawl,
3061
2994
  createHeroConfig,
@@ -3063,14 +2996,12 @@ export {
3063
2996
  detectChallenge,
3064
2997
  extractMetadata,
3065
2998
  formatToHTML,
3066
- formatToJson,
3067
- formatToJsonLite,
3068
2999
  formatToMarkdown,
3069
- formatToText,
3070
3000
  getDaemonInfo,
3071
3001
  getPidFilePath,
3072
3002
  getUrlKey,
3073
3003
  handleChallenge,
3004
+ htmlToMarkdown,
3074
3005
  isChallengePage,
3075
3006
  isDaemonRunning,
3076
3007
  isSameDomain,
@@ -3084,6 +3015,7 @@ export {
3084
3015
  shouldCrawlUrl2 as shouldCrawlUrlFn,
3085
3016
  validateUrls,
3086
3017
  waitForChallengeResolution,
3087
- waitForSelector
3018
+ waitForSelector,
3019
+ wrapError
3088
3020
  };
3089
3021
  //# sourceMappingURL=index.js.map