@vakra-dev/reader 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,10 @@
1
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
2
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
3
+ }) : x)(function(x) {
4
+ if (typeof require !== "undefined") return require.apply(this, arguments);
5
+ throw Error('Dynamic require of "' + x + '" is not supported');
6
+ });
7
+
1
8
  // src/client.ts
2
9
  import HeroCore from "@ulixee/hero-core";
3
10
  import { TransportBridge } from "@ulixee/net";
@@ -7,27 +14,36 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
7
14
  import pLimit from "p-limit";
8
15
 
9
16
  // src/cloudflare/detector.ts
10
- var CHALLENGE_DOM_SELECTORS = [
17
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
11
18
  "#challenge-running",
12
19
  "#challenge-stage",
13
20
  "#challenge-form",
14
- ".cf-browser-verification"
21
+ ".cf-browser-verification",
22
+ "#cf-wrapper",
23
+ "#cf-hcaptcha-container",
24
+ "#turnstile-wrapper"
15
25
  ];
16
- var CHALLENGE_TEXT_PATTERNS = [
17
- "verifying you are human",
26
+ var CLOUDFLARE_TEXT_PATTERNS = [
18
27
  "checking if the site connection is secure",
19
- "this process is automatic. your browser will redirect"
28
+ "this process is automatic. your browser will redirect",
29
+ "ray id:",
30
+ "performance & security by cloudflare"
31
+ ];
32
+ var CLOUDFLARE_INFRA_PATTERNS = [
33
+ "/cdn-cgi/",
34
+ "cloudflare",
35
+ "__cf_bm",
36
+ "cf-ray"
20
37
  ];
21
- var BLOCKED_SIGNALS = [
22
- "you have been blocked",
23
- "access to this page has been denied",
38
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
24
39
  "sorry, you have been blocked",
25
- "access denied",
26
- "403 forbidden"
40
+ "ray id:"
27
41
  ];
28
42
  async function detectChallenge(hero) {
29
43
  const signals = [];
30
44
  let type = "none";
45
+ let hasCloudflareInfra = false;
46
+ let hasChallengeIndicator = false;
31
47
  try {
32
48
  if (!hero.document) {
33
49
  return {
@@ -39,30 +55,51 @@ async function detectChallenge(hero) {
39
55
  }
40
56
  const html = await hero.document.documentElement.outerHTML;
41
57
  const htmlLower = html.toLowerCase();
42
- for (const selector of CHALLENGE_DOM_SELECTORS) {
43
- if (htmlLower.includes(selector.toLowerCase())) {
44
- signals.push(`Challenge element: ${selector}`);
45
- type = "js_challenge";
58
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
59
+ if (htmlLower.includes(pattern)) {
60
+ hasCloudflareInfra = true;
61
+ signals.push(`Cloudflare infra: "${pattern}"`);
62
+ break;
46
63
  }
47
64
  }
48
- for (const pattern of CHALLENGE_TEXT_PATTERNS) {
65
+ if (!hasCloudflareInfra) {
66
+ return {
67
+ isChallenge: false,
68
+ type: "none",
69
+ confidence: 0,
70
+ signals: ["No Cloudflare infrastructure detected"]
71
+ };
72
+ }
73
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
74
+ try {
75
+ const element = await hero.document.querySelector(selector);
76
+ if (element) {
77
+ hasChallengeIndicator = true;
78
+ signals.push(`Challenge element: ${selector}`);
79
+ type = "js_challenge";
80
+ }
81
+ } catch {
82
+ }
83
+ }
84
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
49
85
  if (htmlLower.includes(pattern)) {
86
+ hasChallengeIndicator = true;
50
87
  signals.push(`Challenge text: "${pattern}"`);
51
88
  type = type === "none" ? "js_challenge" : type;
52
89
  }
53
90
  }
54
91
  if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
92
+ hasChallengeIndicator = true;
55
93
  signals.push('Challenge text: "waiting for...to respond"');
56
94
  type = type === "none" ? "js_challenge" : type;
57
95
  }
58
- for (const pattern of BLOCKED_SIGNALS) {
59
- if (htmlLower.includes(pattern)) {
60
- signals.push(`Blocked: "${pattern}"`);
61
- type = "blocked";
62
- break;
63
- }
96
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
97
+ if (hasBlocked) {
98
+ hasChallengeIndicator = true;
99
+ signals.push("Cloudflare block page detected");
100
+ type = "blocked";
64
101
  }
65
- const isChallenge = signals.length > 0;
102
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
66
103
  const confidence = isChallenge ? 100 : 0;
67
104
  return {
68
105
  isChallenge,
@@ -179,84 +216,6 @@ var turndownService = new TurndownService({
179
216
  linkStyle: "inlined",
180
217
  linkReferenceStyle: "full"
181
218
  });
182
- function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
183
- const sections = [];
184
- if (includeMetadata) {
185
- sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
186
- }
187
- if (pages.length > 1) {
188
- sections.push(createMarkdownTOC(pages));
189
- }
190
- sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
191
- return sections.join("\n\n");
192
- }
193
- function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
194
- const title = website.title || extractDomainFromUrl(baseUrl);
195
- const description = website.description || "";
196
- let header = `# Website Scrape: ${title}
197
-
198
- `;
199
- header += `**Base URL:** ${baseUrl}
200
- `;
201
- header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
202
- `;
203
- header += `**Duration:** ${duration}ms
204
- `;
205
- header += `**Total pages:** ${totalPages}
206
- `;
207
- if (description) {
208
- header += `**Description:** ${description}
209
- `;
210
- }
211
- if (website.author) {
212
- header += `**Author:** ${website.author}
213
- `;
214
- }
215
- if (website.language) {
216
- header += `**Language:** ${website.language}
217
- `;
218
- }
219
- return header;
220
- }
221
- function createMarkdownTOC(pages) {
222
- let toc = "## Table of Contents\n\n";
223
- pages.forEach((page, index) => {
224
- const depth = " ".repeat(page.depth);
225
- const pageNumber = index + 1;
226
- const title = page.title || `Page ${pageNumber}`;
227
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
228
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
229
- toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
230
- `;
231
- });
232
- return toc;
233
- }
234
- function createMarkdownPage(page, pageNumber) {
235
- const title = page.title || `Page ${pageNumber}`;
236
- const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
237
- const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
238
- let pageContent = `---
239
-
240
- `;
241
- pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
242
-
243
- `;
244
- pageContent += `**URL:** ${page.url}
245
- `;
246
- pageContent += `**Title:** ${page.title}
247
- `;
248
- pageContent += `**Depth:** ${page.depth}
249
- `;
250
- pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
251
-
252
- `;
253
- pageContent += `---
254
-
255
- `;
256
- const markdown = htmlToMarkdown(page.html);
257
- pageContent += markdown;
258
- return pageContent;
259
- }
260
219
  function htmlToMarkdown(html) {
261
220
  try {
262
221
  return turndownService.turndown(html);
@@ -265,596 +224,340 @@ function htmlToMarkdown(html) {
265
224
  return html.replace(/<[^>]*>/g, "").trim();
266
225
  }
267
226
  }
268
- function extractDomainFromUrl(url) {
269
- try {
270
- return new URL(url).hostname;
271
- } catch {
272
- return "Unknown";
273
- }
274
- }
275
-
276
- // src/formatters/html.ts
277
- function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
278
- const html = `<!DOCTYPE html>
279
- <html lang="${website.language || "en"}">
280
- <head>
281
- <meta charset="${website.charset || "UTF-8"}">
282
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
283
- <title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
284
- ${generateMetaTags(website)}
285
- <style>
286
- ${generateCSS()}
287
- </style>
288
- </head>
289
- <body>
290
- <header class="header">
291
- <h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
292
- <div class="meta-info">
293
- <p><strong>Base URL:</strong> <a href="${escapeHtml(
294
- baseUrl
295
- )}" target="_blank">${escapeHtml(baseUrl)}</a></p>
296
- <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
297
- <p><strong>Duration:</strong> ${duration}ms</p>
298
- <p><strong>Total pages:</strong> ${pages.length}</p>
299
- ${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
300
- ${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
301
- ${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
302
- </div>
303
- </header>
304
-
305
- ${pages.length > 1 ? generateTOC(pages) : ""}
306
-
307
- <main class="content">
308
- ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
309
- </main>
310
-
311
- <footer class="footer">
312
- <p>Generated by Reader JS/TS SDK</p>
313
- </footer>
314
-
315
- <script>
316
- ${generateJavaScript()}
317
- </script>
318
- </body>
319
- </html>`;
320
- return html;
321
- }
322
- function generateMetaTags(website) {
323
- const tags = [];
324
- if (website.description) {
325
- tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
326
- }
327
- if (website.author) {
328
- tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
329
- }
330
- if (website.keywords) {
331
- tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
332
- }
333
- if (website.robots) {
334
- tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
335
- }
336
- if (website.themeColor) {
337
- tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
338
- }
339
- if (website.favicon) {
340
- tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
341
- }
342
- if (website.canonical) {
343
- tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
344
- }
345
- if (website.openGraph) {
346
- const og = website.openGraph;
347
- if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
348
- if (og.description)
349
- tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
350
- if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
351
- if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
352
- if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
353
- if (og.siteName)
354
- tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
355
- if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
356
- }
357
- if (website.twitter) {
358
- const twitter = website.twitter;
359
- if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
360
- if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
361
- if (twitter.creator)
362
- tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
363
- if (twitter.title)
364
- tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
365
- if (twitter.description)
366
- tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
367
- if (twitter.image)
368
- tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
369
- }
370
- return tags.join("\n ");
371
- }
372
- function generateCSS() {
373
- return `
374
- * {
375
- margin: 0;
376
- padding: 0;
377
- box-sizing: border-box;
378
- }
379
-
380
- body {
381
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
382
- line-height: 1.6;
383
- color: #333;
384
- background-color: #f8f9fa;
385
- }
386
-
387
- .header {
388
- background: white;
389
- padding: 2rem;
390
- border-bottom: 1px solid #e9ecef;
391
- margin-bottom: 2rem;
392
- }
393
-
394
- .header h1 {
395
- color: #2c3e50;
396
- margin-bottom: 1rem;
397
- font-size: 2rem;
398
- }
399
-
400
- .meta-info {
401
- display: grid;
402
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
403
- gap: 0.5rem;
404
- }
405
-
406
- .meta-info p {
407
- margin: 0.25rem 0;
408
- font-size: 0.9rem;
409
- color: #6c757d;
410
- }
411
-
412
- .toc {
413
- background: white;
414
- padding: 1.5rem;
415
- margin: 2rem 0;
416
- border-radius: 8px;
417
- border: 1px solid #e9ecef;
418
- }
419
-
420
- .toc h2 {
421
- color: #2c3e50;
422
- margin-bottom: 1rem;
423
- font-size: 1.25rem;
424
- }
425
-
426
- .toc ul {
427
- list-style: none;
428
- }
429
-
430
- .toc li {
431
- margin: 0.5rem 0;
432
- }
433
-
434
- .toc a {
435
- color: #007bff;
436
- text-decoration: none;
437
- transition: color 0.2s;
438
- }
439
-
440
- .toc a:hover {
441
- color: #0056b3;
442
- text-decoration: underline;
443
- }
444
-
445
- .content {
446
- max-width: 800px;
447
- margin: 0 auto;
448
- padding: 0 1rem;
449
- }
450
-
451
- .page {
452
- background: white;
453
- margin: 2rem 0;
454
- padding: 2rem;
455
- border-radius: 8px;
456
- border: 1px solid #e9ecef;
457
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
458
- }
459
-
460
- .page-header {
461
- border-bottom: 2px solid #e9ecef;
462
- padding-bottom: 1rem;
463
- margin-bottom: 2rem;
464
- }
465
-
466
- .page-header h2 {
467
- color: #2c3e50;
468
- margin-bottom: 0.5rem;
469
- font-size: 1.5rem;
470
- }
471
-
472
- .page-meta {
473
- display: flex;
474
- flex-wrap: wrap;
475
- gap: 1rem;
476
- font-size: 0.9rem;
477
- color: #6c757d;
478
- }
479
-
480
- .page-content {
481
- line-height: 1.8;
482
- }
483
-
484
- .page-content h1, .page-content h2, .page-content h3,
485
- .page-content h4, .page-content h5, .page-content h6 {
486
- color: #2c3e50;
487
- margin: 1.5rem 0 0.5rem 0;
488
- }
489
-
490
- .page-content p {
491
- margin: 1rem 0;
492
- }
493
-
494
- .page-content a {
495
- color: #007bff;
496
- text-decoration: none;
497
- }
498
-
499
- .page-content a:hover {
500
- text-decoration: underline;
501
- }
502
-
503
- .page-content code {
504
- background: #f8f9fa;
505
- padding: 0.2rem 0.4rem;
506
- border-radius: 4px;
507
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
508
- font-size: 0.9em;
509
- }
510
-
511
- .page-content pre {
512
- background: #f8f9fa;
513
- padding: 1rem;
514
- border-radius: 4px;
515
- overflow-x: auto;
516
- margin: 1rem 0;
517
- }
518
-
519
- .page-content blockquote {
520
- border-left: 4px solid #007bff;
521
- padding-left: 1rem;
522
- margin: 1rem 0;
523
- color: #6c757d;
524
- }
525
-
526
- .footer {
527
- text-align: center;
528
- padding: 2rem;
529
- margin-top: 3rem;
530
- border-top: 1px solid #e9ecef;
531
- color: #6c757d;
532
- font-size: 0.9rem;
533
- }
534
-
535
- @media (max-width: 768px) {
536
- .header {
537
- padding: 1rem;
538
- }
539
-
540
- .header h1 {
541
- font-size: 1.5rem;
542
- }
543
-
544
- .page {
545
- padding: 1rem;
546
- }
547
-
548
- .page-meta {
549
- flex-direction: column;
550
- gap: 0.5rem;
551
- }
552
- }
553
- `.trim();
554
- }
555
- function generateTOC(pages) {
556
- const tocItems = pages.map((page, index) => {
557
- const pageNumber = index + 1;
558
- const title = page.title || `Page ${pageNumber}`;
559
- const id = `page-${pageNumber}`;
560
- return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
561
- }).join("\n");
562
- return `
563
- <nav class="toc">
564
- <h2>Table of Contents</h2>
565
- <ul>
566
- ${tocItems}
567
- </ul>
568
- </nav>`;
569
- }
570
- function generatePageHTML(page, pageNumber) {
571
- const id = `page-${pageNumber}`;
572
- const title = page.title || `Page ${pageNumber}`;
573
- return `
574
- <article class="page" id="${id}">
575
- <div class="page-header">
576
- <h2>${pageNumber}. ${escapeHtml(title)}</h2>
577
- <div class="page-meta">
578
- <span><strong>URL:</strong> <a href="${escapeHtml(
579
- page.url
580
- )}" target="_blank">${escapeHtml(page.url)}</a></span>
581
- <span><strong>Depth:</strong> ${page.depth}</span>
582
- <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
583
- </div>
584
- </div>
585
- <div class="page-content">
586
- ${page.html}
587
- </div>
588
- </article>`;
589
- }
590
- function generateJavaScript() {
591
- return `
592
- // Smooth scrolling for TOC links
593
- document.querySelectorAll('a[href^="#"]').forEach(anchor => {
594
- anchor.addEventListener('click', function (e) {
595
- e.preventDefault();
596
- const target = document.querySelector(this.getAttribute('href'));
597
- if (target) {
598
- target.scrollIntoView({
599
- behavior: 'smooth',
600
- block: 'start'
601
- });
602
- }
603
- });
604
- });
605
-
606
- // Highlight current section in TOC
607
- window.addEventListener('scroll', function() {
608
- const pages = document.querySelectorAll('.page');
609
- const tocLinks = document.querySelectorAll('.toc a');
610
-
611
- let currentPage = null;
612
- pages.forEach(page => {
613
- const rect = page.getBoundingClientRect();
614
- if (rect.top <= 100) {
615
- currentPage = page;
616
- }
617
- });
618
-
619
- tocLinks.forEach(link => {
620
- link.style.fontWeight = 'normal';
621
- const target = document.querySelector(link.getAttribute('href'));
622
- if (target === currentPage) {
623
- link.style.fontWeight = 'bold';
624
- }
625
- });
626
- });
627
- `;
628
- }
629
- function escapeHtml(text) {
630
- return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\//g, "&#x2F;");
631
- }
632
- function extractDomainFromUrl2(url) {
633
- try {
634
- return new URL(url).hostname;
635
- } catch {
636
- return "Unknown";
637
- }
638
- }
639
-
640
- // src/formatters/json.ts
641
- function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
642
- const jsonResult = {
643
- metadata: {
644
- baseUrl,
645
- totalPages: pages.length,
646
- scrapedAt,
647
- duration,
648
- website
649
- },
650
- pages: pages.map((page, index) => ({
651
- index: index + 1,
652
- url: page.url,
653
- title: page.title,
654
- markdown: page.markdown,
655
- html: page.html,
656
- fetchedAt: page.fetchedAt,
657
- depth: page.depth,
658
- wordCount: countWords(page.markdown),
659
- readingTime: estimateReadingTime(page.markdown)
660
- }))
661
- };
662
- return JSON.stringify(jsonResult, null, 2);
663
- }
664
- function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
665
- const jsonResult = {
666
- metadata: {
667
- baseUrl,
668
- totalPages: pages.length,
669
- scrapedAt,
670
- duration,
671
- website
672
- },
673
- pages: pages.map((page, index) => ({
674
- index: index + 1,
675
- url: page.url,
676
- title: page.title,
677
- markdown: page.markdown,
678
- fetchedAt: page.fetchedAt,
679
- depth: page.depth,
680
- wordCount: countWords(page.markdown),
681
- readingTime: estimateReadingTime(page.markdown)
682
- }))
683
- };
684
- return JSON.stringify(jsonResult, null, 2);
685
- }
686
- function countWords(markdown) {
687
- const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
688
- return plainText.split(/\s+/).filter((word) => word.length > 0).length;
689
- }
690
- function estimateReadingTime(markdown) {
691
- const wordCount = countWords(markdown);
692
- return Math.ceil(wordCount / 200);
693
- }
694
-
695
- // src/formatters/text.ts
696
- import { parseHTML } from "linkedom";
697
- function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
698
- const sections = [];
699
- if (includeMetadata) {
700
- sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
701
- }
702
- sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
703
- return sections.join("\n\n");
704
- }
705
- function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
706
- const title = website.title || extractDomainFromUrl3(baseUrl);
707
- const lines = [];
708
- lines.push(`=== ${title} ===`);
709
- lines.push("");
710
- lines.push(`URL: ${baseUrl}`);
711
- lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
712
- lines.push(`Duration: ${duration}ms`);
713
- lines.push(`Pages: ${totalPages}`);
714
- if (website.description) {
715
- lines.push(`Description: ${website.description}`);
716
- }
717
- if (website.author) {
718
- lines.push(`Author: ${website.author}`);
719
- }
720
- if (website.language) {
721
- lines.push(`Language: ${website.language}`);
722
- }
723
- return lines.join("\n");
724
- }
725
- function createTextPage(page, pageNumber, showSeparator) {
726
- const lines = [];
727
- if (showSeparator) {
728
- lines.push("\u2500".repeat(60));
729
- lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
730
- lines.push(`URL: ${page.url}`);
731
- lines.push("\u2500".repeat(60));
732
- }
733
- const plainText = htmlToPlainText(page.html);
734
- lines.push(plainText);
735
- return lines.join("\n");
736
- }
737
- function htmlToPlainText(html) {
738
- const { document } = parseHTML(html);
739
- const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
740
- elementsToRemove.forEach((tag) => {
741
- document.querySelectorAll(tag).forEach((el) => el.remove());
742
- });
743
- let text = document.body?.textContent || document.documentElement?.textContent || "";
744
- text = text.replace(/[ \t]+/g, " ");
745
- text = text.replace(/\n[ \t]+/g, "\n");
746
- text = text.replace(/[ \t]+\n/g, "\n");
747
- text = text.replace(/\n{3,}/g, "\n\n");
748
- text = text.trim();
749
- return text;
750
- }
751
- function extractDomainFromUrl3(url) {
752
- try {
753
- return new URL(url).hostname;
754
- } catch {
755
- return "Unknown";
756
- }
757
- }
227
+ var formatToMarkdown = htmlToMarkdown;
758
228
 
759
229
  // src/utils/content-cleaner.ts
760
- import { parseHTML as parseHTML2 } from "linkedom";
230
+ import { parseHTML } from "linkedom";
761
231
  var ALWAYS_REMOVE_SELECTORS = [
762
- // Navigation and menus
763
- "nav",
764
- "header nav",
765
- "footer nav",
766
- ".nav",
767
- ".navigation",
768
- ".menu",
769
- ".navbar",
770
- ".sidebar",
771
- ".aside",
772
- // Header and footer elements
773
- "header",
774
- "footer",
775
- ".site-header",
776
- ".page-header",
777
- ".site-footer",
778
- ".page-footer",
779
- // Social media and sharing
780
- ".social",
781
- ".share",
782
- ".sharing",
783
- ".twitter",
784
- ".facebook",
785
- ".linkedin",
786
- ".instagram",
787
- // Comments and discussions
788
- ".comments",
789
- ".comment",
790
- ".discussion",
791
- ".disqus",
792
- // Forms and interactive elements
793
- "form",
794
- "input",
795
- "button:not([type='submit'])",
796
- "select",
797
- "textarea",
798
232
  // Scripts and styles
799
233
  "script",
800
234
  "style",
801
235
  "noscript",
236
+ "link[rel='stylesheet']",
802
237
  // Hidden elements
803
238
  "[hidden]",
239
+ "[aria-hidden='true']",
804
240
  "[style*='display: none']",
805
241
  "[style*='display:none']",
806
- // Common utility classes
807
- ".cookie",
808
- ".cookie-banner",
809
- ".popup",
242
+ "[style*='visibility: hidden']",
243
+ "[style*='visibility:hidden']",
244
+ // SVG icons and decorative elements
245
+ "svg[aria-hidden='true']",
246
+ "svg.icon",
247
+ "svg[class*='icon']",
248
+ // Template and metadata
249
+ "template",
250
+ "meta",
251
+ // Embeds that don't convert to text
252
+ "iframe",
253
+ "canvas",
254
+ "object",
255
+ "embed",
256
+ // Forms (usually not main content)
257
+ "form",
258
+ "input",
259
+ "select",
260
+ "textarea",
261
+ "button"
262
+ ];
263
+ var OVERLAY_SELECTORS = [
264
+ "[class*='modal']",
265
+ "[class*='popup']",
266
+ "[class*='overlay']",
267
+ "[class*='dialog']",
268
+ "[role='dialog']",
269
+ "[role='alertdialog']",
270
+ "[class*='cookie']",
271
+ "[class*='consent']",
272
+ "[class*='gdpr']",
273
+ "[class*='privacy-banner']",
274
+ "[class*='notification-bar']",
275
+ "[id*='cookie']",
276
+ "[id*='consent']",
277
+ "[id*='gdpr']",
278
+ // Fixed/sticky positioned elements
279
+ "[style*='position: fixed']",
280
+ "[style*='position:fixed']",
281
+ "[style*='position: sticky']",
282
+ "[style*='position:sticky']"
283
+ ];
284
+ var NAVIGATION_SELECTORS = [
285
+ // Semantic elements
286
+ "header",
287
+ "footer",
288
+ "nav",
289
+ "aside",
290
+ // Header variations
291
+ ".header",
292
+ ".top",
293
+ ".navbar",
294
+ "#header",
295
+ // Footer variations
296
+ ".footer",
297
+ ".bottom",
298
+ "#footer",
299
+ // Sidebars
300
+ ".sidebar",
301
+ ".side",
302
+ ".aside",
303
+ "#sidebar",
304
+ // Modals/popups (backup if not caught by OVERLAY_SELECTORS)
810
305
  ".modal",
306
+ ".popup",
307
+ "#modal",
811
308
  ".overlay",
812
- ".notification",
309
+ // Ads
310
+ ".ad",
311
+ ".ads",
312
+ ".advert",
313
+ "#ad",
314
+ // Language selectors
315
+ ".lang-selector",
316
+ ".language",
317
+ "#language-selector",
318
+ // Social
319
+ ".social",
320
+ ".social-media",
321
+ ".social-links",
322
+ "#social",
323
+ // Navigation/menus
324
+ ".menu",
325
+ ".navigation",
326
+ "#nav",
813
327
  // Breadcrumbs
814
- ".breadcrumb",
815
328
  ".breadcrumbs",
816
- ".breadcrumb-trail"
329
+ "#breadcrumbs",
330
+ // Share buttons
331
+ ".share",
332
+ "#share",
333
+ // Widgets
334
+ ".widget",
335
+ "#widget",
336
+ // Cookie notices (backup)
337
+ ".cookie",
338
+ "#cookie"
339
+ ];
340
+ var FORCE_INCLUDE_SELECTORS = [
341
+ // IDs
342
+ "#main",
343
+ "#content",
344
+ "#main-content",
345
+ "#article",
346
+ "#post",
347
+ "#page-content",
348
+ // Semantic elements
349
+ "main",
350
+ "article",
351
+ "[role='main']",
352
+ // Classes
353
+ ".main-content",
354
+ ".content",
355
+ ".post-content",
356
+ ".article-content",
357
+ ".entry-content",
358
+ ".page-content",
359
+ ".article-body",
360
+ ".post-body",
361
+ ".story-content",
362
+ ".blog-content"
817
363
  ];
818
364
  var AD_SELECTORS = [
819
- // Ads and promotions
820
- ".ad",
821
- ".ads",
822
- ".advertisement",
823
- ".promotion",
824
- ".sponsored",
825
- "[class*='ad-']",
826
- "[id*='ad-']",
827
- "[class*='advert']",
828
- "[id*='advert']",
829
- "[class*='banner']",
830
- "[id*='banner']",
365
+ // Google ads
366
+ "ins.adsbygoogle",
831
367
  ".google-ad",
832
368
  ".adsense",
369
+ // Generic ad containers
833
370
  "[data-ad]",
834
371
  "[data-ads]",
835
- "ins.adsbygoogle",
836
- // Tracking
837
- "[class*='tracking']",
838
- "[id*='tracking']",
839
- "[class*='analytics']",
840
- "[id*='analytics']"
372
+ "[data-ad-slot]",
373
+ "[data-ad-client]",
374
+ // Common ad class patterns
375
+ ".ad-container",
376
+ ".ad-wrapper",
377
+ ".advertisement",
378
+ ".sponsored-content",
379
+ // Tracking pixels
380
+ "img[width='1'][height='1']",
381
+ "img[src*='pixel']",
382
+ "img[src*='tracking']",
383
+ "img[src*='analytics']"
841
384
  ];
842
- function cleanHtml(html, baseUrl, options = {}) {
843
- const { removeAds = true, removeBase64Images = true } = options;
844
- const { document } = parseHTML2(html);
845
- for (const selector of ALWAYS_REMOVE_SELECTORS) {
385
+ function getLinkDensity(element) {
386
+ const text = element.textContent || "";
387
+ const textLength = text.trim().length;
388
+ if (textLength === 0) return 1;
389
+ let linkLength = 0;
390
+ element.querySelectorAll("a").forEach((link) => {
391
+ linkLength += (link.textContent || "").trim().length;
392
+ });
393
+ return linkLength / textLength;
394
+ }
395
+ function getContentScore(element) {
396
+ let score = 0;
397
+ const text = element.textContent || "";
398
+ const textLength = text.trim().length;
399
+ score += Math.min(textLength / 100, 50);
400
+ score += element.querySelectorAll("p").length * 3;
401
+ score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
402
+ score += element.querySelectorAll("img").length * 1;
403
+ score -= element.querySelectorAll("a").length * 0.5;
404
+ score -= element.querySelectorAll("li").length * 0.2;
405
+ const linkDensity = getLinkDensity(element);
406
+ if (linkDensity > 0.5) score -= 30;
407
+ else if (linkDensity > 0.3) score -= 15;
408
+ const classAndId = (element.className || "") + " " + (element.id || "");
409
+ if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
410
+ if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
411
+ return score;
412
+ }
413
+ function looksLikeNavigation(element) {
414
+ const linkDensity = getLinkDensity(element);
415
+ if (linkDensity > 0.5) return true;
416
+ const listItems = element.querySelectorAll("li");
417
+ const links = element.querySelectorAll("a");
418
+ if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
419
+ return false;
420
+ }
421
+ function removeElements(document, selectors) {
422
+ for (const selector of selectors) {
846
423
  try {
847
424
  document.querySelectorAll(selector).forEach((el) => el.remove());
848
425
  } catch {
849
426
  }
850
427
  }
428
+ }
429
+ function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
430
+ for (const selector of selectorsToRemove) {
431
+ try {
432
+ document.querySelectorAll(selector).forEach((element) => {
433
+ const isProtected = protectedSelectors.some((ps) => {
434
+ try {
435
+ return element.matches(ps);
436
+ } catch {
437
+ return false;
438
+ }
439
+ });
440
+ if (isProtected) return;
441
+ const containsProtected = protectedSelectors.some((ps) => {
442
+ try {
443
+ return element.querySelector(ps) !== null;
444
+ } catch {
445
+ return false;
446
+ }
447
+ });
448
+ if (containsProtected) return;
449
+ element.remove();
450
+ });
451
+ } catch {
452
+ }
453
+ }
454
+ }
455
+ function findMainContent(document) {
456
+ const isValidContent = (el) => {
457
+ if (!el) return false;
458
+ const text = el.textContent || "";
459
+ if (text.trim().length < 100) return false;
460
+ if (looksLikeNavigation(el)) return false;
461
+ return true;
462
+ };
463
+ const main = document.querySelector("main");
464
+ if (isValidContent(main) && getLinkDensity(main) < 0.4) {
465
+ return main;
466
+ }
467
+ const roleMain = document.querySelector('[role="main"]');
468
+ if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
469
+ return roleMain;
470
+ }
471
+ const articles = document.querySelectorAll("article");
472
+ if (articles.length === 1 && isValidContent(articles[0])) {
473
+ return articles[0];
474
+ }
475
+ const contentSelectors = [
476
+ "#content",
477
+ "#main-content",
478
+ "#main",
479
+ ".content",
480
+ ".main-content",
481
+ ".post-content",
482
+ ".article-content",
483
+ ".entry-content",
484
+ ".page-content",
485
+ ".article-body",
486
+ ".post-body",
487
+ ".story-content",
488
+ ".blog-content"
489
+ ];
490
+ for (const selector of contentSelectors) {
491
+ try {
492
+ const el = document.querySelector(selector);
493
+ if (isValidContent(el) && getLinkDensity(el) < 0.4) {
494
+ return el;
495
+ }
496
+ } catch {
497
+ }
498
+ }
499
+ const candidates = [];
500
+ const containers = document.querySelectorAll("div, section, article");
501
+ containers.forEach((el) => {
502
+ const text = el.textContent || "";
503
+ if (text.trim().length < 200) return;
504
+ const score = getContentScore(el);
505
+ if (score > 0) {
506
+ candidates.push({ el, score });
507
+ }
508
+ });
509
+ candidates.sort((a, b) => b.score - a.score);
510
+ if (candidates.length > 0 && candidates[0].score > 20) {
511
+ return candidates[0].el;
512
+ }
513
+ return null;
514
+ }
515
+ function cleanHtml(html, baseUrl, options = {}) {
516
+ const {
517
+ removeAds = true,
518
+ removeBase64Images = true,
519
+ onlyMainContent = true,
520
+ includeTags,
521
+ excludeTags
522
+ } = options;
523
+ const { document } = parseHTML(html);
524
+ removeElements(document, ALWAYS_REMOVE_SELECTORS);
525
+ removeElements(document, OVERLAY_SELECTORS);
851
526
  if (removeAds) {
852
- for (const selector of AD_SELECTORS) {
527
+ removeElements(document, AD_SELECTORS);
528
+ }
529
+ if (excludeTags && excludeTags.length > 0) {
530
+ removeElements(document, excludeTags);
531
+ }
532
+ if (onlyMainContent) {
533
+ removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
534
+ const mainContent = findMainContent(document);
535
+ if (mainContent) {
536
+ const body = document.body;
537
+ if (body) {
538
+ const clone = mainContent.cloneNode(true);
539
+ body.innerHTML = "";
540
+ body.appendChild(clone);
541
+ }
542
+ }
543
+ }
544
+ if (includeTags && includeTags.length > 0) {
545
+ const matchedElements = [];
546
+ for (const selector of includeTags) {
853
547
  try {
854
- document.querySelectorAll(selector).forEach((el) => el.remove());
548
+ document.querySelectorAll(selector).forEach((el) => {
549
+ matchedElements.push(el.cloneNode(true));
550
+ });
855
551
  } catch {
856
552
  }
857
553
  }
554
+ if (matchedElements.length > 0) {
555
+ const body = document.body;
556
+ if (body) {
557
+ body.innerHTML = "";
558
+ matchedElements.forEach((el) => body.appendChild(el));
559
+ }
560
+ }
858
561
  }
859
562
  if (removeBase64Images) {
860
563
  removeBase64ImagesFromDocument(document);
@@ -879,7 +582,10 @@ function removeBase64ImagesFromDocument(document) {
879
582
  document.querySelectorAll("[style*='data:image']").forEach((el) => {
880
583
  const style = el.getAttribute("style");
881
584
  if (style) {
882
- const cleanedStyle = style.replace(/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi, "");
585
+ const cleanedStyle = style.replace(
586
+ /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
587
+ ""
588
+ );
883
589
  if (cleanedStyle.trim()) {
884
590
  el.setAttribute("style", cleanedStyle);
885
591
  } else {
@@ -916,7 +622,7 @@ function cleanContent(html, baseUrl, options = {}) {
916
622
  }
917
623
 
918
624
  // src/utils/metadata-extractor.ts
919
- import { parseHTML as parseHTML3 } from "linkedom";
625
+ import { parseHTML as parseHTML2 } from "linkedom";
920
626
 
921
627
  // src/utils/url-helpers.ts
922
628
  import { URL as URL2 } from "url";
@@ -989,8 +695,26 @@ function isSameDomain(url, baseUrl) {
989
695
  function getUrlKey(url) {
990
696
  try {
991
697
  const parsedUrl = new URL2(url);
698
+ parsedUrl.hash = "";
992
699
  parsedUrl.search = "";
993
- return parsedUrl.toString().toLowerCase();
700
+ if (parsedUrl.hostname.startsWith("www.")) {
701
+ parsedUrl.hostname = parsedUrl.hostname.slice(4);
702
+ }
703
+ if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
704
+ parsedUrl.port = "";
705
+ }
706
+ const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
707
+ for (const indexFile of indexFiles) {
708
+ if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
709
+ parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
710
+ break;
711
+ }
712
+ }
713
+ let normalized = parsedUrl.toString().toLowerCase();
714
+ if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
715
+ normalized = normalized.slice(0, -1);
716
+ }
717
+ return normalized;
994
718
  } catch {
995
719
  return url.toLowerCase();
996
720
  }
@@ -1225,7 +949,7 @@ function extractMetadata(html, baseUrl) {
1225
949
  return extractWebsiteMetadata(html, baseUrl);
1226
950
  }
1227
951
  function extractWebsiteMetadata(html, baseUrl) {
1228
- const { document } = parseHTML3(html);
952
+ const { document } = parseHTML2(html);
1229
953
  const metadata = {
1230
954
  title: null,
1231
955
  description: null,
@@ -1380,11 +1104,20 @@ function extractTwitterCard(document) {
1380
1104
 
1381
1105
  // src/utils/logger.ts
1382
1106
  import pino from "pino";
1107
+ function hasPinoPretty() {
1108
+ try {
1109
+ __require.resolve("pino-pretty");
1110
+ return true;
1111
+ } catch {
1112
+ return false;
1113
+ }
1114
+ }
1383
1115
  function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
1116
+ const usePretty = process.env.NODE_ENV !== "production" && hasPinoPretty();
1384
1117
  return pino({
1385
1118
  name,
1386
1119
  level,
1387
- transport: process.env.NODE_ENV !== "production" ? {
1120
+ transport: usePretty ? {
1388
1121
  target: "pino-pretty",
1389
1122
  options: {
1390
1123
  colorize: true,
@@ -1498,13 +1231,15 @@ function isUrlAllowed(url, rules) {
1498
1231
  var DEFAULT_OPTIONS = {
1499
1232
  urls: [],
1500
1233
  formats: ["markdown"],
1501
- includeMetadata: true,
1502
1234
  timeoutMs: 3e4,
1503
1235
  includePatterns: [],
1504
1236
  excludePatterns: [],
1505
1237
  // Content cleaning defaults
1506
1238
  removeAds: true,
1507
1239
  removeBase64Images: true,
1240
+ onlyMainContent: true,
1241
+ includeTags: [],
1242
+ excludeTags: [],
1508
1243
  skipTLSVerification: true,
1509
1244
  // Batch defaults
1510
1245
  batchConcurrency: 1,
@@ -1518,7 +1253,7 @@ var DEFAULT_OPTIONS = {
1518
1253
  showChrome: false
1519
1254
  };
1520
1255
  function isValidFormat(format) {
1521
- return format === "markdown" || format === "html" || format === "json" || format === "text";
1256
+ return format === "markdown" || format === "html";
1522
1257
  }
1523
1258
  function shouldCrawlUrl2(url, baseDomain) {
1524
1259
  return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
@@ -1667,14 +1402,9 @@ var Scraper = class {
1667
1402
  } catch {
1668
1403
  }
1669
1404
  await hero.waitForPaintingStable();
1670
- let hadChallenge = false;
1671
- let challengeType = "none";
1672
- let waitTimeMs = 0;
1673
1405
  const initialUrl = await hero.url;
1674
1406
  const detection = await detectChallenge(hero);
1675
1407
  if (detection.isChallenge) {
1676
- hadChallenge = true;
1677
- challengeType = detection.type;
1678
1408
  if (this.options.verbose) {
1679
1409
  this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1680
1410
  }
@@ -1684,12 +1414,11 @@ var Scraper = class {
1684
1414
  verbose: this.options.verbose,
1685
1415
  initialUrl
1686
1416
  });
1687
- waitTimeMs = result2.waitedMs;
1688
1417
  if (!result2.resolved) {
1689
1418
  throw new Error(`Challenge not resolved: ${detection.type}`);
1690
1419
  }
1691
1420
  if (this.options.verbose) {
1692
- this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
1421
+ this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1693
1422
  }
1694
1423
  }
1695
1424
  await this.waitForFinalPage(hero, url, this.options.verbose);
@@ -1702,45 +1431,18 @@ var Scraper = class {
1702
1431
  this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1703
1432
  }
1704
1433
  }
1705
- const pageTitle = await hero.document.title;
1706
1434
  const html = await hero.document.documentElement.outerHTML;
1707
1435
  const cleanedHtml = cleanContent(html, url, {
1708
1436
  removeAds: this.options.removeAds,
1709
- removeBase64Images: this.options.removeBase64Images
1437
+ removeBase64Images: this.options.removeBase64Images,
1438
+ onlyMainContent: this.options.onlyMainContent,
1439
+ includeTags: this.options.includeTags,
1440
+ excludeTags: this.options.excludeTags
1710
1441
  });
1711
1442
  const websiteMetadata = extractMetadata(cleanedHtml, url);
1712
1443
  const duration = Date.now() - startTime;
1713
- const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
1714
- const page = {
1715
- url,
1716
- title: pageTitle,
1717
- markdown: "",
1718
- // Will be set by formatter
1719
- html: cleanedHtml,
1720
- fetchedAt: scrapedAt,
1721
- depth: 0,
1722
- hadChallenge,
1723
- challengeType,
1724
- waitTimeMs
1725
- };
1726
- const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
1727
- [page],
1728
- url,
1729
- scrapedAt,
1730
- duration,
1731
- websiteMetadata,
1732
- this.options.includeMetadata
1733
- ) : void 0;
1734
- const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1735
- const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
1736
- const text = this.options.formats.includes("text") ? formatToText(
1737
- [page],
1738
- url,
1739
- scrapedAt,
1740
- duration,
1741
- websiteMetadata,
1742
- this.options.includeMetadata
1743
- ) : void 0;
1444
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1445
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1744
1446
  if (this.options.onProgress) {
1745
1447
  this.options.onProgress({
1746
1448
  completed: index + 1,
@@ -1772,8 +1474,6 @@ var Scraper = class {
1772
1474
  const result = {
1773
1475
  markdown,
1774
1476
  html: htmlOutput,
1775
- json,
1776
- text,
1777
1477
  metadata: {
1778
1478
  baseUrl: url,
1779
1479
  totalPages: 1,
@@ -1828,7 +1528,7 @@ async function scrape(options) {
1828
1528
  }
1829
1529
 
1830
1530
  // src/crawler.ts
1831
- import { parseHTML as parseHTML4 } from "linkedom";
1531
+ import { parseHTML as parseHTML3 } from "linkedom";
1832
1532
 
1833
1533
  // src/utils/rate-limiter.ts
1834
1534
  import pLimit2 from "p-limit";
@@ -1977,12 +1677,26 @@ var Crawler = class {
1977
1677
  */
1978
1678
  extractLinks(html, baseUrl, depth) {
1979
1679
  const links = [];
1980
- const { document } = parseHTML4(html);
1680
+ const { document } = parseHTML3(html);
1981
1681
  document.querySelectorAll("a[href]").forEach((anchor) => {
1982
- const href = anchor.getAttribute("href");
1682
+ const rawHref = anchor.getAttribute("href");
1683
+ if (!rawHref) return;
1684
+ const href = rawHref.trim();
1983
1685
  if (!href) return;
1984
- const resolved = resolveUrl(href, baseUrl);
1686
+ if (href.startsWith("#")) return;
1687
+ const lowerHref = href.toLowerCase();
1688
+ if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
1689
+ return;
1690
+ }
1691
+ let resolved = resolveUrl(href, baseUrl);
1985
1692
  if (!resolved || !isValidUrl(resolved)) return;
1693
+ try {
1694
+ const parsed = new URL(resolved);
1695
+ parsed.hash = "";
1696
+ resolved = parsed.toString();
1697
+ } catch {
1698
+ return;
1699
+ }
1986
1700
  if (!isSameDomain(resolved, this.options.url)) return;
1987
1701
  if (!isContentUrl(resolved)) return;
1988
1702
  if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
@@ -3030,16 +2744,251 @@ async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
3030
2744
  const client = new DaemonClient({ port, timeoutMs: 5e3 });
3031
2745
  return client.isRunning();
3032
2746
  }
2747
+
2748
+ // src/formatters/html.ts
2749
+ function formatToHTML(html) {
2750
+ return html;
2751
+ }
2752
+
2753
+ // src/errors.ts
2754
+ var ReaderErrorCode = /* @__PURE__ */ ((ReaderErrorCode2) => {
2755
+ ReaderErrorCode2["NETWORK_ERROR"] = "NETWORK_ERROR";
2756
+ ReaderErrorCode2["TIMEOUT"] = "TIMEOUT";
2757
+ ReaderErrorCode2["CONNECTION_REFUSED"] = "CONNECTION_REFUSED";
2758
+ ReaderErrorCode2["CLOUDFLARE_CHALLENGE"] = "CLOUDFLARE_CHALLENGE";
2759
+ ReaderErrorCode2["BOT_DETECTED"] = "BOT_DETECTED";
2760
+ ReaderErrorCode2["ACCESS_DENIED"] = "ACCESS_DENIED";
2761
+ ReaderErrorCode2["CONTENT_EXTRACTION_FAILED"] = "CONTENT_EXTRACTION_FAILED";
2762
+ ReaderErrorCode2["EMPTY_CONTENT"] = "EMPTY_CONTENT";
2763
+ ReaderErrorCode2["INVALID_URL"] = "INVALID_URL";
2764
+ ReaderErrorCode2["INVALID_OPTIONS"] = "INVALID_OPTIONS";
2765
+ ReaderErrorCode2["ROBOTS_BLOCKED"] = "ROBOTS_BLOCKED";
2766
+ ReaderErrorCode2["BROWSER_ERROR"] = "BROWSER_ERROR";
2767
+ ReaderErrorCode2["POOL_EXHAUSTED"] = "POOL_EXHAUSTED";
2768
+ ReaderErrorCode2["CLIENT_CLOSED"] = "CLIENT_CLOSED";
2769
+ ReaderErrorCode2["NOT_INITIALIZED"] = "NOT_INITIALIZED";
2770
+ ReaderErrorCode2["UNKNOWN"] = "UNKNOWN";
2771
+ return ReaderErrorCode2;
2772
+ })(ReaderErrorCode || {});
2773
+ var ReaderError = class extends Error {
2774
+ code;
2775
+ url;
2776
+ cause;
2777
+ timestamp;
2778
+ retryable;
2779
+ constructor(message, code, options) {
2780
+ super(message);
2781
+ this.name = "ReaderError";
2782
+ this.code = code;
2783
+ this.url = options?.url;
2784
+ this.cause = options?.cause;
2785
+ this.timestamp = (/* @__PURE__ */ new Date()).toISOString();
2786
+ this.retryable = options?.retryable ?? false;
2787
+ if (Error.captureStackTrace) {
2788
+ Error.captureStackTrace(this, this.constructor);
2789
+ }
2790
+ }
2791
+ /**
2792
+ * Convert to a plain object for serialization
2793
+ */
2794
+ toJSON() {
2795
+ return {
2796
+ name: this.name,
2797
+ code: this.code,
2798
+ message: this.message,
2799
+ url: this.url,
2800
+ timestamp: this.timestamp,
2801
+ retryable: this.retryable,
2802
+ cause: this.cause?.message,
2803
+ stack: this.stack
2804
+ };
2805
+ }
2806
+ };
2807
+ var NetworkError = class extends ReaderError {
2808
+ constructor(message, options) {
2809
+ super(message, "NETWORK_ERROR" /* NETWORK_ERROR */, {
2810
+ ...options,
2811
+ retryable: true
2812
+ });
2813
+ this.name = "NetworkError";
2814
+ }
2815
+ };
2816
+ var TimeoutError = class extends ReaderError {
2817
+ timeoutMs;
2818
+ constructor(message, timeoutMs, options) {
2819
+ super(message, "TIMEOUT" /* TIMEOUT */, {
2820
+ ...options,
2821
+ retryable: true
2822
+ });
2823
+ this.name = "TimeoutError";
2824
+ this.timeoutMs = timeoutMs;
2825
+ }
2826
+ toJSON() {
2827
+ return {
2828
+ ...super.toJSON(),
2829
+ timeoutMs: this.timeoutMs
2830
+ };
2831
+ }
2832
+ };
2833
+ var CloudflareError = class extends ReaderError {
2834
+ challengeType;
2835
+ constructor(challengeType, options) {
2836
+ super(
2837
+ `Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`,
2838
+ "CLOUDFLARE_CHALLENGE" /* CLOUDFLARE_CHALLENGE */,
2839
+ {
2840
+ ...options,
2841
+ retryable: true
2842
+ }
2843
+ );
2844
+ this.name = "CloudflareError";
2845
+ this.challengeType = challengeType;
2846
+ }
2847
+ toJSON() {
2848
+ return {
2849
+ ...super.toJSON(),
2850
+ challengeType: this.challengeType
2851
+ };
2852
+ }
2853
+ };
2854
+ var AccessDeniedError = class extends ReaderError {
2855
+ statusCode;
2856
+ constructor(message, options) {
2857
+ super(message, "ACCESS_DENIED" /* ACCESS_DENIED */, {
2858
+ ...options,
2859
+ retryable: false
2860
+ });
2861
+ this.name = "AccessDeniedError";
2862
+ this.statusCode = options?.statusCode;
2863
+ }
2864
+ toJSON() {
2865
+ return {
2866
+ ...super.toJSON(),
2867
+ statusCode: this.statusCode
2868
+ };
2869
+ }
2870
+ };
2871
+ var ContentExtractionError = class extends ReaderError {
2872
+ constructor(message, options) {
2873
+ super(message, "CONTENT_EXTRACTION_FAILED" /* CONTENT_EXTRACTION_FAILED */, {
2874
+ ...options,
2875
+ retryable: false
2876
+ });
2877
+ this.name = "ContentExtractionError";
2878
+ }
2879
+ };
2880
+ var ValidationError = class extends ReaderError {
2881
+ field;
2882
+ constructor(message, options) {
2883
+ super(message, "INVALID_OPTIONS" /* INVALID_OPTIONS */, {
2884
+ url: options?.url,
2885
+ retryable: false
2886
+ });
2887
+ this.name = "ValidationError";
2888
+ this.field = options?.field;
2889
+ }
2890
+ toJSON() {
2891
+ return {
2892
+ ...super.toJSON(),
2893
+ field: this.field
2894
+ };
2895
+ }
2896
+ };
2897
+ var InvalidUrlError = class extends ReaderError {
2898
+ constructor(url, reason) {
2899
+ super(reason ? `Invalid URL "${url}": ${reason}` : `Invalid URL: ${url}`, "INVALID_URL" /* INVALID_URL */, {
2900
+ url,
2901
+ retryable: false
2902
+ });
2903
+ this.name = "InvalidUrlError";
2904
+ }
2905
+ };
2906
+ var RobotsBlockedError = class extends ReaderError {
2907
+ constructor(url) {
2908
+ super(`URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`, "ROBOTS_BLOCKED" /* ROBOTS_BLOCKED */, {
2909
+ url,
2910
+ retryable: false
2911
+ });
2912
+ this.name = "RobotsBlockedError";
2913
+ }
2914
+ };
2915
+ var BrowserPoolError = class extends ReaderError {
2916
+ constructor(message, options) {
2917
+ super(message, "BROWSER_ERROR" /* BROWSER_ERROR */, {
2918
+ ...options,
2919
+ retryable: true
2920
+ });
2921
+ this.name = "BrowserPoolError";
2922
+ }
2923
+ };
2924
+ var ClientClosedError = class extends ReaderError {
2925
+ constructor() {
2926
+ super("ReaderClient has been closed. Create a new instance to continue.", "CLIENT_CLOSED" /* CLIENT_CLOSED */, {
2927
+ retryable: false
2928
+ });
2929
+ this.name = "ClientClosedError";
2930
+ }
2931
+ };
2932
+ var NotInitializedError = class extends ReaderError {
2933
+ constructor(component) {
2934
+ super(`${component} not initialized. This should not happen - please report this bug.`, "NOT_INITIALIZED" /* NOT_INITIALIZED */, {
2935
+ retryable: false
2936
+ });
2937
+ this.name = "NotInitializedError";
2938
+ }
2939
+ };
2940
+ function wrapError(error, url) {
2941
+ if (error instanceof ReaderError) {
2942
+ return error;
2943
+ }
2944
+ if (error instanceof Error) {
2945
+ const message = error.message.toLowerCase();
2946
+ if (message.includes("timeout") || message.includes("timed out")) {
2947
+ return new TimeoutError(error.message, 3e4, { url, cause: error });
2948
+ }
2949
+ if (message.includes("econnrefused") || message.includes("connection refused")) {
2950
+ return new NetworkError(`Connection refused: ${error.message}`, { url, cause: error });
2951
+ }
2952
+ if (message.includes("enotfound") || message.includes("dns")) {
2953
+ return new NetworkError(`DNS lookup failed: ${error.message}`, { url, cause: error });
2954
+ }
2955
+ if (message.includes("cloudflare") || message.includes("challenge")) {
2956
+ return new CloudflareError("unknown", { url, cause: error });
2957
+ }
2958
+ return new ReaderError(error.message, "UNKNOWN" /* UNKNOWN */, {
2959
+ url,
2960
+ cause: error,
2961
+ retryable: false
2962
+ });
2963
+ }
2964
+ return new ReaderError(String(error), "UNKNOWN" /* UNKNOWN */, {
2965
+ url,
2966
+ retryable: false
2967
+ });
2968
+ }
3033
2969
  export {
2970
+ AccessDeniedError,
3034
2971
  BrowserPool,
2972
+ BrowserPoolError,
2973
+ ClientClosedError,
2974
+ CloudflareError,
2975
+ ContentExtractionError,
3035
2976
  Crawler,
3036
2977
  DEFAULT_DAEMON_PORT,
3037
2978
  DEFAULT_OPTIONS,
3038
2979
  DaemonClient,
3039
2980
  DaemonServer,
3040
2981
  BrowserPool as HeroBrowserPool,
2982
+ InvalidUrlError,
2983
+ NetworkError,
2984
+ NotInitializedError,
3041
2985
  ReaderClient,
2986
+ ReaderError,
2987
+ ReaderErrorCode,
2988
+ RobotsBlockedError,
3042
2989
  Scraper,
2990
+ TimeoutError,
2991
+ ValidationError,
3043
2992
  cleanContent,
3044
2993
  crawl,
3045
2994
  createHeroConfig,
@@ -3047,14 +2996,12 @@ export {
3047
2996
  detectChallenge,
3048
2997
  extractMetadata,
3049
2998
  formatToHTML,
3050
- formatToJson,
3051
- formatToJsonLite,
3052
2999
  formatToMarkdown,
3053
- formatToText,
3054
3000
  getDaemonInfo,
3055
3001
  getPidFilePath,
3056
3002
  getUrlKey,
3057
3003
  handleChallenge,
3004
+ htmlToMarkdown,
3058
3005
  isChallengePage,
3059
3006
  isDaemonRunning,
3060
3007
  isSameDomain,
@@ -3068,6 +3015,7 @@ export {
3068
3015
  shouldCrawlUrl2 as shouldCrawlUrlFn,
3069
3016
  validateUrls,
3070
3017
  waitForChallengeResolution,
3071
- waitForSelector
3018
+ waitForSelector,
3019
+ wrapError
3072
3020
  };
3073
3021
  //# sourceMappingURL=index.js.map