@vakra-dev/reader 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,3073 @@
1
+ // src/client.ts
2
+ import HeroCore from "@ulixee/hero-core";
3
+ import { TransportBridge } from "@ulixee/net";
4
+ import { ConnectionToHeroCore } from "@ulixee/hero";
5
+
6
+ // src/scraper.ts
7
+ import pLimit from "p-limit";
8
+
9
+ // src/cloudflare/detector.ts
10
+ var CHALLENGE_DOM_SELECTORS = [
11
+ "#challenge-running",
12
+ "#challenge-stage",
13
+ "#challenge-form",
14
+ ".cf-browser-verification"
15
+ ];
16
+ var CHALLENGE_TEXT_PATTERNS = [
17
+ "verifying you are human",
18
+ "checking if the site connection is secure",
19
+ "this process is automatic. your browser will redirect"
20
+ ];
21
+ var BLOCKED_SIGNALS = [
22
+ "you have been blocked",
23
+ "access to this page has been denied",
24
+ "sorry, you have been blocked",
25
+ "access denied",
26
+ "403 forbidden"
27
+ ];
28
+ async function detectChallenge(hero) {
29
+ const signals = [];
30
+ let type = "none";
31
+ try {
32
+ if (!hero.document) {
33
+ return {
34
+ isChallenge: false,
35
+ type: "none",
36
+ confidence: 0,
37
+ signals: ["No document available"]
38
+ };
39
+ }
40
+ const html = await hero.document.documentElement.outerHTML;
41
+ const htmlLower = html.toLowerCase();
42
+ for (const selector of CHALLENGE_DOM_SELECTORS) {
43
+ if (htmlLower.includes(selector.toLowerCase())) {
44
+ signals.push(`Challenge element: ${selector}`);
45
+ type = "js_challenge";
46
+ }
47
+ }
48
+ for (const pattern of CHALLENGE_TEXT_PATTERNS) {
49
+ if (htmlLower.includes(pattern)) {
50
+ signals.push(`Challenge text: "${pattern}"`);
51
+ type = type === "none" ? "js_challenge" : type;
52
+ }
53
+ }
54
+ if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
55
+ signals.push('Challenge text: "waiting for...to respond"');
56
+ type = type === "none" ? "js_challenge" : type;
57
+ }
58
+ for (const pattern of BLOCKED_SIGNALS) {
59
+ if (htmlLower.includes(pattern)) {
60
+ signals.push(`Blocked: "${pattern}"`);
61
+ type = "blocked";
62
+ break;
63
+ }
64
+ }
65
+ const isChallenge = signals.length > 0;
66
+ const confidence = isChallenge ? 100 : 0;
67
+ return {
68
+ isChallenge,
69
+ type: isChallenge ? type : "none",
70
+ confidence,
71
+ signals
72
+ };
73
+ } catch (error) {
74
+ return {
75
+ isChallenge: false,
76
+ type: "none",
77
+ confidence: 0,
78
+ signals: [`Error during detection: ${error.message}`]
79
+ };
80
+ }
81
+ }
82
/**
 * Convenience wrapper around detectChallenge that resolves to just the
 * boolean `isChallenge` verdict.
 */
async function isChallengePage(hero) {
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge;
}
86
+
87
+ // src/cloudflare/handler.ts
88
/**
 * Polls the page until the challenge is gone or `maxWaitMs` elapses.
 *
 * A challenge counts as resolved when either the URL differs from
 * `options.initialUrl` ("url_redirect") or detectChallenge no longer reports
 * a challenge ("signals_cleared"). In both cases the page is given a chance
 * to reach DOMContentLoaded and stable painting before returning.
 *
 * @param hero Browser handle (uses `url`, `waitForLoad`, `waitForPaintingStable`).
 * @param options `{ maxWaitMs = 45000, pollIntervalMs = 500, verbose = false, initialUrl }`.
 * @returns `{ resolved, method, waitedMs }`; `method` is "timeout" when unresolved.
 */
async function waitForChallengeResolution(hero, options) {
  const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
  const startedAt = Date.now();
  const log = (msg) => verbose && console.log(` ${msg}`);
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Shared tail of both "resolved" paths: best-effort wait for the page to settle.
  const settlePage = async (waitingMessage) => {
    log(waitingMessage);
    try {
      await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
      log(` DOMContentLoaded`);
    } catch {
      log(` DOMContentLoaded timeout, continuing...`);
    }
    await hero.waitForPaintingStable().catch(() => {
    });
    log(` Page stabilized`);
  };

  while (Date.now() - startedAt < maxWaitMs) {
    // Captured once per iteration; this is the value reported as waitedMs on success.
    const elapsed = Date.now() - startedAt;
    try {
      const currentUrl = await hero.url;
      if (currentUrl !== initialUrl) {
        log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
        await settlePage(` Waiting for new page to load...`);
        return { resolved: true, method: "url_redirect", waitedMs: elapsed };
      }
    } catch {
      // Failure while checking the URL is ignored; fall through to signal detection.
    }

    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
      await settlePage(` Waiting for page to load...`);
      return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
    }

    log(
      `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
    );
    await sleep(pollIntervalMs);
  }

  return {
    resolved: false,
    method: "timeout",
    waitedMs: Date.now() - startedAt
  };
}
138
+ async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
139
+ const startTime = Date.now();
140
+ const log = (msg) => verbose && console.log(` ${msg}`);
141
+ log(`Waiting for selector: "${selector}"`);
142
+ while (Date.now() - startTime < maxWaitMs) {
143
+ try {
144
+ const element = await hero.document.querySelector(selector);
145
+ if (element) {
146
+ const elapsed = Date.now() - startTime;
147
+ log(`\u2713 Selector found after ${(elapsed / 1e3).toFixed(1)}s`);
148
+ return { found: true, waitedMs: elapsed };
149
+ }
150
+ } catch {
151
+ }
152
+ await new Promise((resolve) => setTimeout(resolve, 300));
153
+ }
154
+ log(`\u2717 Selector not found within timeout`);
155
+ return { found: false, waitedMs: Date.now() - startTime };
156
+ }
157
/**
 * Entry point for challenge handling: records the current URL, and if the
 * page currently looks like a challenge, waits for it to resolve.
 *
 * @param options Forwarded to waitForChallengeResolution (with `initialUrl` added).
 * @returns `{ resolved, method, waitedMs }`; resolves immediately with
 *   method "signals_cleared" when no challenge is detected.
 */
async function handleChallenge(hero, options = {}) {
  const initialUrl = await hero.url;
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge
    ? waitForChallengeResolution(hero, { ...options, initialUrl })
    : { resolved: true, method: "signals_cleared", waitedMs: 0 };
}
168
+
169
+ // src/formatters/markdown.ts
170
+ import TurndownService from "turndown";
171
// Options for the shared HTML-to-Markdown converter: ATX headings, "-" bullets,
// fenced code blocks and inline links.
var TURNDOWN_OPTIONS = {
  headingStyle: "atx",
  hr: "---",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
  fence: "```",
  emDelimiter: "*",
  strongDelimiter: "**",
  linkStyle: "inlined",
  linkReferenceStyle: "full"
};
// Single converter instance reused by htmlToMarkdown.
var turndownService = new TurndownService(TURNDOWN_OPTIONS);
182
/**
 * Assembles the Markdown report: optional metadata header, a table of
 * contents when there is more than one page, then one section per page.
 * Sections are separated by a blank line.
 */
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
  const headerPart = includeMetadata
    ? [createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length)]
    : [];
  const tocPart = pages.length > 1 ? [createMarkdownTOC(pages)] : [];
  const pageParts = pages.map((page, index) => createMarkdownPage(page, index + 1));
  return [...headerPart, ...tocPart, ...pageParts].join("\n\n");
}
193
/**
 * Builds the report header: an H1 with the site title (or the base URL's
 * hostname) followed by one bold-labelled line per metadata field.
 * Description, author and language lines are emitted only when set.
 */
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
  const title = website.title || extractDomainFromUrl(baseUrl);
  const lines = [
    `# Website Scrape: ${title}`,
    "",
    `**Base URL:** ${baseUrl}`,
    `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}`,
    `**Duration:** ${duration}ms`,
    `**Total pages:** ${totalPages}`
  ];
  if (website.description) {
    lines.push(`**Description:** ${website.description}`);
  }
  if (website.author) {
    lines.push(`**Author:** ${website.author}`);
  }
  if (website.language) {
    lines.push(`**Language:** ${website.language}`);
  }
  // Every line, including the last, is newline-terminated.
  return lines.map((line) => `${line}\n`).join("");
}
221
/**
 * Renders a numbered table of contents. Each entry is indented by
 * `page.depth` repetitions of the indent string and links to the
 * `#page-<n>-<slug>` anchor built from the page title.
 */
function createMarkdownTOC(pages) {
  const entries = pages.map((page, index) => {
    const indent = " ".repeat(page.depth);
    const pageNumber = index + 1;
    const title = page.title || `Page ${pageNumber}`;
    // Slug: drop reserved punctuation, lower-case, dash-separate, keep [a-z0-9-] only.
    const anchor = title
      .replace(/[#[\]/\\:*?"<>|]/g, "")
      .trim()
      .toLowerCase()
      .replace(/\s+/g, "-")
      .replace(/[^a-z0-9-]/g, "");
    return `${indent}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})\n`;
  });
  return "## Table of Contents\n\n" + entries.join("");
}
234
/**
 * Renders one scraped page as a Markdown section: a rule, an H2 carrying the
 * `{#page-<n>-<slug>}` anchor, a metadata block, another rule, and then the
 * page HTML converted to Markdown.
 *
 * @param page Page record; reads `title`, `url`, `depth`, `fetchedAt`, `html`.
 * @param pageNumber 1-based position of the page in the report.
 */
function createMarkdownPage(page, pageNumber) {
  const title = page.title || `Page ${pageNumber}`;
  const anchor = title
    .replace(/[#[\]/\\:*?"<>|]/g, "")
    .trim()
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  const preamble =
    "---\n\n" +
    `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}\n\n` +
    `**URL:** ${page.url}\n` +
    // Use the resolved title so untitled pages do not print "undefined" here.
    `**Title:** ${title}\n` +
    `**Depth:** ${page.depth}\n` +
    `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}\n\n` +
    "---\n\n";
  return preamble + htmlToMarkdown(page.html);
}
260
/**
 * Converts HTML to Markdown with the shared Turndown instance. If conversion
 * throws, logs a warning and falls back to the input with anything that
 * looks like a tag stripped out.
 */
function htmlToMarkdown(html) {
  let markdown;
  try {
    markdown = turndownService.turndown(html);
  } catch (error) {
    console.warn("Error converting HTML to Markdown:", error);
    markdown = html.replace(/<[^>]*>/g, "").trim();
  }
  return markdown;
}
268
/** Returns the hostname of `url`, or "Unknown" when it cannot be parsed. */
function extractDomainFromUrl(url) {
  let hostname = "Unknown";
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input keeps the fallback label.
  }
  return hostname;
}
275
+
276
+ // src/formatters/html.ts
277
/**
 * Builds a self-contained HTML report for a scrape: document head (meta tags
 * and inline CSS), a header with scrape metadata, an optional table of
 * contents (more than one page), one article per page, a footer and the
 * inline navigation script.
 *
 * Scraped metadata is untrusted, so every metadata value interpolated into
 * markup is HTML-escaped here, including the document <title>, the `lang`
 * attribute and the declared charset. Page bodies are emitted by
 * generatePageHTML.
 *
 * @returns The complete HTML document as a string.
 */
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
  const displayTitle = escapeHtml(website.title || extractDomainFromUrl2(baseUrl));
  // String(...) guards against non-string metadata values; presumably these are
  // always strings, but that is not visible from here.
  const lang = escapeHtml(String(website.language || "en"));
  const charset = escapeHtml(String(website.charset || "UTF-8"));
  const safeBaseUrl = escapeHtml(baseUrl);
  const optionalMeta = [
    website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : "",
    website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : "",
    website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""
  ];
  const html = `<!DOCTYPE html>
<html lang="${lang}">
<head>
  <meta charset="${charset}">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Scrape: ${displayTitle}</title>
  ${generateMetaTags(website)}
  <style>
    ${generateCSS()}
  </style>
</head>
<body>
  <header class="header">
    <h1>Website Scrape: ${displayTitle}</h1>
    <div class="meta-info">
      <p><strong>Base URL:</strong> <a href="${safeBaseUrl}" target="_blank">${safeBaseUrl}</a></p>
      <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
      <p><strong>Duration:</strong> ${duration}ms</p>
      <p><strong>Total pages:</strong> ${pages.length}</p>
      ${optionalMeta[0]}
      ${optionalMeta[1]}
      ${optionalMeta[2]}
    </div>
  </header>

  ${pages.length > 1 ? generateTOC(pages) : ""}

  <main class="content">
    ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
  </main>

  <footer class="footer">
    <p>Generated by Reader JS/TS SDK</p>
  </footer>

  <script>
    ${generateJavaScript()}
  </script>
</body>
</html>`;
  return html;
}
322
/**
 * Produces the `<meta>` / `<link>` tags that mirror the scraped site's
 * metadata (basic, Open Graph and Twitter fields). A tag is emitted only for
 * fields with a truthy value; all values are passed through escapeHtml.
 *
 * @returns The tags joined with a newline plus indent separator.
 */
function generateMetaTags(website) {
  const og = website.openGraph;
  const twitter = website.twitter;
  const passThrough = (value) => value;
  const joinKeywords = (list) => list.join(", ");

  // [kind, key, raw value, optional raw -> content converter], in output order.
  // kind is the <meta> attribute name ("name" / "property") or "link" for <link rel>.
  const rows = [
    ["name", "description", website.description],
    ["name", "author", website.author],
    ["name", "keywords", website.keywords, joinKeywords],
    ["name", "robots", website.robots],
    ["name", "theme-color", website.themeColor],
    ["link", "icon", website.favicon],
    ["link", "canonical", website.canonical],
    ["property", "og:title", og?.title],
    ["property", "og:description", og?.description],
    ["property", "og:type", og?.type],
    ["property", "og:url", og?.url],
    ["property", "og:image", og?.image],
    ["property", "og:site_name", og?.siteName],
    ["property", "og:locale", og?.locale],
    ["name", "twitter:card", twitter?.card],
    ["name", "twitter:site", twitter?.site],
    ["name", "twitter:creator", twitter?.creator],
    ["name", "twitter:title", twitter?.title],
    ["name", "twitter:description", twitter?.description],
    ["name", "twitter:image", twitter?.image]
  ];

  const tags = [];
  for (const [kind, key, raw, toContent = passThrough] of rows) {
    if (!raw) {
      continue;
    }
    const content = escapeHtml(toContent(raw));
    tags.push(
      kind === "link"
        ? `<link rel="${key}" href="${content}">`
        : `<meta ${kind}="${key}" content="${content}">`
    );
  }
  return tags.join("\n ");
}
372
+ function generateCSS() { // Returns the fixed stylesheet text that formatToHTML places inside the report's <style> element; takes no input.
373
+ return `
374
+ * {
375
+ margin: 0;
376
+ padding: 0;
377
+ box-sizing: border-box;
378
+ }
379
+
380
+ body {
381
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
382
+ line-height: 1.6;
383
+ color: #333;
384
+ background-color: #f8f9fa;
385
+ }
386
+
387
+ .header {
388
+ background: white;
389
+ padding: 2rem;
390
+ border-bottom: 1px solid #e9ecef;
391
+ margin-bottom: 2rem;
392
+ }
393
+
394
+ .header h1 {
395
+ color: #2c3e50;
396
+ margin-bottom: 1rem;
397
+ font-size: 2rem;
398
+ }
399
+
400
+ .meta-info {
401
+ display: grid;
402
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
403
+ gap: 0.5rem;
404
+ }
405
+
406
+ .meta-info p {
407
+ margin: 0.25rem 0;
408
+ font-size: 0.9rem;
409
+ color: #6c757d;
410
+ }
411
+
412
+ .toc {
413
+ background: white;
414
+ padding: 1.5rem;
415
+ margin: 2rem 0;
416
+ border-radius: 8px;
417
+ border: 1px solid #e9ecef;
418
+ }
419
+
420
+ .toc h2 {
421
+ color: #2c3e50;
422
+ margin-bottom: 1rem;
423
+ font-size: 1.25rem;
424
+ }
425
+
426
+ .toc ul {
427
+ list-style: none;
428
+ }
429
+
430
+ .toc li {
431
+ margin: 0.5rem 0;
432
+ }
433
+
434
+ .toc a {
435
+ color: #007bff;
436
+ text-decoration: none;
437
+ transition: color 0.2s;
438
+ }
439
+
440
+ .toc a:hover {
441
+ color: #0056b3;
442
+ text-decoration: underline;
443
+ }
444
+
445
+ .content {
446
+ max-width: 800px;
447
+ margin: 0 auto;
448
+ padding: 0 1rem;
449
+ }
450
+
451
+ .page {
452
+ background: white;
453
+ margin: 2rem 0;
454
+ padding: 2rem;
455
+ border-radius: 8px;
456
+ border: 1px solid #e9ecef;
457
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
458
+ }
459
+
460
+ .page-header {
461
+ border-bottom: 2px solid #e9ecef;
462
+ padding-bottom: 1rem;
463
+ margin-bottom: 2rem;
464
+ }
465
+
466
+ .page-header h2 {
467
+ color: #2c3e50;
468
+ margin-bottom: 0.5rem;
469
+ font-size: 1.5rem;
470
+ }
471
+
472
+ .page-meta {
473
+ display: flex;
474
+ flex-wrap: wrap;
475
+ gap: 1rem;
476
+ font-size: 0.9rem;
477
+ color: #6c757d;
478
+ }
479
+
480
+ .page-content {
481
+ line-height: 1.8;
482
+ }
483
+
484
+ .page-content h1, .page-content h2, .page-content h3,
485
+ .page-content h4, .page-content h5, .page-content h6 {
486
+ color: #2c3e50;
487
+ margin: 1.5rem 0 0.5rem 0;
488
+ }
489
+
490
+ .page-content p {
491
+ margin: 1rem 0;
492
+ }
493
+
494
+ .page-content a {
495
+ color: #007bff;
496
+ text-decoration: none;
497
+ }
498
+
499
+ .page-content a:hover {
500
+ text-decoration: underline;
501
+ }
502
+
503
+ .page-content code {
504
+ background: #f8f9fa;
505
+ padding: 0.2rem 0.4rem;
506
+ border-radius: 4px;
507
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
508
+ font-size: 0.9em;
509
+ }
510
+
511
+ .page-content pre {
512
+ background: #f8f9fa;
513
+ padding: 1rem;
514
+ border-radius: 4px;
515
+ overflow-x: auto;
516
+ margin: 1rem 0;
517
+ }
518
+
519
+ .page-content blockquote {
520
+ border-left: 4px solid #007bff;
521
+ padding-left: 1rem;
522
+ margin: 1rem 0;
523
+ color: #6c757d;
524
+ }
525
+
526
+ .footer {
527
+ text-align: center;
528
+ padding: 2rem;
529
+ margin-top: 3rem;
530
+ border-top: 1px solid #e9ecef;
531
+ color: #6c757d;
532
+ font-size: 0.9rem;
533
+ }
534
+
535
+ @media (max-width: 768px) {
536
+ .header {
537
+ padding: 1rem;
538
+ }
539
+
540
+ .header h1 {
541
+ font-size: 1.5rem;
542
+ }
543
+
544
+ .page {
545
+ padding: 1rem;
546
+ }
547
+
548
+ .page-meta {
549
+ flex-direction: column;
550
+ gap: 0.5rem;
551
+ }
552
+ }
553
+ `.trim(); // trim() drops the blank line that opens the literal and the trailing whitespace before the closing backtick
554
+ }
555
+ function generateTOC(pages) { // Builds the <nav class="toc"> block: one list item per page linking to that page's in-document anchor.
556
+ const tocItems = pages.map((page, index) => {
557
+ const pageNumber = index + 1; // entries are numbered from 1
558
+ const title = page.title || `Page ${pageNumber}`; // untitled pages fall back to "Page <n>"
559
+ const id = `page-${pageNumber}`; // same id scheme generatePageHTML uses for the <article> element
560
+ return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
561
+ }).join("\n");
562
+ return `
563
+ <nav class="toc">
564
+ <h2>Table of Contents</h2>
565
+ <ul>
566
+ ${tocItems}
567
+ </ul>
568
+ </nav>`;
569
+ }
570
+ function generatePageHTML(page, pageNumber) { // Renders one page as <article id="page-<n>">; title and url go through escapeHtml, while page.html is inserted as-is.
571
+ const id = `page-${pageNumber}`; // anchor target for the TOC links built in generateTOC
572
+ const title = page.title || `Page ${pageNumber}`; // untitled pages fall back to "Page <n>"
573
+ return `
574
+ <article class="page" id="${id}">
575
+ <div class="page-header">
576
+ <h2>${pageNumber}. ${escapeHtml(title)}</h2>
577
+ <div class="page-meta">
578
+ <span><strong>URL:</strong> <a href="${escapeHtml(
579
+ page.url
580
+ )}" target="_blank">${escapeHtml(page.url)}</a></span>
581
+ <span><strong>Depth:</strong> ${page.depth}</span>
582
+ <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
583
+ </div>
584
+ </div>
585
+ <div class="page-content">
586
+ ${page.html}
587
+ </div>
588
+ </article>`;
589
+ }
590
+ function generateJavaScript() { // Returns the inline script source that formatToHTML places inside <script>: smooth scrolling for "#" links and bolding of the TOC link whose target is the last .page at or above 100px from the viewport top. NOTE(review): the click handler passes the raw href to querySelector, which would throw for a bare "#" link inside scraped content; confirm whether that can occur.
591
+ return `
592
+ // Smooth scrolling for TOC links
593
+ document.querySelectorAll('a[href^="#"]').forEach(anchor => {
594
+ anchor.addEventListener('click', function (e) {
595
+ e.preventDefault();
596
+ const target = document.querySelector(this.getAttribute('href'));
597
+ if (target) {
598
+ target.scrollIntoView({
599
+ behavior: 'smooth',
600
+ block: 'start'
601
+ });
602
+ }
603
+ });
604
+ });
605
+
606
+ // Highlight current section in TOC
607
+ window.addEventListener('scroll', function() {
608
+ const pages = document.querySelectorAll('.page');
609
+ const tocLinks = document.querySelectorAll('.toc a');
610
+
611
+ let currentPage = null;
612
+ pages.forEach(page => {
613
+ const rect = page.getBoundingClientRect();
614
+ if (rect.top <= 100) {
615
+ currentPage = page;
616
+ }
617
+ });
618
+
619
+ tocLinks.forEach(link => {
620
+ link.style.fontWeight = 'normal';
621
+ const target = document.querySelector(link.getAttribute('href'));
622
+ if (target === currentPage) {
623
+ link.style.fontWeight = 'bold';
624
+ }
625
+ });
626
+ });
627
+ `;
628
+ }
629
/**
 * Escapes text for safe inclusion in HTML element content and quoted
 * attribute values.
 *
 * Replaces `&`, `<`, `>`, `"`, `'` and `/` with their entity forms in a
 * single pass, so already-produced entities are never escaped twice.
 *
 * @param text Value to escape; `null`/`undefined` become "", other
 *   non-strings are stringified first.
 * @returns The escaped string.
 */
function escapeHtml(text) {
  const entityFor = new Map([
    ["&", "&amp;"],
    ["<", "&lt;"],
    [">", "&gt;"],
    ['"', "&quot;"],
    ["'", "&#039;"],
    ["/", "&#x2F;"]
  ]);
  const input = text == null ? "" : String(text);
  return input.replace(/[&<>"'/]/g, (ch) => entityFor.get(ch) ?? ch);
}
632
/** Returns the hostname of `url`, or "Unknown" when it cannot be parsed. */
function extractDomainFromUrl2(url) {
  let hostname = "Unknown";
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input keeps the fallback label.
  }
  return hostname;
}
639
+
640
+ // src/formatters/json.ts
641
/**
 * Serializes the scrape as pretty-printed JSON: a `metadata` object plus one
 * entry per page (1-based `index`, content fields, and word-count /
 * reading-time figures derived from the page's Markdown).
 */
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
  const metadata = {
    baseUrl,
    totalPages: pages.length,
    scrapedAt,
    duration,
    website
  };
  const pageEntries = pages.map(({ url, title, markdown, html, fetchedAt, depth }, position) => ({
    index: position + 1,
    url,
    title,
    markdown,
    html,
    fetchedAt,
    depth,
    wordCount: countWords(markdown),
    readingTime: estimateReadingTime(markdown)
  }));
  return JSON.stringify({ metadata, pages: pageEntries }, null, 2);
}
664
/**
 * Same shape as formatToJson but without each page's raw `html`, for a
 * smaller payload.
 */
function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
  const metadata = {
    baseUrl,
    totalPages: pages.length,
    scrapedAt,
    duration,
    website
  };
  const pageEntries = pages.map(({ url, title, markdown, fetchedAt, depth }, position) => ({
    index: position + 1,
    url,
    title,
    markdown,
    fetchedAt,
    depth,
    wordCount: countWords(markdown),
    readingTime: estimateReadingTime(markdown)
  }));
  return JSON.stringify({ metadata, pages: pageEntries }, null, 2);
}
686
/**
 * Counts the prose words in a Markdown string.
 *
 * Markdown syntax is stripped before counting. The order matters:
 * fenced code blocks are removed before inline-code handling (otherwise the
 * fences are consumed as empty inline spans and the code gets counted), and
 * images are unwrapped before links (otherwise the link rule leaves a stray
 * "!" in front of the alt text).
 *
 * @param markdown Markdown source; non-string or empty input counts as 0.
 * @returns Number of whitespace-separated words remaining.
 */
function countWords(markdown) {
  if (typeof markdown !== "string" || markdown.length === 0) {
    return 0;
  }
  const plainText = markdown
    // Block-level: drop fenced code entirely.
    .replace(/```[\s\S]*?```/g, "")
    // Images, then links: keep alt text / link text only.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    // Heading markers, list bullets, ordered-list numbers, blockquote markers.
    .replace(/#{1,6}\s+/g, "")
    .replace(/^\s*[-*+]\s+/gm, "")
    .replace(/^\s*\d+\.\s+/gm, "")
    .replace(/^\s*>\s+/gm, "")
    // Inline emphasis and inline code: keep the inner text.
    .replace(/\*\*(.*?)\*\*/g, "$1")
    .replace(/\*(.*?)\*/g, "$1")
    .replace(/`(.*?)`/g, "$1")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
  return plainText.split(/\s+/).filter((word) => word.length > 0).length;
}
690
/**
 * Estimates reading time as the word count divided by 200, rounded up to a
 * whole number.
 */
function estimateReadingTime(markdown) {
  const WORDS_PER_UNIT = 200;
  return Math.ceil(countWords(markdown) / WORDS_PER_UNIT);
}
694
+
695
+ // src/formatters/text.ts
696
+ import { parseHTML } from "linkedom";
697
/**
 * Assembles the plain-text report: optional metadata header followed by one
 * section per page (with separators only when there are several pages).
 * Sections are separated by a blank line.
 */
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
  const headerPart = includeMetadata
    ? [createTextHeader(baseUrl, scrapedAt, duration, website, pages.length)]
    : [];
  const pageParts = pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1));
  return [...headerPart, ...pageParts].join("\n\n");
}
705
/**
 * Builds the plain-text report header: a `=== title ===` banner, a blank
 * line, then one "Label: value" line per metadata field. Description, author
 * and language are included only when set.
 */
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
  const title = website.title || extractDomainFromUrl3(baseUrl);
  const optionalFields = [
    ["Description", website.description],
    ["Author", website.author],
    ["Language", website.language]
  ];
  const lines = [
    `=== ${title} ===`,
    "",
    `URL: ${baseUrl}`,
    `Scraped: ${new Date(scrapedAt).toLocaleString()}`,
    `Duration: ${duration}ms`,
    `Pages: ${totalPages}`,
    ...optionalFields
      .filter(([, value]) => value)
      .map(([label, value]) => `${label}: ${value}`)
  ];
  return lines.join("\n");
}
725
/**
 * Renders one page as plain text. When `showSeparator` is set, the text is
 * preceded by a banner: a 60-character rule, the page number/title, the URL
 * and another rule.
 */
function createTextPage(page, pageNumber, showSeparator) {
  const rule = "\u2500".repeat(60);
  const banner = showSeparator
    ? [rule, `Page ${pageNumber}: ${page.title || "Untitled"}`, `URL: ${page.url}`, rule]
    : [];
  return [...banner, htmlToPlainText(page.html)].join("\n");
}
737
/**
 * Extracts readable text from HTML: parses it, removes non-content elements
 * (script, style, noscript, svg, canvas, template), takes the body's text
 * (or the root element's), then normalizes whitespace.
 */
function htmlToPlainText(html) {
  const { document } = parseHTML(html);
  for (const tag of ["script", "style", "noscript", "svg", "canvas", "template"]) {
    document.querySelectorAll(tag).forEach((el) => el.remove());
  }
  const rawText = document.body?.textContent || document.documentElement?.textContent || "";
  return rawText
    .replace(/[ \t]+/g, " ") // collapse runs of spaces/tabs
    .replace(/\n[ \t]+/g, "\n") // strip leading whitespace on each line
    .replace(/[ \t]+\n/g, "\n") // strip trailing whitespace on each line
    .replace(/\n{3,}/g, "\n\n") // at most one blank line in a row
    .trim();
}
751
/** Returns the hostname of `url`, or "Unknown" when it cannot be parsed. */
function extractDomainFromUrl3(url) {
  let hostname = "Unknown";
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input keeps the fallback label.
  }
  return hostname;
}
758
+
759
+ // src/utils/content-cleaner.ts
760
+ import { parseHTML as parseHTML2 } from "linkedom";
761
// Selectors whose matches cleanHtml removes unconditionally.
var ALWAYS_REMOVE_SELECTORS = [
  // Navigation and menus
  "nav", "header nav", "footer nav",
  ".nav", ".navigation", ".menu", ".navbar", ".sidebar", ".aside",
  // Header and footer elements
  "header", "footer",
  ".site-header", ".page-header", ".site-footer", ".page-footer",
  // Social media and sharing
  ".social", ".share", ".sharing",
  ".twitter", ".facebook", ".linkedin", ".instagram",
  // Comments and discussions
  ".comments", ".comment", ".discussion", ".disqus",
  // Forms and interactive elements
  "form", "input", "button:not([type='submit'])", "select", "textarea",
  // Scripts and styles
  "script", "style", "noscript",
  // Hidden elements
  "[hidden]", "[style*='display: none']", "[style*='display:none']",
  // Common utility classes
  ".cookie", ".cookie-banner", ".popup", ".modal", ".overlay", ".notification",
  // Breadcrumbs
  ".breadcrumb", ".breadcrumbs", ".breadcrumb-trail"
];
// Selectors whose matches cleanHtml removes only when its `removeAds` option is on.
var AD_SELECTORS = [
  // Ads and promotions
  ".ad", ".ads", ".advertisement", ".promotion", ".sponsored",
  "[class*='ad-']", "[id*='ad-']",
  "[class*='advert']", "[id*='advert']",
  "[class*='banner']", "[id*='banner']",
  ".google-ad", ".adsense",
  "[data-ad]", "[data-ads]",
  "ins.adsbygoogle",
  // Tracking
  "[class*='tracking']", "[id*='tracking']",
  "[class*='analytics']", "[id*='analytics']"
];
842
/**
 * Strips boilerplate from an HTML document and returns the cleaned markup.
 *
 * Steps, in order: remove always-unwanted elements, optionally remove
 * ad/tracking elements, optionally remove base64 images, delete all comment
 * nodes, and rewrite relative `src`/`href` values against `baseUrl`.
 *
 * @param options `{ removeAds = true, removeBase64Images = true }`.
 * @returns The root element's outerHTML, or the input unchanged if the
 *   parsed document has no root element.
 */
function cleanHtml(html, baseUrl, options = {}) {
  const { removeAds = true, removeBase64Images = true } = options;
  const { document } = parseHTML2(html);

  // A selector that fails to run is skipped rather than aborting the whole clean-up.
  const removeMatches = (selectors) => {
    for (const selector of selectors) {
      try {
        document.querySelectorAll(selector).forEach((el) => el.remove());
      } catch {
      }
    }
  };

  removeMatches(ALWAYS_REMOVE_SELECTORS);
  if (removeAds) {
    removeMatches(AD_SELECTORS);
  }
  if (removeBase64Images) {
    removeBase64ImagesFromDocument(document);
  }

  // Collect first, remove afterwards, so the walker is not mutated mid-traversal.
  const SHOW_COMMENT = 128; /* NodeFilter.SHOW_COMMENT */
  const walker = document.createTreeWalker(document, SHOW_COMMENT);
  const comments = [];
  while (walker.nextNode()) {
    comments.push(walker.currentNode);
  }
  for (const comment of comments) {
    comment.parentNode?.removeChild(comment);
  }

  convertRelativeUrls(document, baseUrl);
  return document.documentElement?.outerHTML || html;
}
875
/**
 * Removes inline (data: URI) images from a parsed document, in place:
 * `<img>` elements with a data: src, `background`/`background-image`
 * declarations that embed a data:image URL in inline styles, and `<source>`
 * elements whose src/srcset carries a data: URI.
 */
function removeBase64ImagesFromDocument(document) {
  const removeAll = (selector) => {
    document.querySelectorAll(selector).forEach((el) => {
      el.remove();
    });
  };
  const dataUriBackground = /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi;

  removeAll("img[src^='data:']");

  document.querySelectorAll("[style*='data:image']").forEach((el) => {
    const style = el.getAttribute("style");
    if (!style) {
      return;
    }
    const remaining = style.replace(dataUriBackground, "");
    // Drop the attribute entirely when nothing but whitespace is left.
    if (remaining.trim()) {
      el.setAttribute("style", remaining);
    } else {
      el.removeAttribute("style");
    }
  });

  removeAll("source[src^='data:'], source[srcset*='data:']");
}
894
+ function convertRelativeUrls(document, baseUrl) {
895
+ document.querySelectorAll("[src]").forEach((el) => {
896
+ const src = el.getAttribute("src");
897
+ if (src && !src.startsWith("http") && !src.startsWith("//") && !src.startsWith("data:")) {
898
+ try {
899
+ el.setAttribute("src", new URL(src, baseUrl).toString());
900
+ } catch {
901
+ }
902
+ }
903
+ });
904
+ document.querySelectorAll("[href]").forEach((el) => {
905
+ const href = el.getAttribute("href");
906
+ if (href && !href.startsWith("http") && !href.startsWith("//") && !href.startsWith("#") && !href.startsWith("mailto:") && !href.startsWith("tel:") && !href.startsWith("javascript:")) {
907
+ try {
908
+ el.setAttribute("href", new URL(href, baseUrl).toString());
909
+ } catch {
910
+ }
911
+ }
912
+ });
913
+ }
914
/**
 * Alias for cleanHtml: forwards all arguments unchanged and returns its
 * result.
 */
function cleanContent(html, baseUrl, options = {}) {
  const cleaned = cleanHtml(html, baseUrl, options);
  return cleaned;
}
917
+
918
+ // src/utils/metadata-extractor.ts
919
+ import { parseHTML as parseHTML3 } from "linkedom";
920
+
921
+ // src/utils/url-helpers.ts
922
+ import { URL as URL2 } from "url";
923
+ import RE2 from "re2";
924
/**
 * Resolves `relative` against `base` and returns the absolute URL string.
 * If resolution fails, `relative` is returned unchanged.
 */
function resolveUrl(relative, base) {
  let resolved = relative;
  try {
    resolved = new URL2(relative, base).toString();
  } catch {
    // Keep the input as-is when it cannot be resolved.
  }
  return resolved;
}
931
/** Reports whether `string` parses as an absolute URL. */
function isValidUrl(string) {
  let parses = true;
  try {
    new URL2(string);
  } catch {
    parses = false;
  }
  return parses;
}
939
/**
 * Returns `url` as an absolute URL string with its fragment removed.
 * Absolute http(s) URLs are parsed directly; anything else is resolved
 * against `baseUrl`.
 *
 * @throws Error(`Invalid URL: <url>`) when the URL cannot be parsed or a
 *   relative URL is given without a base.
 */
function normalizeUrl(url, baseUrl) {
  try {
    const isAbsolute = url.startsWith("http://") || url.startsWith("https://");
    if (!isAbsolute && !baseUrl) {
      // Swallowed by the catch below and reported as a generic invalid URL.
      throw new Error("Relative URL requires base URL");
    }
    const parsed = isAbsolute ? new URL2(url) : new URL2(url, baseUrl);
    parsed.hash = "";
    return parsed.toString();
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }
}
955
/**
 * Returns the hostname of an absolute URL.
 *
 * @throws Error(`Invalid URL for domain extraction: <url>`) when parsing fails.
 */
function extractBaseDomain(url) {
  let parsed;
  try {
    parsed = new URL2(url);
  } catch {
    throw new Error(`Invalid URL for domain extraction: ${url}`);
  }
  return parsed.hostname;
}
963
/**
 * Reduces a hostname to its registrable root, e.g. "a.b.example.com" ->
 * "example.com" and "shop.example.co.uk" -> "example.co.uk".
 *
 * Only a small built-in list of two-part public suffixes is recognised.
 * IPv4 literals are returned unchanged: truncating them to their last two
 * octets would make unrelated hosts (192.168.1.1 vs 10.0.1.1) look like the
 * same site.
 *
 * @param hostname Host name without scheme or port.
 * @returns The root domain, or `hostname` itself when it has at most two
 *   labels or is an IPv4 address.
 */
function getRootDomain(hostname) {
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname)) {
    return hostname;
  }
  const parts = hostname.split(".");
  if (parts.length <= 2) {
    return hostname;
  }
  const twoPartTLDs = ["co.uk", "com.au", "co.nz", "com.br", "co.jp", "co.kr", "com.mx", "org.uk"];
  const lastTwo = parts.slice(-2).join(".");
  // Keep one extra label when the last two labels form a known public suffix.
  const keep = twoPartTLDs.includes(lastTwo) ? 3 : 2;
  return parts.slice(-keep).join(".");
}
975
/**
 * Reports whether two URLs belong to the same site: identical hostnames, or
 * hostnames sharing the same root domain. Returns false if either URL cannot
 * be parsed.
 */
function isSameDomain(url, baseUrl) {
  try {
    const host = extractBaseDomain(url);
    const baseHost = extractBaseDomain(baseUrl);
    return host === baseHost || getRootDomain(host) === getRootDomain(baseHost);
  } catch {
    return false;
  }
}
989
/**
 * Builds a lower-cased key for a URL with its query string removed. Input
 * that does not parse as a URL is simply lower-cased.
 */
function getUrlKey(url) {
  let key = url;
  try {
    const parsed = new URL2(url);
    parsed.search = "";
    key = parsed.toString();
  } catch {
    // Not a parseable URL: fall back to the raw input.
  }
  return key.toLowerCase();
}
998
/**
 * Validates a list of candidate URLs.
 *
 * Each entry must be a non-empty string that, once trimmed, parses as a URL
 * and starts with http:// or https://. Accepted URLs are trimmed and
 * de-duplicated (first occurrence order).
 *
 * @returns `{ isValid, validUrls, errors }`; `isValid` is true only when at
 *   least one URL was accepted and none were rejected.
 */
function validateUrls(urls) {
  if (!urls || urls.length === 0) {
    return {
      isValid: false,
      validUrls: [],
      errors: [{ url: "", error: "At least one URL is required" }]
    };
  }

  const accepted = [];
  const errors = [];
  const reject = (url, error) => {
    errors.push({ url, error });
  };

  for (const url of urls) {
    if (!url || typeof url !== "string") {
      reject(String(url), "URL must be a non-empty string");
      continue;
    }
    const trimmedUrl = url.trim();
    if (trimmedUrl === "") {
      reject(String(url), "URL cannot be empty");
    } else if (!isValidUrl(trimmedUrl)) {
      reject(trimmedUrl, "Invalid URL format");
    } else if (!/^https?:\/\//.test(trimmedUrl)) {
      reject(trimmedUrl, "URL must start with http:// or https://");
    } else {
      accepted.push(trimmedUrl);
    }
  }

  const uniqueValidUrls = [...new Set(accepted)];
  return {
    isValid: uniqueValidUrls.length > 0 && errors.length === 0,
    validUrls: uniqueValidUrls,
    errors
  };
}
1041
/**
 * Test a URL against a list of regular-expression sources.
 *
 * Each pattern is compiled case-insensitively with `RE2`. A pattern that
 * fails to compile (or whose test throws) counts as a non-match rather than
 * an error.
 *
 * @param url - URL to test.
 * @param patterns - Regular-expression source strings.
 * @returns `true` as soon as one pattern matches; `false` for an empty or
 *   missing list, or when nothing matches.
 */
function matchesPatterns(url, patterns) {
  if (!patterns || patterns.length === 0) {
    return false;
  }
  for (const source of patterns) {
    try {
      if (new RE2(source, "i").test(url)) {
        return true;
      }
    } catch {
      // Invalid pattern: ignore it and move on to the next one.
    }
  }
  return false;
}
1054
/**
 * Apply include/exclude pattern filters to a URL.
 *
 * @param url - URL to test.
 * @param includePatterns - When non-empty, the URL must match at least one.
 * @param excludePatterns - When non-empty, the URL must match none.
 * @returns `true` when the URL passes both filters.
 */
function shouldIncludeUrl(url, includePatterns, excludePatterns) {
  const hasIncludeFilter = Boolean(includePatterns && includePatterns.length > 0);
  if (hasIncludeFilter && !matchesPatterns(url, includePatterns)) {
    return false;
  }
  const hasExcludeFilter = Boolean(excludePatterns && excludePatterns.length > 0);
  if (hasExcludeFilter && matchesPatterns(url, excludePatterns)) {
    return false;
  }
  return true;
}
1067
/**
 * Heuristically decide whether a URL points at "content" rather than at
 * boilerplate (legal pages, auth areas, carts, assets, API endpoints) or a
 * downloadable document.
 *
 * Only the path component is inspected. Matching against the full URL made
 * host names such as `legal.example.com` trip the path patterns, and let a
 * query string (`/contact?utm=…`, `/report.pdf?v=2`) defeat the `$`-anchored
 * patterns and the extension check.
 *
 * @param url - URL to classify.
 * @returns `false` for recognised non-content URLs, otherwise `true`.
 */
function isContentUrl(url) {
  const lowerUrl = url.toLowerCase();
  let pathname;
  try {
    pathname = new URL(lowerUrl).pathname;
  } catch {
    // Not an absolute URL: fall back to the text before any query/fragment.
    pathname = lowerUrl.split(/[?#]/)[0];
  }
  const nonContentPatterns = [
    // Legal and policy pages
    /\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\b/i,
    /\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\b/i,
    /\/(cookie-policy|data-protection|acceptable-use|user-agreement)\b/i,
    /\/(refund|cancellation|shipping|return)-?(policy)?\b/i,
    // Contact and support pages (usually not main content)
    /\/(contact|support|help|faq|feedback)\/?$/i,
    // About pages that are typically boilerplate
    /\/(about-us|careers|jobs|press|investors|team)\/?$/i,
    // Authentication and admin areas
    /\/(admin|login|auth|account|dashboard|profile|settings)\//i,
    // E-commerce utility pages
    /\/(cart|checkout|payment|subscription|wishlist)\//i,
    // File downloads and assets
    /\/(uploads|assets|files|static|media|resources)\//i,
    // API endpoints
    /\/(api|graphql|rest|webhook)\//i
  ];
  if (nonContentPatterns.some((pattern) => pattern.test(pathname))) {
    return false;
  }
  const skipExtensions = [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".exe"];
  if (skipExtensions.some((ext) => pathname.endsWith(ext))) {
    return false;
  }
  return true;
}
1097
// File extensions that never lead to crawlable HTML (duplicates removed).
const SHOULD_CRAWL_SKIP_EXTENSIONS = new Set([
  // Documents, archives and installers
  ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
  ".zip", ".rar", ".tar", ".gz", ".bz2", ".7z",
  ".exe", ".dmg", ".pkg", ".deb", ".rpm", ".apk", ".ipa",
  // Image files
  ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".ico", ".favicon",
  // Video files
  ".mp4", ".avi", ".mov", ".wmv", ".flv", ".webm",
  // Audio files
  ".mp3", ".wav", ".ogg", ".m4a", ".aac",
  // Font files
  ".woff", ".woff2", ".ttf", ".otf", ".eot",
  // Style and script files
  ".css", ".js", ".mjs", ".ts", ".jsx", ".tsx",
  // Data and config files
  ".json", ".xml", ".txt", ".md", ".rss", ".atom", ".sitemap", ".robots", ".webmanifest"
]);

// Path patterns for areas that are not worth crawling.
const SHOULD_CRAWL_SKIP_PATTERNS = [
  // File downloads and assets
  /\/(uploads|assets|files|static|media|resources)\//i,
  // Authentication and admin areas
  /\/(admin|login|auth|account|dashboard|profile|settings)\//i,
  // API endpoints
  /\/(api|graphql|rest|ws:|webhook)\//i,
  // Common tracking and analytics
  /\/(analytics|tracking|pixel|beacon|ads)\//i,
  // Development and testing areas
  /\/(test|dev|staging|beta|demo)\//i,
  // Common utility and service pages
  /\/(search|cart|checkout|payment|subscription)\//i,
  // Social media and external services
  /\/(facebook|twitter|instagram|youtube|linkedin|github)\//i,
  // Legal and policy pages
  /\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\b/i,
  /\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\b/i,
  /\/(cookie-policy|data-protection|acceptable-use|user-agreement)\b/i,
  /\/(refund|cancellation|shipping|return)-?(policy)?\b/i,
  // Contact and support pages (usually not main content)
  /\/(contact|support|help|faq|feedback)\/?$/i,
  // About pages that are typically boilerplate
  /\/(about-us|careers|jobs|press|investors|team)\/?$/i
];

// Substrings of a query string that indicate a download/print/share action.
const SHOULD_CRAWL_SKIP_QUERY_HINTS = ["download", "file", "attachment", "export", "print", "share", "email"];

/**
 * Decide whether a discovered URL should be crawled.
 *
 * A URL is rejected when it is deeper than `maxDepth`, already in `visited`
 * (keyed by `getUrlKey`), on a different site than `baseUrl`, has a skipped
 * file extension, matches a skipped path pattern, carries a download-style
 * query string, or is too short to be a real page URL.
 *
 * Extensions and patterns are evaluated against the URL path only. The
 * previous substring test on the whole URL rejected hosts such as
 * `www.mdpi.com` (".md") or `www.tsn.ca` (".ts"), and the query-hint test
 * rejected any URL with a query whose *path* contained e.g. "file"
 * (`/profile?page=2`).
 *
 * @param url - Candidate URL.
 * @param baseUrl - URL of the site being crawled.
 * @param maxDepth - Maximum link depth.
 * @param currentDepth - Depth at which `url` was found.
 * @param visited - Set of `getUrlKey` keys already processed.
 * @returns `true` when the URL should be crawled.
 */
function shouldCrawlUrl(url, baseUrl, maxDepth, currentDepth, visited) {
  if (currentDepth > maxDepth) {
    return false;
  }
  if (visited.has(getUrlKey(url))) {
    return false;
  }
  if (!isSameDomain(url, baseUrl)) {
    return false;
  }

  let pathname;
  let query;
  try {
    const parsed = new URL(url);
    pathname = parsed.pathname;
    query = parsed.search;
  } catch {
    // Unparseable input: split path and query by hand.
    const queryStart = url.indexOf("?");
    pathname = (queryStart === -1 ? url : url.slice(0, queryStart)).split("#")[0];
    query = queryStart === -1 ? "" : url.slice(queryStart);
  }

  // Check every dot-suffix of the final path segment, so "bundle.js.map" and
  // "backup.tar.gz" are still recognised.
  const lastSegment = pathname.toLowerCase().split("/").pop() || "";
  const suffixes = lastSegment.split(".").slice(1);
  if (suffixes.some((suffix) => SHOULD_CRAWL_SKIP_EXTENSIONS.has(`.${suffix}`))) {
    return false;
  }

  if (SHOULD_CRAWL_SKIP_PATTERNS.some((pattern) => pattern.test(pathname))) {
    return false;
  }

  const lowerQuery = query.toLowerCase();
  if (lowerQuery.length > 1 && SHOULD_CRAWL_SKIP_QUERY_HINTS.some((hint) => lowerQuery.includes(hint))) {
    return false;
  }

  // Reject strings with fewer than two non-empty "/"-separated parts (for
  // example a bare host name without a scheme).
  if (url.split("/").filter(Boolean).length < 2 && url.split("?")[0].split("/").length <= 2) {
    return false;
  }
  return true;
}
1222
+
1223
+ // src/utils/metadata-extractor.ts
1224
/**
 * Public entry point for metadata extraction; delegates to
 * `extractWebsiteMetadata`.
 *
 * @param html - HTML source of the page.
 * @param baseUrl - URL used to resolve relative links.
 * @returns The metadata object produced by `extractWebsiteMetadata`.
 */
function extractMetadata(html, baseUrl) {
  const metadata = extractWebsiteMetadata(html, baseUrl);
  return metadata;
}
1227
/**
 * Parse an HTML string and collect page-level metadata.
 *
 * Each field is filled by a dedicated extractor and is `null` when that
 * extractor finds nothing. `image` prefers the `og:image` value and falls
 * back to `twitter:image`.
 *
 * @param html - HTML source of the page.
 * @param baseUrl - URL passed to the favicon and canonical extractors.
 * @returns Object with title, description, author, language, charset,
 *   favicon, canonical, image, keywords, robots, themeColor, openGraph and
 *   twitter fields.
 */
function extractWebsiteMetadata(html, baseUrl) {
  const { document } = parseHTML3(html);
  // Property order matches the order in which the extractors are evaluated.
  return {
    title: extractTitle(document),
    description: extractMetaContent(document, "description"),
    author: extractMetaContent(document, "author"),
    language: extractLanguage(document),
    charset: extractCharset(document),
    favicon: extractFavicon(document, baseUrl),
    canonical: extractCanonical(document, baseUrl),
    image: extractMetaContent(document, "og:image") || extractMetaContent(document, "twitter:image"),
    keywords: extractKeywords(document),
    robots: extractMetaContent(document, "robots"),
    themeColor: extractMetaContent(document, "theme-color"),
    openGraph: extractOpenGraph(document),
    twitter: extractTwitterCard(document)
  };
}
1259
/**
 * Read the page title.
 *
 * @param document - Parsed DOM document.
 * @returns The trimmed text of the `<title>` element when it has any text,
 *   otherwise the `og:title` meta value (or `null`).
 */
function extractTitle(document) {
  const rawTitle = document.querySelector("title")?.textContent;
  return rawTitle ? rawTitle.trim() : extractMetaContent(document, "og:title");
}
1266
+ function extractMetaContent(document, name) {
1267
+ const byName = document.querySelector(`meta[name="${name}"]`);
1268
+ if (byName) {
1269
+ const content = byName.getAttribute("content");
1270
+ if (content) return content.trim();
1271
+ }
1272
+ const byProperty = document.querySelector(`meta[property="${name}"]`);
1273
+ if (byProperty) {
1274
+ const content = byProperty.getAttribute("content");
1275
+ if (content) return content.trim();
1276
+ }
1277
+ return null;
1278
+ }
1279
+ function extractLanguage(document) {
1280
+ const lang = document.documentElement?.getAttribute("lang");
1281
+ return lang?.trim() || null;
1282
+ }
1283
+ function extractCharset(document) {
1284
+ const charsetMeta = document.querySelector("meta[charset]");
1285
+ if (charsetMeta) {
1286
+ const charset = charsetMeta.getAttribute("charset");
1287
+ if (charset) return charset.trim();
1288
+ }
1289
+ const httpEquivMeta = document.querySelector('meta[http-equiv="Content-Type"]');
1290
+ if (httpEquivMeta) {
1291
+ const content = httpEquivMeta.getAttribute("content");
1292
+ if (content) {
1293
+ const charsetMatch = content.match(/charset=([^\s;]+)/i);
1294
+ if (charsetMatch) return charsetMatch[1].trim();
1295
+ }
1296
+ }
1297
+ return null;
1298
+ }
1299
/**
 * Locate the page's favicon.
 *
 * Icon `<link>` selectors are tried from most to least specific; the first
 * one with a non-empty `href` is passed through `normalizeUrl`. When no link
 * qualifies, `/favicon.ico` relative to `baseUrl` is used.
 *
 * @param document - Parsed DOM document.
 * @param baseUrl - URL handed to `normalizeUrl` together with the href.
 * @returns The normalised icon URL, or `null` when even the default location
 *   cannot be normalised.
 */
function extractFavicon(document, baseUrl) {
  const candidateSelectors = [
    'link[rel="icon"]',
    'link[rel="shortcut icon"]',
    'link[rel="apple-touch-icon"]',
    'link[rel*="icon"]'
  ];
  for (const selector of candidateSelectors) {
    const href = document.querySelector(selector)?.getAttribute("href");
    if (href) {
      return normalizeUrl(href, baseUrl);
    }
  }
  // Only the default-location fallback is guarded against normalisation errors.
  try {
    return normalizeUrl("/favicon.ico", baseUrl);
  } catch {
    return null;
  }
}
1321
/**
 * Read the canonical URL declared by `<link rel="canonical">`.
 *
 * @param document - Parsed DOM document.
 * @param baseUrl - URL handed to `normalizeUrl` together with the href.
 * @returns The normalised canonical URL, or `null` when the link or its
 *   `href` is missing or empty.
 */
function extractCanonical(document, baseUrl) {
  const href = document.querySelector('link[rel="canonical"]')?.getAttribute("href");
  return href ? normalizeUrl(href, baseUrl) : null;
}
1331
/**
 * Read the `keywords` meta tag as a list.
 *
 * @param document - Parsed DOM document.
 * @returns The comma-separated keywords, trimmed and with empty entries
 *   dropped, or `null` when the tag is absent or empty.
 */
function extractKeywords(document) {
  const raw = extractMetaContent(document, "keywords");
  if (!raw) {
    return null;
  }
  const keywords = [];
  for (const piece of raw.split(",")) {
    const keyword = piece.trim();
    if (keyword.length > 0) {
      keywords.push(keyword);
    }
  }
  return keywords;
}
1338
/**
 * Collect Open Graph (`og:*`) meta values.
 *
 * @param document - Parsed DOM document.
 * @returns Object with title, description, type, url, image, siteName and
 *   locale (each a string or `null`), or `null` when every field is empty.
 */
function extractOpenGraph(document) {
  // [result key, meta property] pairs, in output order.
  const fields = [
    ["title", "og:title"],
    ["description", "og:description"],
    ["type", "og:type"],
    ["url", "og:url"],
    ["image", "og:image"],
    ["siteName", "og:site_name"],
    ["locale", "og:locale"]
  ];
  const openGraph = Object.fromEntries(
    fields.map(([key, property]) => [key, extractMetaContent(document, property)])
  );
  const hasAnyValue = Object.values(openGraph).some((value) => Boolean(value));
  return hasAnyValue ? openGraph : null;
}
1360
/**
 * Collect Twitter Card (`twitter:*`) meta values.
 *
 * @param document - Parsed DOM document.
 * @returns Object with card, site, creator, title, description and image
 *   (each a string or `null`), or `null` when every field is empty.
 */
function extractTwitterCard(document) {
  // Result keys, in output order; each maps to the `twitter:<key>` meta tag.
  const keys = ["card", "site", "creator", "title", "description", "image"];
  const twitter = Object.fromEntries(
    keys.map((key) => [key, extractMetaContent(document, `twitter:${key}`)])
  );
  const hasAnyValue = Object.values(twitter).some((value) => Boolean(value));
  return hasAnyValue ? twitter : null;
}
1380
+
1381
+ // src/utils/logger.ts
1382
+ import pino from "pino";
1383
/**
 * Create a named pino logger.
 *
 * Unless `NODE_ENV` is "production", output is routed through the
 * `pino-pretty` transport (colourised, system-formatted timestamps, pid and
 * hostname hidden).
 *
 * @param name - Logger name; defaults to "reader".
 * @param level - Minimum level; defaults to `LOG_LEVEL` or "info".
 * @returns The configured pino logger.
 */
function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
  const isProduction = process.env.NODE_ENV === "production";
  const prettyTransport = {
    target: "pino-pretty",
    options: {
      colorize: true,
      translateTime: "SYS:standard",
      ignore: "pid,hostname"
    }
  };
  return pino({
    name,
    level,
    transport: isProduction ? void 0 : prettyTransport
  });
}
// Shared default logger for the module.
var logger = createLogger();
1398
+
1399
+ // src/utils/robots-parser.ts
1400
/**
 * Parse robots.txt text into the rules that apply to one user agent.
 *
 * Rules from every group naming `*` or exactly `userAgent`
 * (case-insensitive) are merged. Consecutive `User-agent` lines form a
 * single group, so a rule block headed by both `*` and another bot applies
 * even when `*` is not the last header line. Text after `#` is a comment
 * and is ignored, including trailing comments on directive lines.
 *
 * @param content - Raw robots.txt text.
 * @param userAgent - Product token to match; defaults to "*".
 * @returns `{ disallowedPaths, allowedPaths, crawlDelay }`, where
 *   `crawlDelay` is in milliseconds or `null` when not specified.
 */
function parseRobotsTxt(content, userAgent = "*") {
  const rules = {
    disallowedPaths: [],
    allowedPaths: [],
    crawlDelay: null
  };
  const wantedAgent = userAgent.toLowerCase();
  let matchesUserAgent = false;
  // True while reading a run of consecutive User-agent lines (a group header).
  let inAgentHeader = false;
  for (const rawLine of content.split(/\r?\n|\r/)) {
    const commentStart = rawLine.indexOf("#");
    const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
    if (!line) {
      continue;
    }
    const colonIndex = line.indexOf(":");
    if (colonIndex === -1) {
      continue;
    }
    const directive = line.substring(0, colonIndex).trim().toLowerCase();
    const value = line.substring(colonIndex + 1).trim();
    if (directive === "user-agent") {
      const agent = value.toLowerCase();
      const agentMatches = agent === "*" || agent === wantedAgent;
      // Within one header run, any matching agent selects the group.
      matchesUserAgent = inAgentHeader ? matchesUserAgent || agentMatches : agentMatches;
      inAgentHeader = true;
      continue;
    }
    inAgentHeader = false;
    if (!matchesUserAgent) {
      continue;
    }
    if (directive === "disallow" && value) {
      rules.disallowedPaths.push(value);
    } else if (directive === "allow" && value) {
      rules.allowedPaths.push(value);
    } else if (directive === "crawl-delay") {
      const delay = parseFloat(value);
      // Ignore malformed or negative delays instead of storing them.
      if (Number.isFinite(delay) && delay >= 0) {
        rules.crawlDelay = delay * 1e3;
      }
    }
  }
  return rules;
}
1437
/**
 * Decide whether a path may be fetched under the given robots rules.
 *
 * The most specific (longest) matching pattern wins, and Allow wins a tie.
 * Previously any matching Allow overrode every Disallow, so the common
 * `Allow: /` + `Disallow: /private` combination allowed everything.
 *
 * @param path - URL path (optionally with query); a leading "/" is added
 *   when missing.
 * @param rules - Object with `allowedPaths` and `disallowedPaths` arrays.
 * @returns `true` when the path is allowed.
 */
function isPathAllowed(path, rules) {
  const normalizedPath = path.startsWith("/") ? path : "/" + path;
  // Length of the longest pattern in `patterns` matching the path, or -1.
  const longestMatch = (patterns) => {
    let longest = -1;
    for (const pattern of patterns) {
      if (pattern.length > longest && pathMatches(normalizedPath, pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };
  const disallowLength = longestMatch(rules.disallowedPaths);
  if (disallowLength === -1) {
    return true;
  }
  return longestMatch(rules.allowedPaths) >= disallowLength;
}

/**
 * Test a path against one robots.txt path pattern.
 *
 * `*` matches any run of characters and a trailing `$` anchors the end of
 * the path; everything else is literal. The match is always anchored at the
 * start of the path. Previously a pattern ending in `$` lost its start
 * anchor, so `/foo$` also matched `/bar/foo`.
 *
 * @param path - Normalised path to test.
 * @param pattern - robots.txt pattern; an empty pattern never matches.
 * @returns `true` when the pattern matches the path.
 */
function pathMatches(path, pattern) {
  if (!pattern) {
    return false;
  }
  const anchoredAtEnd = pattern.endsWith("$");
  const body = anchoredAtEnd ? pattern.slice(0, -1) : pattern;
  // Escape regex metacharacters, then turn the robots wildcard into ".*".
  const escaped = body.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
  const regexPattern = "^" + escaped + (anchoredAtEnd ? "$" : "");
  try {
    return new RegExp(regexPattern).test(path);
  } catch {
    // Defensive fallback: plain prefix comparison.
    return path.startsWith(pattern);
  }
}
1468
/**
 * Download and parse `/robots.txt` for the site that `baseUrl` belongs to.
 *
 * The request is aborted after `timeoutMs`, so an unresponsive server cannot
 * stall the scrape or crawl that is waiting on the result.
 *
 * @param baseUrl - Any URL on the target site.
 * @param timeoutMs - Maximum time to wait for the response; defaults to 10 s.
 * @returns The parsed rules for the "ReaderEngine" agent, or `null` when the
 *   file is unavailable, the request fails, or it times out.
 */
async function fetchRobotsTxt(baseUrl, timeoutMs = 1e4) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const url = new URL("/robots.txt", baseUrl);
    const response = await fetch(url.toString(), {
      headers: {
        "User-Agent": "ReaderEngine/1.0"
      },
      signal: controller.signal
    });
    if (!response.ok) {
      return null;
    }
    const content = await response.text();
    return parseRobotsTxt(content, "ReaderEngine");
  } catch {
    // Best effort: network errors, bad URLs and timeouts all mean "no rules".
    return null;
  } finally {
    clearTimeout(timer);
  }
}
1485
/**
 * Check a full URL against robots rules.
 *
 * @param url - Absolute URL to check.
 * @param rules - Parsed robots rules, or a falsy value when none are known.
 * @returns `true` when there are no rules, when the URL cannot be evaluated,
 *   or when `isPathAllowed` accepts its path plus query string.
 */
function isUrlAllowed(url, rules) {
  if (!rules) {
    return true;
  }
  try {
    const { pathname, search } = new URL(url);
    return isPathAllowed(pathname + search, rules);
  } catch {
    // Fail open: a URL that cannot be evaluated is not blocked.
    return true;
  }
}
1496
+
1497
+ // src/types.ts
1498
/**
 * Baseline scrape options. `Scraper` spreads these first and lets the
 * caller-supplied options override them.
 */
var DEFAULT_OPTIONS = {
  urls: [],
  formats: ["markdown"],
  includeMetadata: true,
  // Per-page timeout: 30 seconds
  timeoutMs: 30000,
  includePatterns: [],
  excludePatterns: [],
  // Content cleaning
  removeAds: true,
  removeBase64Images: true,
  skipTLSVerification: true,
  // Batch behaviour: one URL at a time, 5 minute overall limit
  batchConcurrency: 1,
  batchTimeoutMs: 300000,
  maxRetries: 2,
  // No-op progress callback
  onProgress: () => {},
  // Hero-specific settings
  verbose: false,
  showChrome: false
};
1520
/**
 * Check whether a value names a supported output format.
 *
 * @param format - Value to check.
 * @returns `true` for "markdown", "html", "json" or "text".
 */
function isValidFormat(format) {
  switch (format) {
    case "markdown":
    case "html":
    case "json":
    case "text":
      return true;
    default:
      return false;
  }
}
1523
/**
 * Check whether a parsed URL is on `baseDomain` or one of its subdomains.
 *
 * @param url - Object exposing a `hostname` property (e.g. a `URL`).
 * @param baseDomain - Domain being crawled.
 * @returns `true` when the hostname equals `baseDomain` or ends with
 *   `.` + `baseDomain`.
 */
function shouldCrawlUrl2(url, baseDomain) {
  const { hostname } = url;
  if (hostname === baseDomain) {
    return true;
  }
  return hostname.endsWith(`.${baseDomain}`);
}
1526
+
1527
+ // src/scraper.ts
1528
/**
 * Scrapes a batch of URLs through a shared browser pool.
 *
 * Each URL is checked against robots.txt, loaded in a pooled browser,
 * waited on through any detected challenge or redirect, cleaned, and
 * rendered into the requested output formats.
 *
 * Fixes relative to the previous implementation:
 * - the batch timeout timer is cleared once the batch settles, so it no
 *   longer keeps the process alive for up to `batchTimeoutMs`;
 * - `maxRetries: 0` is honoured (`??` instead of `||`);
 * - failed attempts are retried with exponential backoff and keep their real
 *   error message, while robots.txt blocks are not retried at all;
 * - `onProgress` fires once per URL (not once per attempt) with a true
 *   completed count;
 * - concurrent lookups for the same origin share one robots.txt request.
 */
var Scraper = class {
  options;
  pool;
  logger = createLogger("scraper");
  // origin -> Promise resolving to parsed robots rules (or null)
  robotsCache = /* @__PURE__ */ new Map();
  // Number of URLs that have reached a final outcome in the current batch
  completedCount = 0;

  /**
   * @param options - Scrape options merged over `DEFAULT_OPTIONS`; `pool`
   *   is required.
   * @throws Error when no browser pool is supplied.
   */
  constructor(options) {
    this.options = {
      ...DEFAULT_OPTIONS,
      ...options
    };
    if (!options.pool) {
      throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
    }
    this.pool = options.pool;
  }

  /**
   * Get robots.txt rules for a URL, cached per origin.
   *
   * The in-flight promise is cached, so concurrent scrapes of the same
   * origin trigger a single download.
   */
  async getRobotsRules(url) {
    const origin = new URL(url).origin;
    let pending = this.robotsCache.get(origin);
    if (pending === void 0) {
      pending = fetchRobotsTxt(origin);
      this.robotsCache.set(origin, pending);
    }
    return (await pending) ?? null;
  }

  /**
   * Scrape all URLs.
   *
   * @returns Scrape result with pages and metadata
   */
  async scrape() {
    const startTime = Date.now();
    this.completedCount = 0;
    const results = await this.scrapeWithConcurrency();
    return this.buildScrapeResult(results, startTime);
  }

  /**
   * Scrape URLs with concurrency control and an optional overall timeout.
   *
   * NOTE(review): on timeout the in-flight and queued tasks are not
   * cancelled; only the returned promise rejects.
   */
  async scrapeWithConcurrency() {
    const limit = pLimit(this.options.batchConcurrency || 1);
    const tasks = this.options.urls.map(
      (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
    );
    const batchPromise = Promise.all(tasks);
    const { batchTimeoutMs } = this.options;
    if (!batchTimeoutMs || batchTimeoutMs <= 0) {
      return batchPromise;
    }
    let timer;
    const timeoutPromise = new Promise((_, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`Batch operation timed out after ${batchTimeoutMs}ms`));
      }, batchTimeoutMs);
    });
    try {
      return await Promise.race([batchPromise, timeoutPromise]);
    } finally {
      // Without this the pending timer keeps the event loop alive.
      clearTimeout(timer);
    }
  }

  /**
   * Scrape a single URL, retrying failed attempts with exponential backoff
   * (1 s, 2 s, 4 s, ...). Errors flagged `retryable === false` (robots.txt
   * blocks) end the loop immediately.
   *
   * @param url - URL to scrape.
   * @param index - Position of the URL in the batch (kept for signature
   *   compatibility; progress is counted, not derived from the index).
   * @returns `{ result }` on success, `{ result: null, error }` on failure.
   */
  async scrapeSingleUrlWithRetry(url, index) {
    const maxRetries = Math.max(0, this.options.maxRetries ?? 2);
    let lastError;
    let attempts = 0;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      attempts = attempt + 1;
      try {
        const result = await this.scrapePage(url);
        if (result) {
          this.reportProgress(url);
          return { result };
        }
        lastError = `Failed to scrape ${url}: No content returned`;
      } catch (error) {
        lastError = error instanceof Error ? error.message : String(error);
        if (error && error.retryable === false) {
          break;
        }
      }
      if (attempt < maxRetries) {
        const delay = Math.pow(2, attempt) * 1e3;
        this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    }
    this.logger.error(`Failed to scrape ${url} after ${attempts} attempts: ${lastError}`);
    this.reportProgress(url);
    return { result: null, error: lastError };
  }

  /**
   * Record that one URL reached its final outcome and notify `onProgress`.
   * A throwing callback is logged and ignored so it cannot fail the batch.
   */
  reportProgress(url) {
    this.completedCount += 1;
    if (typeof this.options.onProgress !== "function") {
      return;
    }
    try {
      this.options.onProgress({
        completed: this.completedCount,
        total: this.options.urls.length,
        currentUrl: url
      });
    } catch (error) {
      this.logger.warn(`onProgress callback failed: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Wait for the final page to load after any Cloudflare redirects.
   * Cloudflare often does silent redirects even when bypassed, so this makes
   * sure the browser is on the actual content page before scraping.
   */
  async waitForFinalPage(hero, originalUrl, verbose) {
    const maxWaitMs = 15e3;
    const startTime = Date.now();
    const log = (msg) => verbose && this.logger.info(msg);
    try {
      await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
    } catch {
      // A load timeout is not fatal; continue with whatever has loaded.
    }
    let currentUrl = await hero.url;
    const stripTrailingSlashes = (url) => url.replace(/\/+$/, "");
    const urlChanged = stripTrailingSlashes(currentUrl) !== stripTrailingSlashes(originalUrl);
    if (urlChanged || currentUrl.includes("__cf_chl")) {
      log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
      let lastUrl = currentUrl;
      let stableCount = 0;
      // Poll every 500 ms until the URL is unchanged on two consecutive polls.
      while (Date.now() - startTime < maxWaitMs) {
        await new Promise((resolve) => setTimeout(resolve, 500));
        try {
          currentUrl = await hero.url;
          if (currentUrl === lastUrl) {
            stableCount++;
            if (stableCount >= 2) {
              break;
            }
          } else {
            stableCount = 0;
            lastUrl = currentUrl;
            log(`URL changed to: ${currentUrl}`);
          }
        } catch {
          // Reading the URL can fail mid-navigation; try again on the next poll.
        }
      }
      try {
        await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
      } catch {
        // Same as above: proceed with what is available.
      }
    }
    await hero.waitForPaintingStable();
    await new Promise((resolve) => setTimeout(resolve, 2e3));
  }

  /**
   * Scrape a single URL once (no retries).
   *
   * Kept with its original contract: a robots.txt block throws, any other
   * failure is logged and reported through `onProgress` and yields `null`.
   *
   * @param url - URL to scrape.
   * @param index - Position of the URL in the batch, used for progress.
   */
  async scrapeSingleUrl(url, index) {
    const notify = () => {
      if (this.options.onProgress) {
        this.options.onProgress({
          completed: index + 1,
          total: this.options.urls.length,
          currentUrl: url
        });
      }
    };
    try {
      const result = await this.scrapePage(url);
      notify();
      return result;
    } catch (error) {
      if (error && error.retryable === false) {
        throw error;
      }
      this.logger.error(`Failed to scrape ${url}: ${error instanceof Error ? error.message : String(error)}`);
      notify();
      return null;
    }
  }

  /**
   * Load, clean and format one page. Throws on any failure; an error with
   * `retryable === false` means the URL is blocked by robots.txt.
   */
  async scrapePage(url) {
    const startTime = Date.now();
    const robotsRules = await this.getRobotsRules(url);
    if (!isUrlAllowed(url, robotsRules)) {
      throw Object.assign(new Error(`URL blocked by robots.txt: ${url}`), { retryable: false });
    }
    return this.pool.withBrowser(async (hero) => {
      await hero.goto(url, { timeoutMs: this.options.timeoutMs });
      try {
        await hero.waitForLoad("DomContentLoaded", { timeoutMs: this.options.timeoutMs });
      } catch {
        // Continue with a partially loaded page.
      }
      await hero.waitForPaintingStable();

      let hadChallenge = false;
      let challengeType = "none";
      let waitTimeMs = 0;
      const initialUrl = await hero.url;
      const detection = await detectChallenge(hero);
      if (detection.isChallenge) {
        hadChallenge = true;
        challengeType = detection.type;
        if (this.options.verbose) {
          this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
        }
        const resolution = await waitForChallengeResolution(hero, {
          maxWaitMs: 45e3,
          pollIntervalMs: 500,
          verbose: this.options.verbose,
          initialUrl
        });
        waitTimeMs = resolution.waitedMs;
        if (!resolution.resolved) {
          throw new Error(`Challenge not resolved: ${detection.type}`);
        }
        if (this.options.verbose) {
          this.logger.info(`Challenge resolved via ${resolution.method} in ${waitTimeMs}ms`);
        }
      }

      await this.waitForFinalPage(hero, url, this.options.verbose);
      if (this.options.waitForSelector) {
        try {
          await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
            timeoutMs: this.options.timeoutMs
          });
        } catch {
          this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
        }
      }

      const pageTitle = await hero.document.title;
      const html = await hero.document.documentElement.outerHTML;
      const cleanedHtml = cleanContent(html, url, {
        removeAds: this.options.removeAds,
        removeBase64Images: this.options.removeBase64Images
      });
      const websiteMetadata = extractMetadata(cleanedHtml, url);
      const duration = Date.now() - startTime;
      const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
      const page = {
        url,
        title: pageTitle,
        markdown: "",
        // Will be set by formatter
        html: cleanedHtml,
        fetchedAt: scrapedAt,
        depth: 0,
        hadChallenge,
        challengeType,
        waitTimeMs
      };

      const formats = this.options.formats || DEFAULT_OPTIONS.formats;
      const markdown = formats.includes("markdown") ? formatToMarkdown(
        [page],
        url,
        scrapedAt,
        duration,
        websiteMetadata,
        this.options.includeMetadata
      ) : void 0;
      const htmlOutput = formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
      const json = formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
      const text = formats.includes("text") ? formatToText(
        [page],
        url,
        scrapedAt,
        duration,
        websiteMetadata,
        this.options.includeMetadata
      ) : void 0;

      return {
        markdown,
        html: htmlOutput,
        json,
        text,
        metadata: {
          baseUrl: url,
          totalPages: 1,
          scrapedAt,
          duration,
          website: websiteMetadata,
          proxy: this.resolveProxyMetadata()
        }
      };
    });
  }

  /**
   * Describe the configured proxy (host, port, country) for result metadata.
   *
   * @returns The proxy description, or `undefined` when no proxy is set or
   *   its URL cannot be parsed.
   */
  resolveProxyMetadata() {
    const proxy = this.options.proxy;
    if (!proxy) {
      return void 0;
    }
    if (proxy.url) {
      try {
        const proxyUrl = new URL(proxy.url);
        return {
          host: proxyUrl.hostname,
          port: parseInt(proxyUrl.port, 10) || 80,
          country: proxy.country
        };
      } catch {
        return void 0;
      }
    }
    if (proxy.host && proxy.port) {
      return {
        host: proxy.host,
        port: proxy.port,
        country: proxy.country
      };
    }
    return void 0;
  }

  /**
   * Build the final scrape result from the per-URL outcomes.
   *
   * @param results - `{ result, error? }` entries in the same order as
   *   `options.urls`.
   * @param startTime - Batch start time (epoch ms).
   */
  buildScrapeResult(results, startTime) {
    const successful = [];
    const errors = [];
    let failedUrls = 0;
    results.forEach((outcome, index) => {
      if (outcome.result !== null) {
        successful.push(outcome.result);
        return;
      }
      failedUrls += 1;
      if (outcome.error) {
        errors.push({ url: this.options.urls[index], error: outcome.error });
      }
    });
    const batchMetadata = {
      totalUrls: this.options.urls.length,
      successfulUrls: successful.length,
      failedUrls,
      scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
      totalDuration: Date.now() - startTime,
      errors
    };
    return {
      data: successful,
      batchMetadata
    };
  }
};
1825
/**
 * Convenience wrapper: build a `Scraper` from `options` and run it.
 *
 * @param options - Scraper options (must include a browser pool).
 * @returns The result of `Scraper#scrape`.
 */
async function scrape(options) {
  return new Scraper(options).scrape();
}
1829
+
1830
+ // src/crawler.ts
1831
+ import { parseHTML as parseHTML4 } from "linkedom";
1832
+
1833
+ // src/utils/rate-limiter.ts
1834
+ import pLimit2 from "p-limit";
1835
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - Delay in milliseconds.
 * @returns Promise that resolves (with no value) after the delay.
 */
async function rateLimit(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
1838
+
1839
+ // src/crawler.ts
1840
/**
 * Breadth-first crawler that discovers same-site URLs from a seed page and
 * can optionally scrape everything it found.
 *
 * Fixes relative to the previous implementation:
 * - `depth: 0` and `delayMs: 0` are honoured (`??` instead of `||`);
 * - a URL is marked visited before it is fetched, so a page that fails to
 *   load is not re-queued and re-fetched when other pages link to it;
 * - links are de-duplicated within a page and against the queue using a
 *   key set, instead of rescanning the whole queue for every link;
 * - no politeness delay is spent after the final page.
 */
var Crawler = class {
  options;
  // getUrlKey() keys of every URL that has been attempted
  visited = /* @__PURE__ */ new Set();
  // Pending { url, depth } items, processed first-in first-out
  queue = [];
  // Successfully fetched pages: { url, title, description }
  urls = [];
  pool;
  logger = createLogger("crawler");
  robotsRules = null;

  /**
   * @param options - Crawl options; `pool` and `url` are required.
   * @throws Error when no browser pool is supplied.
   */
  constructor(options) {
    if (!options.pool) {
      throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
    }
    this.pool = options.pool;
    this.options = {
      url: options.url,
      depth: options.depth ?? 1,
      maxPages: options.maxPages || 20,
      scrape: options.scrape || false,
      delayMs: options.delayMs ?? 1e3,
      timeoutMs: options.timeoutMs,
      includePatterns: options.includePatterns,
      excludePatterns: options.excludePatterns,
      formats: options.formats || ["markdown", "html"],
      scrapeConcurrency: options.scrapeConcurrency || 2,
      proxy: options.proxy,
      userAgent: options.userAgent,
      verbose: options.verbose || false,
      showChrome: options.showChrome || false,
      connectionToCore: options.connectionToCore,
      // Content cleaning options
      removeAds: options.removeAds,
      removeBase64Images: options.removeBase64Images
    };
  }

  /**
   * Start crawling.
   *
   * @returns `{ urls, scraped, metadata }`; `scraped` is only set when the
   *   `scrape` option is enabled.
   */
  async crawl() {
    const startTime = Date.now();
    this.robotsRules = await fetchRobotsTxt(this.options.url);
    if (this.robotsRules) {
      this.logger.info("Loaded robots.txt rules");
    }
    if (isUrlAllowed(this.options.url, this.robotsRules)) {
      this.queue.push({ url: this.options.url, depth: 0 });
    } else {
      this.logger.warn(`Seed URL blocked by robots.txt: ${this.options.url}`);
    }

    const hasWork = () => this.queue.length > 0 && this.urls.length < this.options.maxPages;
    while (hasWork()) {
      if (this.options.timeoutMs && Date.now() - startTime > this.options.timeoutMs) {
        this.logger.warn(`Crawl timed out after ${this.options.timeoutMs}ms`);
        break;
      }
      const item = this.queue.shift();
      const urlKey = getUrlKey(item.url);
      if (this.visited.has(urlKey)) {
        continue;
      }
      // Mark before fetching so a failing page is attempted only once.
      this.visited.add(urlKey);
      const result = await this.fetchPage(item.url);
      if (result) {
        this.urls.push(result.crawlUrl);
        if (item.depth < this.options.depth) {
          const links = this.extractLinks(result.html, item.url, item.depth + 1);
          this.queue.push(...links);
        }
      }
      // Only wait when another request will actually follow.
      if (hasWork()) {
        const delay = this.robotsRules?.crawlDelay || this.options.delayMs;
        await rateLimit(delay);
      }
    }

    const metadata = {
      totalUrls: this.urls.length,
      maxDepth: this.options.depth,
      totalDuration: Date.now() - startTime,
      seedUrl: this.options.url
    };
    let scraped;
    if (this.options.scrape) {
      scraped = await this.scrapeDiscoveredUrls();
    }
    return {
      urls: this.urls,
      scraped,
      metadata
    };
  }

  /**
   * Fetch a single page and extract basic info.
   *
   * @returns `{ crawlUrl: { url, title, description }, html }`, or `null`
   *   when loading fails or a detected challenge is not resolved.
   */
  async fetchPage(url) {
    try {
      return await this.pool.withBrowser(async (hero) => {
        await hero.goto(url, { timeoutMs: 3e4 });
        await hero.waitForPaintingStable();
        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          if (this.options.verbose) {
            this.logger.info(`Challenge detected on ${url}`);
          }
          const result = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: this.options.verbose,
            initialUrl
          });
          if (!result.resolved) {
            throw new Error(`Challenge not resolved`);
          }
        }
        const title = await hero.document.title;
        const html = await hero.document.documentElement.outerHTML;
        let description = null;
        try {
          const metaDesc = await hero.document.querySelector('meta[name="description"]');
          if (metaDesc) {
            description = await metaDesc.getAttribute("content");
          }
        } catch {
          // The description is optional; leave it as null.
        }
        return {
          crawlUrl: {
            url,
            title: title || "Untitled",
            description
          },
          html
        };
      });
    } catch (error) {
      this.logger.error(`Failed to fetch ${url}: ${error instanceof Error ? error.message : String(error)}`);
      return null;
    }
  }

  /**
   * Extract crawlable links from HTML content using DOM parsing.
   * Handles all href formats (single quotes, double quotes, unquoted).
   *
   * A link is kept when it resolves to a valid same-site content URL, passes
   * the include/exclude patterns and robots rules, and has not been visited
   * or queued yet (including earlier on the same page).
   *
   * @param html - Page HTML.
   * @param baseUrl - URL of the page, used to resolve relative hrefs.
   * @param depth - Depth to assign to the discovered links.
   * @returns New `{ url, depth }` queue items.
   */
  extractLinks(html, baseUrl, depth) {
    const links = [];
    const { document } = parseHTML4(html);
    // Keys already waiting in the queue, computed once per page.
    const pendingKeys = new Set(this.queue.map((queued) => getUrlKey(queued.url)));
    document.querySelectorAll("a[href]").forEach((anchor) => {
      const href = anchor.getAttribute("href");
      if (!href) return;
      const resolved = resolveUrl(href, baseUrl);
      if (!resolved || !isValidUrl(resolved)) return;
      if (!isSameDomain(resolved, this.options.url)) return;
      if (!isContentUrl(resolved)) return;
      if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
      if (!isUrlAllowed(resolved, this.robotsRules)) return;
      const urlKey = getUrlKey(resolved);
      if (this.visited.has(urlKey) || pendingKeys.has(urlKey)) {
        return;
      }
      pendingKeys.add(urlKey);
      links.push({ url: resolved, depth });
    });
    return links;
  }

  /**
   * Scrape all discovered URLs with the shared browser pool.
   */
  async scrapeDiscoveredUrls() {
    const urls = this.urls.map((u) => u.url);
    return scrape({
      urls,
      formats: this.options.formats,
      batchConcurrency: this.options.scrapeConcurrency,
      proxy: this.options.proxy,
      userAgent: this.options.userAgent,
      verbose: this.options.verbose,
      showChrome: this.options.showChrome,
      pool: this.pool,
      // Content cleaning options
      removeAds: this.options.removeAds,
      removeBase64Images: this.options.removeBase64Images
    });
  }
};
2018
/**
 * Convenience wrapper: construct a Crawler from `options` and run it.
 *
 * @param options - Options handed unchanged to the Crawler constructor
 * @returns The promise produced by `Crawler#crawl()`
 */
async function crawl(options) {
  return new Crawler(options).crawl();
}
2022
+
2023
+ // src/browser/pool.ts
2024
+ import Hero from "@ulixee/hero";
2025
+
2026
+ // src/proxy/config.ts
2027
/**
 * Build the upstream proxy URL for a proxy configuration.
 *
 * - If `config.url` is set it is returned untouched.
 * - For `type === "residential"` the user name is expanded to
 *   `customer-USERNAME_session-ID_country-CC`, where ID is freshly generated on
 *   every call and CC falls back to "us".
 * - Otherwise a plain `http://user:pass@host:port` URL is produced; when
 *   neither user name nor password is configured the credentials part is
 *   left out instead of rendering the text "undefined".
 *
 * User name and password are percent-encoded so that characters such as
 * `@`, `:`, `/` or `#` inside a credential cannot break the URL structure.
 *
 * @param config - Proxy settings (`url`, or `type`/`username`/`password`/`host`/`port`/`country`)
 * @returns Proxy URL string
 */
function createProxyUrl(config) {
  if (config.url) {
    return config.url;
  }
  const encode = (value) => encodeURIComponent(String(value));
  const endpoint = `${config.host}:${config.port}`;
  if (config.type === "residential") {
    const sessionId = `hero_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    const user = `customer-${config.username}_session-${sessionId}_country-${config.country || "us"}`;
    return `http://${encode(user)}:${encode(config.password)}@${endpoint}`;
  }
  if (config.username == null && config.password == null) {
    // No credentials configured: emit an unauthenticated proxy URL.
    return `http://${endpoint}`;
  }
  return `http://${encode(config.username ?? "")}:${encode(config.password ?? "")}@${endpoint}`;
}
2037
/**
 * Split a proxy URL into its components.
 *
 * The URL parser reports `username` and `password` in percent-encoded form,
 * so both are decoded here to give callers the actual credentials. A
 * credential containing a malformed escape sequence is returned as-is.
 *
 * @param url - Proxy URL, e.g. `http://user:pass@host:8080`
 * @returns `{ url, username, password, host, port }`; `port` is `undefined`
 *          when the parsed URL reports no port
 * @throws Error with message `Invalid proxy URL: ...` when `url` cannot be parsed
 */
function parseProxyUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new Error(`Invalid proxy URL: ${url}`);
  }
  const decode = (value) => {
    try {
      return decodeURIComponent(value);
    } catch {
      // Not valid percent-encoding (e.g. a bare "%"): keep the raw text.
      return value;
    }
  };
  return {
    url,
    username: decode(parsed.username),
    password: decode(parsed.password),
    host: parsed.hostname,
    port: parsed.port ? parseInt(parsed.port, 10) : void 0
  };
}
2051
+
2052
+ // src/browser/hero-config.ts
2053
+ function createHeroConfig(options = {}) {
2054
+ const config = {
2055
+ // Show or hide Chrome window
2056
+ showChrome: options.showChrome ?? false,
2057
+ // ============================================================================
2058
+ // CRITICAL: TLS fingerprint emulation
2059
+ // ============================================================================
2060
+ // Setting disableMitm to false enables TLS/TCP fingerprint emulation
2061
+ // This is ESSENTIAL for bypassing Cloudflare and other anti-bot systems
2062
+ disableMitm: false,
2063
+ // ============================================================================
2064
+ // Session management
2065
+ // ============================================================================
2066
+ // Use incognito for clean session state
2067
+ disableIncognito: false,
2068
+ // ============================================================================
2069
+ // Docker compatibility
2070
+ // ============================================================================
2071
+ // Required when running in containerized environments
2072
+ noChromeSandbox: true,
2073
+ // ============================================================================
2074
+ // DNS over TLS (mimics Chrome behavior)
2075
+ // ============================================================================
2076
+ // Using Cloudflare's DNS (1.1.1.1) over TLS makes the connection
2077
+ // look more like a real Chrome browser
2078
+ dnsOverTlsProvider: {
2079
+ host: "1.1.1.1",
2080
+ servername: "cloudflare-dns.com"
2081
+ },
2082
+ // ============================================================================
2083
+ // WebRTC IP leak prevention
2084
+ // ============================================================================
2085
+ // Masks the real IP address in WebRTC connections
2086
+ // Uses ipify.org to detect the public IP
2087
+ upstreamProxyIpMask: {
2088
+ ipLookupService: "https://api.ipify.org?format=json"
2089
+ },
2090
+ // ============================================================================
2091
+ // Locale and timezone
2092
+ // ============================================================================
2093
+ locale: "en-US",
2094
+ timezoneId: "America/New_York",
2095
+ // ============================================================================
2096
+ // Viewport (standard desktop size)
2097
+ // ============================================================================
2098
+ viewport: {
2099
+ width: 1920,
2100
+ height: 1080
2101
+ },
2102
+ // ============================================================================
2103
+ // User agent (if provided)
2104
+ // ============================================================================
2105
+ ...options.userAgent && { userAgent: options.userAgent },
2106
+ // ============================================================================
2107
+ // Connection to Core (if provided)
2108
+ // ============================================================================
2109
+ ...options.connectionToCore && { connectionToCore: options.connectionToCore }
2110
+ };
2111
+ if (options.proxy) {
2112
+ config.upstreamProxyUrl = createProxyUrl(options.proxy);
2113
+ config.upstreamProxyUseSystemDns = false;
2114
+ }
2115
+ return config;
2116
+ }
2117
+
2118
+ // src/browser/pool.ts
2119
// Defaults merged underneath the caller-supplied config in the BrowserPool
// constructor. All durations are in milliseconds.
var DEFAULT_POOL_CONFIG = {
  size: 2,
  retireAfterPageCount: 100,
  retireAfterAgeMs: 30 * 60 * 1000, // 30 minutes
  recycleCheckInterval: 60 * 1000, // 1 minute
  healthCheckInterval: 5 * 60 * 1000, // 5 minutes
  maxConsecutiveFailures: 3,
  maxQueueSize: 100,
  queueTimeout: 60 * 1000 // 1 minute
};
2133
/**
 * Produce an identifier for a pooled browser: the prefix "browser_", the
 * current epoch milliseconds, and up to seven random base-36 characters.
 */
function generateId() {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 9);
  return ["browser", timestamp, suffix].join("_");
}
2136
/**
 * Fixed-size pool of Hero browser instances.
 *
 * Callers borrow a browser with `acquire()` / `release()` (or `withBrowser()`,
 * which does both). When every browser is busy, requests wait in a bounded
 * queue with a timeout. Instances are replaced once they exceed the configured
 * page count or age, either when released or by a periodic background check.
 */
var BrowserPool = class {
  // Every instance currently owned by the pool, whatever its status
  instances = [];
  // Instances that can be handed out right now
  available = [];
  // Instances currently lent to a caller
  inUse = /* @__PURE__ */ new Set();
  // Pending acquire requests: { resolve, reject, queuedAt, timer }
  queue = [];
  config;
  proxy;
  recycleTimer;
  healthTimer;
  // Counters behind getStats(); only successful withBrowser() calls are counted
  totalRequests = 0;
  totalRequestDuration = 0;
  showChrome;
  connectionToCore;
  userAgent;
  verbose;
  logger = createLogger("pool");
  /**
   * @param config - Partial pool config, merged over DEFAULT_POOL_CONFIG
   * @param proxy - Proxy configuration applied to every browser (optional)
   * @param showChrome - Whether browser windows are shown
   * @param connectionToCore - Connection passed to each Hero instance (optional)
   * @param userAgent - User agent override (optional)
   * @param verbose - Emit informational log lines
   */
  constructor(config = {}, proxy, showChrome = false, connectionToCore, userAgent, verbose = false) {
    this.config = { ...DEFAULT_POOL_CONFIG, ...config };
    this.proxy = proxy;
    this.showChrome = showChrome;
    this.connectionToCore = connectionToCore;
    this.userAgent = userAgent;
    this.verbose = verbose;
  }
  /**
   * Initialize the pool by pre-launching browsers and starting the
   * recycling and health-check timers.
   */
  async initialize() {
    if (this.verbose) {
      this.logger.info(`Initializing pool with ${this.config.size} browsers...`);
    }
    const launchPromises = [];
    for (let i = 0; i < this.config.size; i++) {
      launchPromises.push(this.createInstance());
    }
    this.instances = await Promise.all(launchPromises);
    this.available = [...this.instances];
    this.startRecycling();
    this.startHealthChecks();
    if (this.verbose) {
      this.logger.info(`Pool ready: ${this.instances.length} browsers available`);
    }
  }
  /**
   * Shutdown the pool: stop timers, reject queued requests, close all
   * browsers and disconnect from the core connection (errors while closing
   * are ignored).
   */
  async shutdown() {
    if (this.verbose) {
      const stats = this.getStats();
      this.logger.info(
        `Shutting down pool: ${stats.totalRequests} total requests processed, ${Math.round(stats.avgRequestDuration)}ms avg duration`
      );
    }
    if (this.recycleTimer) clearInterval(this.recycleTimer);
    if (this.healthTimer) clearInterval(this.healthTimer);
    for (const item of this.queue) {
      // Cancel the pending timeout so it cannot keep the process alive.
      clearTimeout(item.timer);
      item.reject(new Error("Pool shutting down"));
    }
    this.queue = [];
    const closePromises = this.instances.map((instance) => instance.hero.close().catch(() => {
    }));
    await Promise.all(closePromises);
    if (this.connectionToCore) {
      try {
        await this.connectionToCore.disconnect();
      } catch {
      }
      this.connectionToCore = void 0;
    }
    this.instances = [];
    this.available = [];
    this.inUse.clear();
  }
  /**
   * Acquire a browser from the pool.
   *
   * @returns The Hero instance; when none is idle, a promise that settles
   *          once a browser is free, the queue is full, or the wait times out
   */
  async acquire() {
    const instance = this.available.shift();
    if (!instance) {
      if (this.verbose) {
        this.logger.info(`No browsers available, queuing request (queue: ${this.queue.length + 1})`);
      }
      return this.queueRequest();
    }
    instance.status = "busy";
    instance.lastUsed = Date.now();
    this.inUse.add(instance);
    if (this.verbose) {
      this.logger.info(
        `Acquired browser ${instance.id} (available: ${this.available.length}, busy: ${this.inUse.size})`
      );
    }
    return instance.hero;
  }
  /**
   * Release a browser back to the pool. Unknown heroes and browsers that are
   * not currently lent out are ignored, so releasing twice is harmless.
   */
  release(hero) {
    const instance = this.instances.find((i) => i.hero === hero);
    if (!instance) return;
    // A second release of the same browser must not add it to `available`
    // twice, otherwise two callers could be handed the same instance.
    if (!this.inUse.delete(instance)) return;
    instance.status = "idle";
    instance.requestCount++;
    if (this.verbose) {
      this.logger.info(
        `Released browser ${instance.id} (requests: ${instance.requestCount}, available: ${this.available.length + 1})`
      );
    }
    if (this.shouldRecycle(instance)) {
      if (this.verbose) {
        this.logger.info(`Recycling browser ${instance.id} (age or request limit reached)`);
      }
      this.recycleInstance(instance).catch(() => {
      });
    } else {
      this.available.push(instance);
      this.processQueue();
    }
  }
  /**
   * Execute callback with an auto-managed browser: acquire, run, always
   * release. Request statistics are updated only when the callback succeeds.
   */
  async withBrowser(callback) {
    const startTime = Date.now();
    const hero = await this.acquire();
    try {
      const result = await callback(hero);
      this.totalRequests++;
      this.totalRequestDuration += Date.now() - startTime;
      return result;
    } finally {
      this.release(hero);
    }
  }
  /**
   * Get pool statistics (instance counts by state, queue length, request
   * totals and average duration in milliseconds).
   */
  getStats() {
    const recycling = this.instances.filter((i) => i.status === "recycling").length;
    const unhealthy = this.instances.filter((i) => i.status === "unhealthy").length;
    return {
      total: this.instances.length,
      available: this.available.length,
      busy: this.inUse.size,
      recycling,
      unhealthy,
      queueLength: this.queue.length,
      totalRequests: this.totalRequests,
      avgRequestDuration: this.totalRequests > 0 ? this.totalRequestDuration / this.totalRequests : 0
    };
  }
  /**
   * Run health check: reports unhealthy instances, a queue above 80% of its
   * capacity, and saturation (nothing available while requests wait).
   */
  async healthCheck() {
    const issues = [];
    const stats = this.getStats();
    if (stats.unhealthy > 0) {
      issues.push(`${stats.unhealthy} unhealthy instances`);
    }
    if (stats.queueLength > this.config.maxQueueSize * 0.8) {
      issues.push(`Queue near capacity: ${stats.queueLength}/${this.config.maxQueueSize}`);
    }
    if (stats.available === 0 && stats.queueLength > 0) {
      issues.push("Pool saturated - all browsers busy with pending requests");
    }
    return {
      healthy: issues.length === 0,
      issues,
      stats
    };
  }
  // =========================================================================
  // Private methods
  // =========================================================================
  /**
   * Create a new browser instance record wrapping a fresh Hero.
   */
  async createInstance() {
    const heroConfig = createHeroConfig({
      proxy: this.proxy,
      showChrome: this.showChrome,
      connectionToCore: this.connectionToCore,
      userAgent: this.userAgent
    });
    const hero = new Hero(heroConfig);
    return {
      hero,
      id: generateId(),
      createdAt: Date.now(),
      lastUsed: Date.now(),
      requestCount: 0,
      status: "idle"
    };
  }
  /**
   * Check if instance should be recycled (page count or age limit reached).
   */
  shouldRecycle(instance) {
    const age = Date.now() - instance.createdAt;
    return instance.requestCount >= this.config.retireAfterPageCount || age >= this.config.retireAfterAgeMs;
  }
  /**
   * Recycle an instance (close old, create new). On failure the old instance
   * is marked "unhealthy" and stays out of the available list.
   */
  async recycleInstance(instance) {
    instance.status = "recycling";
    // The background recycler picks idle instances, which still sit in
    // `available`. Pull the instance out first so acquire() can never hand
    // out a browser that is being closed.
    const idleIndex = this.available.indexOf(instance);
    if (idleIndex !== -1) {
      this.available.splice(idleIndex, 1);
    }
    try {
      await instance.hero.close().catch(() => {
      });
      const newInstance = await this.createInstance();
      const index = this.instances.indexOf(instance);
      if (index === -1) {
        // The pool no longer tracks this instance (it was shut down while we
        // were recycling): discard the replacement instead of leaking it.
        await newInstance.hero.close().catch(() => {
        });
        return;
      }
      this.instances[index] = newInstance;
      this.available.push(newInstance);
      if (this.verbose) {
        this.logger.info(`Recycled browser: ${instance.id} \u2192 ${newInstance.id}`);
      }
      this.processQueue();
    } catch (error) {
      instance.status = "unhealthy";
      if (this.verbose) {
        this.logger.warn(`Failed to recycle browser ${instance.id}`);
      }
    }
  }
  /**
   * Queue a request when no browsers available. Rejects immediately with
   * "Queue full" at capacity, or with "Queue timeout" after `queueTimeout`.
   */
  queueRequest() {
    return new Promise((resolve, reject) => {
      if (this.queue.length >= this.config.maxQueueSize) {
        reject(new Error("Queue full"));
        return;
      }
      const item = {
        resolve,
        reject,
        queuedAt: Date.now(),
        timer: void 0
      };
      this.queue.push(item);
      // The handle is kept on the item so it can be cancelled as soon as the
      // request is served or the pool shuts down.
      item.timer = setTimeout(() => {
        const index = this.queue.indexOf(item);
        if (index !== -1) {
          this.queue.splice(index, 1);
          reject(new Error("Queue timeout"));
        }
      }, this.config.queueTimeout);
    });
  }
  /**
   * Process queued requests while both a waiter and an idle browser exist.
   */
  processQueue() {
    while (this.queue.length > 0 && this.available.length > 0) {
      const item = this.queue.shift();
      clearTimeout(item.timer);
      const age = Date.now() - item.queuedAt;
      if (age > this.config.queueTimeout) {
        item.reject(new Error("Queue timeout"));
        continue;
      }
      this.acquire().then(item.resolve).catch(item.reject);
    }
  }
  /**
   * Start background recycling task for idle instances past their limits.
   * The timer is unref'd so it does not keep the process alive.
   */
  startRecycling() {
    this.recycleTimer = setInterval(() => {
      for (const instance of this.instances) {
        if (instance.status === "idle" && this.shouldRecycle(instance)) {
          this.recycleInstance(instance).catch(() => {
          });
        }
      }
    }, this.config.recycleCheckInterval);
    this.recycleTimer.unref();
  }
  /**
   * Start background health checks; issues are written with console.warn.
   */
  startHealthChecks() {
    this.healthTimer = setInterval(async () => {
      const health = await this.healthCheck();
      if (!health.healthy && health.issues.length > 0) {
        console.warn("[BrowserPool] Health issues:", health.issues);
      }
    }, this.config.healthCheckInterval);
    this.healthTimer.unref();
  }
};
2428
+
2429
+ // src/client.ts
2430
// Module-level logger used by ReaderClient; created with the tag "client".
var logger2 = createLogger("client");
2431
/**
 * High-level client that owns a HeroCore instance and a BrowserPool and
 * exposes `scrape()` and `crawl()` on top of them.
 *
 * Initialization is lazy (first scrape/crawl) unless `start()` is called
 * explicitly. Process-exit and signal handlers are registered on
 * construction and removed again by `close()`.
 */
var ReaderClient = class {
  heroCore = null;
  pool = null;
  initialized = false;
  // In-flight initialization promise, shared by concurrent start() callers
  initializing = null;
  closed = false;
  options;
  // Cursor for round-robin proxy rotation
  proxyIndex = 0;
  // Listener registered for "beforeExit"
  cleanupHandler = null;
  // Listener registered for both SIGINT and SIGTERM
  signalHandler = null;
  constructor(options = {}) {
    this.options = options;
    // NOTE(review): TLS verification is skipped unless the caller passes
    // skipTLSVerification: false, and the flag is process-wide (environment
    // variable), so it affects every client in this process.
    const skipTLS = options.skipTLSVerification ?? true;
    if (skipTLS) {
      process.env.MITM_ALLOW_INSECURE = "true";
    }
    this.registerCleanup();
  }
  /**
   * Get the next proxy from the rotation pool.
   *
   * @returns A proxy from `options.proxies` (random or round-robin depending
   *          on `options.proxyRotation`), or `undefined` when none are configured
   */
  getNextProxy() {
    const { proxies, proxyRotation = "round-robin" } = this.options;
    if (!proxies || proxies.length === 0) {
      return void 0;
    }
    if (proxyRotation === "random") {
      return proxies[Math.floor(Math.random() * proxies.length)];
    }
    const proxy = proxies[this.proxyIndex % proxies.length];
    this.proxyIndex++;
    return proxy;
  }
  /**
   * Initialize HeroCore. Called automatically on first scrape/crawl.
   * Can be called explicitly if you want to pre-warm the client.
   *
   * Concurrent callers share one initialization attempt. If that attempt
   * fails, the error is thrown to every waiter and the next call retries.
   *
   * @throws Error when the client was closed or initialization fails
   */
  async start() {
    if (this.closed) {
      throw new Error("ReaderClient has been closed. Create a new instance.");
    }
    if (this.initialized) {
      return;
    }
    if (!this.initializing) {
      // Reset the marker on success AND failure; otherwise a rejected promise
      // would be cached forever and the client could never be started again.
      this.initializing = this.initializeCore().finally(() => {
        this.initializing = null;
      });
    }
    await this.initializing;
  }
  /**
   * Internal initialization logic: start HeroCore, then build and initialize
   * the browser pool. On failure everything created so far is torn down and
   * an Error prefixed with "Failed to start HeroCore" is thrown.
   */
  async initializeCore() {
    try {
      if (this.options.verbose) {
        logger2.info("Starting HeroCore...");
      }
      this.heroCore = new HeroCore();
      await this.heroCore.start();
      if (this.options.verbose) {
        logger2.info("HeroCore started successfully");
      }
      if (this.options.verbose) {
        logger2.info("Initializing browser pool...");
      }
      const browserPoolConfig = this.options.browserPool;
      const poolConfig = {
        size: browserPoolConfig?.size ?? 2,
        retireAfterPageCount: browserPoolConfig?.retireAfterPages ?? 100,
        retireAfterAgeMs: (browserPoolConfig?.retireAfterMinutes ?? 30) * 60 * 1e3,
        maxQueueSize: browserPoolConfig?.maxQueueSize ?? 100
      };
      this.pool = new BrowserPool(
        poolConfig,
        void 0,
        // proxy set per-request
        this.options.showChrome,
        this.createConnection(),
        void 0,
        // userAgent
        this.options.verbose
      );
      await this.pool.initialize();
      this.initialized = true;
      if (this.options.verbose) {
        logger2.info("Browser pool initialized successfully");
      }
    } catch (error) {
      if (this.pool) {
        await this.pool.shutdown().catch(() => {
        });
        this.pool = null;
      }
      if (this.heroCore) {
        await this.heroCore.close().catch(() => {
        });
        this.heroCore = null;
      }
      this.initialized = false;
      const message = error.message || String(error);
      if (message.includes("EADDRINUSE")) {
        throw new Error(
          "Failed to start HeroCore: Port already in use. Another instance may be running. Close it or use a different port."
        );
      }
      if (message.includes("chrome") || message.includes("Chrome")) {
        throw new Error(
          "Failed to start HeroCore: Chrome/Chromium not found. Please install Chrome or set CHROME_PATH environment variable."
        );
      }
      throw new Error(`Failed to start HeroCore: ${message}`);
    }
  }
  /**
   * Create a connection to the HeroCore instance through an in-process
   * transport bridge.
   */
  createConnection() {
    if (!this.heroCore) {
      throw new Error("HeroCore not initialized. This should not happen.");
    }
    const bridge = new TransportBridge();
    this.heroCore.addConnection(bridge.transportToClient);
    return new ConnectionToHeroCore(bridge.transportToCore);
  }
  /**
   * Ensure client is initialized before operation
   */
  async ensureInitialized() {
    if (this.closed) {
      throw new Error("ReaderClient has been closed. Create a new instance.");
    }
    if (!this.initialized) {
      await this.start();
    }
  }
  /**
   * Scrape one or more URLs
   *
   * @param options - Scrape options (urls, formats, etc.)
   * @returns Scrape result with data and metadata
   *
   * @example
   * const result = await reader.scrape({
   *   urls: ['https://example.com'],
   *   formats: ['markdown', 'html'],
   * });
   */
  async scrape(options) {
    await this.ensureInitialized();
    if (!this.pool) {
      throw new Error("Browser pool not initialized. This should not happen.");
    }
    const proxy = options.proxy ?? this.getNextProxy();
    return await scrape({
      ...options,
      proxy,
      showChrome: options.showChrome ?? this.options.showChrome,
      verbose: options.verbose ?? this.options.verbose,
      pool: this.pool
    });
  }
  /**
   * Crawl a website to discover URLs
   *
   * Like `scrape()`, per-call `showChrome` / `verbose` fall back to the
   * values given to the client constructor.
   *
   * @param options - Crawl options (url, depth, maxPages, etc.)
   * @returns Crawl result with discovered URLs and optional scraped content
   *
   * @example
   * const result = await reader.crawl({
   *   url: 'https://example.com',
   *   depth: 2,
   *   maxPages: 50,
   *   scrape: true,
   * });
   */
  async crawl(options) {
    await this.ensureInitialized();
    if (!this.pool) {
      throw new Error("Browser pool not initialized. This should not happen.");
    }
    const proxy = options.proxy ?? this.getNextProxy();
    return await crawl({
      ...options,
      proxy,
      showChrome: options.showChrome ?? this.options.showChrome,
      verbose: options.verbose ?? this.options.verbose,
      pool: this.pool
    });
  }
  /**
   * Check if the client is initialized and ready
   */
  isReady() {
    return this.initialized && !this.closed;
  }
  /**
   * Close the client and release resources
   *
   * Safe to call more than once. Errors from shutting down the pool or
   * HeroCore are logged (in verbose mode) rather than thrown.
   *
   * Note: This is optional - the client will auto-close on process exit.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.removeCleanupHandlers();
    if (this.pool) {
      if (this.options.verbose) {
        logger2.info("Shutting down browser pool...");
      }
      try {
        await this.pool.shutdown();
      } catch (error) {
        if (this.options.verbose) {
          logger2.warn(`Error shutting down pool: ${error.message}`);
        }
      }
      this.pool = null;
    }
    if (this.heroCore) {
      if (this.options.verbose) {
        logger2.info("Closing HeroCore...");
      }
      try {
        await this.heroCore.close();
        await HeroCore.shutdown();
      } catch (error) {
        if (this.options.verbose) {
          logger2.warn(`Error closing HeroCore: ${error.message}`);
        }
      }
      this.heroCore = null;
    }
    this.initialized = false;
    if (this.options.verbose) {
      logger2.info("ReaderClient closed");
    }
  }
  /**
   * Register cleanup handlers for process exit: close on "beforeExit", and
   * close then exit with code 0 on SIGINT / SIGTERM.
   */
  registerCleanup() {
    this.cleanupHandler = async () => {
      await this.close();
    };
    // Kept as a named reference so removeCleanupHandlers() can detach it;
    // anonymous listeners would pile up on `process` for every client created.
    this.signalHandler = async () => {
      try {
        await this.cleanupHandler?.();
      } finally {
        process.exit(0);
      }
    };
    process.once("beforeExit", this.cleanupHandler);
    process.once("SIGINT", this.signalHandler);
    process.once("SIGTERM", this.signalHandler);
  }
  /**
   * Remove process cleanup handlers (exit and signal listeners).
   */
  removeCleanupHandlers() {
    if (this.cleanupHandler) {
      process.removeListener("beforeExit", this.cleanupHandler);
      this.cleanupHandler = null;
    }
    if (this.signalHandler) {
      process.removeListener("SIGINT", this.signalHandler);
      process.removeListener("SIGTERM", this.signalHandler);
      this.signalHandler = null;
    }
  }
};
2696
+
2697
+ // src/daemon/server.ts
2698
+ import http from "http";
2699
// Module-level logger used by DaemonServer; created with the tag "daemon".
var logger3 = createLogger("daemon");
// Port used by the daemon server and client when none is specified.
var DEFAULT_DAEMON_PORT = 3847;
// Name of the PID file; it is joined with os.tmpdir() wherever it is used.
var PID_FILE_NAME = ".reader-daemon.pid";
2702
/**
 * HTTP daemon that keeps one ReaderClient (and its browser pool) alive and
 * serves JSON commands posted to "/": `scrape`, `crawl`, `status`, `shutdown`.
 *
 * Every response is JSON of the form `{ success, data }` or `{ success, error }`.
 * While running, a PID file describing the daemon is kept in os.tmpdir().
 */
var DaemonServer = class {
  server = null;
  client = null;
  options;
  // Epoch milliseconds at which the server started listening
  startTime = 0;
  /**
   * @param options - `port` (default DEFAULT_DAEMON_PORT), `poolSize`
   *                  (default 5), `verbose` and `showChrome` (default false)
   */
  constructor(options = {}) {
    this.options = {
      port: options.port ?? DEFAULT_DAEMON_PORT,
      poolSize: options.poolSize ?? 5,
      verbose: options.verbose ?? false,
      showChrome: options.showChrome ?? false
    };
  }
  /**
   * Start the daemon server: start the ReaderClient, listen on the
   * configured port and write the PID file.
   *
   * If any step fails, whatever was started is torn down again before the
   * error is rethrown, so a later start() can be attempted.
   *
   * @throws Error when already running, when the port is in use, or when the
   *         client / PID file cannot be set up
   */
  async start() {
    if (this.server) {
      throw new Error("Daemon is already running");
    }
    const clientOptions = {
      verbose: this.options.verbose,
      showChrome: this.options.showChrome,
      browserPool: {
        size: this.options.poolSize
      }
    };
    try {
      this.client = new ReaderClient(clientOptions);
      await this.client.start();
      this.server = http.createServer(this.handleRequest.bind(this));
      await new Promise((resolve, reject) => {
        // NOTE(review): no host is passed, so Node listens on all interfaces
        // while the endpoint is unauthenticated (it can trigger scrapes and
        // shut the daemon down). Consider binding to 127.0.0.1, which is the
        // address DaemonClient connects to.
        this.server.listen(this.options.port, () => {
          this.startTime = Date.now();
          if (this.options.verbose) {
            logger3.info(`Daemon started on port ${this.options.port} with pool size ${this.options.poolSize}`);
          }
          resolve();
        });
        this.server.on("error", (error) => {
          if (error.code === "EADDRINUSE") {
            reject(new Error(`Port ${this.options.port} is already in use. Is another daemon running?`));
          } else {
            reject(error);
          }
        });
      });
      await this.writePidFile();
    } catch (error) {
      // Undo the partial start so no browsers or sockets are left behind.
      const server = this.server;
      const client = this.client;
      this.server = null;
      this.client = null;
      if (server && server.listening) {
        await new Promise((resolve) => server.close(() => resolve()));
      }
      if (client) {
        await client.close().catch(() => {
        });
      }
      throw error;
    }
  }
  /**
   * Stop the daemon server: close the HTTP server, close the client and
   * remove the PID file.
   */
  async stop() {
    if (this.server) {
      await new Promise((resolve) => {
        this.server.close(() => resolve());
      });
      this.server = null;
    }
    if (this.client) {
      await this.client.close();
      this.client = null;
    }
    await this.removePidFile();
    if (this.options.verbose) {
      logger3.info("Daemon stopped");
    }
  }
  /**
   * Get the port the daemon is running on
   */
  getPort() {
    return this.options.port;
  }
  /**
   * Handle incoming HTTP requests.
   *
   * Only `POST /` is accepted (anything else gets 404). The body must be
   * JSON with an `action` field; invalid JSON or an unknown action gets 400,
   * and an error thrown by a handler gets 500 with the error message.
   */
  async handleRequest(req, res) {
    if (req.method !== "POST" || req.url !== "/") {
      res.writeHead(404, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ success: false, error: "Not found" }));
      return;
    }
    let body = "";
    try {
      // Decode at stream level: concatenating raw Buffer chunks would corrupt
      // a multi-byte UTF-8 character that straddles two chunks.
      req.setEncoding("utf8");
      for await (const chunk of req) {
        body += chunk;
      }
    } catch {
      // The request stream failed (for example the client went away). Nothing
      // useful can be processed; answer only if the connection is still usable.
      if (!res.headersSent && !res.destroyed) {
        this.sendResponse(res, 400, { success: false, error: "Failed to read request body" });
      }
      return;
    }
    let request;
    try {
      request = JSON.parse(body);
    } catch {
      this.sendResponse(res, 400, { success: false, error: "Invalid JSON" });
      return;
    }
    try {
      // Optional chaining: a JSON body of `null` falls through to "Unknown action".
      switch (request?.action) {
        case "scrape":
          await this.handleScrape(res, request.options);
          break;
        case "crawl":
          await this.handleCrawl(res, request.options);
          break;
        case "status":
          this.handleStatus(res);
          break;
        case "shutdown":
          await this.handleShutdown(res);
          break;
        default:
          this.sendResponse(res, 400, { success: false, error: "Unknown action" });
      }
    } catch (error) {
      if (res.headersSent) {
        // A response was already started; just make sure it is finished.
        if (!res.writableEnded) res.end();
        return;
      }
      this.sendResponse(res, 500, { success: false, error: error?.message ?? String(error) });
    }
  }
  /**
   * Handle scrape request
   */
  async handleScrape(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.scrape(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle crawl request
   */
  async handleCrawl(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.crawl(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle status request: reports port, pool size, uptime (ms) and pid.
   */
  handleStatus(res) {
    const status = {
      running: true,
      port: this.options.port,
      poolSize: this.options.poolSize,
      uptime: Date.now() - this.startTime,
      pid: process.pid
    };
    this.sendResponse(res, 200, { success: true, data: status });
  }
  /**
   * Handle shutdown request: acknowledge first, then stop and exit shortly
   * afterwards so the response can be delivered.
   */
  async handleShutdown(res) {
    this.sendResponse(res, 200, { success: true, data: { message: "Shutting down" } });
    setTimeout(() => {
      this.stop().then(
        () => process.exit(0),
        (error) => {
          // Without this branch a failed stop() would leave the daemon
          // half-stopped with an unhandled rejection.
          logger3.error(`Error while stopping daemon: ${error?.message ?? error}`);
          process.exit(1);
        }
      );
    }, 100);
  }
  /**
   * Send JSON response
   */
  sendResponse(res, statusCode, data) {
    // Serialize before touching the response so a serialization failure can
    // still be reported with a clean 500 by the caller.
    const payload = JSON.stringify(data);
    res.writeHead(statusCode, { "Content-Type": "application/json" });
    res.end(payload);
  }
  /**
   * Write PID file (pid, port and ISO start time as JSON) into os.tmpdir().
   */
  async writePidFile() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const os = await import("os");
    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);
    const data = JSON.stringify({
      pid: process.pid,
      port: this.options.port,
      startedAt: (/* @__PURE__ */ new Date()).toISOString()
    });
    await fs.writeFile(pidFile, data);
  }
  /**
   * Remove PID file; a missing file or other unlink error is ignored.
   */
  async removePidFile() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const os = await import("os");
    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);
    try {
      await fs.unlink(pidFile);
    } catch {
    }
  }
};
2896
/**
 * Resolve the location of the daemon PID file.
 *
 * @returns PID_FILE_NAME joined onto the operating system's temp directory
 */
async function getPidFilePath() {
  const pathModule = await import("path");
  const osModule = await import("os");
  const tempDir = osModule.tmpdir();
  return pathModule.join(tempDir, PID_FILE_NAME);
}
2901
/**
 * Read the daemon PID file and check that the recorded process still exists.
 *
 * Liveness is probed with signal 0, which performs the existence/permission
 * check without delivering a signal. An EPERM failure means the process
 * exists but belongs to another user, so it counts as running. Any other
 * failure is treated as a stale PID file, which is then deleted.
 *
 * @returns The parsed PID file content, or `null` when the file is missing,
 *          unreadable, not valid JSON, or stale
 */
async function getDaemonInfo() {
  const fs = await import("fs/promises");
  const pidFile = await getPidFilePath();
  let info;
  try {
    info = JSON.parse(await fs.readFile(pidFile, "utf-8"));
  } catch {
    return null;
  }
  try {
    process.kill(info.pid, 0);
    return info;
  } catch (error) {
    if (error?.code === "EPERM") {
      // The process is alive; we just lack permission to signal it.
      return info;
    }
    await fs.unlink(pidFile).catch(() => {
    });
    return null;
  }
}
2919
+
2920
+ // src/daemon/client.ts
2921
+ import http2 from "http";
2922
+ var DaemonClient = class {
2923
+ options;
2924
+ constructor(options = {}) {
2925
+ this.options = {
2926
+ port: options.port ?? DEFAULT_DAEMON_PORT,
2927
+ timeoutMs: options.timeoutMs ?? 6e5
2928
+ // 10 minutes default
2929
+ };
2930
+ }
2931
+ /**
2932
+ * Scrape URLs via daemon
2933
+ */
2934
+ async scrape(options) {
2935
+ return this.request({
2936
+ action: "scrape",
2937
+ options
2938
+ });
2939
+ }
2940
+ /**
2941
+ * Crawl URL via daemon
2942
+ */
2943
+ async crawl(options) {
2944
+ return this.request({
2945
+ action: "crawl",
2946
+ options
2947
+ });
2948
+ }
2949
+ /**
2950
+ * Get daemon status
2951
+ */
2952
+ async status() {
2953
+ return this.request({
2954
+ action: "status"
2955
+ });
2956
+ }
2957
+ /**
2958
+ * Request daemon shutdown
2959
+ */
2960
+ async shutdown() {
2961
+ await this.request({
2962
+ action: "shutdown"
2963
+ });
2964
+ }
2965
+ /**
2966
+ * Check if daemon is reachable
2967
+ */
2968
+ async isRunning() {
2969
+ try {
2970
+ await this.status();
2971
+ return true;
2972
+ } catch {
2973
+ return false;
2974
+ }
2975
+ }
2976
+ /**
2977
+ * Make HTTP request to daemon
2978
+ */
2979
+ request(body) {
2980
+ return new Promise((resolve, reject) => {
2981
+ const data = JSON.stringify(body);
2982
+ const req = http2.request(
2983
+ {
2984
+ hostname: "127.0.0.1",
2985
+ port: this.options.port,
2986
+ path: "/",
2987
+ method: "POST",
2988
+ headers: {
2989
+ "Content-Type": "application/json",
2990
+ "Content-Length": Buffer.byteLength(data)
2991
+ },
2992
+ timeout: this.options.timeoutMs
2993
+ },
2994
+ (res) => {
2995
+ let responseBody = "";
2996
+ res.on("data", (chunk) => {
2997
+ responseBody += chunk;
2998
+ });
2999
+ res.on("end", () => {
3000
+ try {
3001
+ const response = JSON.parse(responseBody);
3002
+ if (response.success) {
3003
+ resolve(response.data);
3004
+ } else {
3005
+ reject(new Error(response.error || "Unknown daemon error"));
3006
+ }
3007
+ } catch (error) {
3008
+ reject(new Error(`Failed to parse daemon response: ${responseBody}`));
3009
+ }
3010
+ });
3011
+ }
3012
+ );
3013
+ req.on("error", (error) => {
3014
+ if (error.code === "ECONNREFUSED") {
3015
+ reject(new Error(`Cannot connect to daemon on port ${this.options.port}. Is it running?`));
3016
+ } else {
3017
+ reject(error);
3018
+ }
3019
+ });
3020
+ req.on("timeout", () => {
3021
+ req.destroy();
3022
+ reject(new Error(`Request to daemon timed out after ${this.options.timeoutMs}ms`));
3023
+ });
3024
+ req.write(data);
3025
+ req.end();
3026
+ });
3027
+ }
3028
+ };
3029
/**
 * Probe whether a daemon answers on the given port.
 *
 * Uses a DaemonClient with a 5 second request timeout.
 *
 * @param port - Port to probe (defaults to DEFAULT_DAEMON_PORT)
 * @returns `true` when a status request succeeds, otherwise `false`
 */
async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
  const probeOptions = { port, timeoutMs: 5000 };
  const probe = new DaemonClient(probeOptions);
  const reachable = await probe.isRunning();
  return reachable;
}
3033
+ export {
3034
+ BrowserPool,
3035
+ Crawler,
3036
+ DEFAULT_DAEMON_PORT,
3037
+ DEFAULT_OPTIONS,
3038
+ DaemonClient,
3039
+ DaemonServer,
3040
+ BrowserPool as HeroBrowserPool,
3041
+ ReaderClient,
3042
+ Scraper,
3043
+ cleanContent,
3044
+ crawl,
3045
+ createHeroConfig,
3046
+ createProxyUrl,
3047
+ detectChallenge,
3048
+ extractMetadata,
3049
+ formatToHTML,
3050
+ formatToJson,
3051
+ formatToJsonLite,
3052
+ formatToMarkdown,
3053
+ formatToText,
3054
+ getDaemonInfo,
3055
+ getPidFilePath,
3056
+ getUrlKey,
3057
+ handleChallenge,
3058
+ isChallengePage,
3059
+ isDaemonRunning,
3060
+ isSameDomain,
3061
+ isValidFormat,
3062
+ isValidUrl,
3063
+ parseProxyUrl,
3064
+ rateLimit,
3065
+ resolveUrl,
3066
+ scrape,
3067
+ shouldCrawlUrl,
3068
+ shouldCrawlUrl2 as shouldCrawlUrlFn,
3069
+ validateUrls,
3070
+ waitForChallengeResolution,
3071
+ waitForSelector
3072
+ };
3073
+ //# sourceMappingURL=index.js.map