@vakra-dev/reader 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3046 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli/index.ts
4
+ import { Command } from "commander";
5
+
6
+ // src/client.ts
7
+ import HeroCore from "@ulixee/hero-core";
8
+ import { TransportBridge } from "@ulixee/net";
9
+ import { ConnectionToHeroCore } from "@ulixee/hero";
10
+
11
+ // src/scraper.ts
12
+ import pLimit from "p-limit";
13
+
14
+ // src/cloudflare/detector.ts
15
+ var CHALLENGE_DOM_SELECTORS = [
16
+ "#challenge-running",
17
+ "#challenge-stage",
18
+ "#challenge-form",
19
+ ".cf-browser-verification"
20
+ ];
21
+ var CHALLENGE_TEXT_PATTERNS = [
22
+ "verifying you are human",
23
+ "checking if the site connection is secure",
24
+ "this process is automatic. your browser will redirect"
25
+ ];
26
+ var BLOCKED_SIGNALS = [
27
+ "you have been blocked",
28
+ "access to this page has been denied",
29
+ "sorry, you have been blocked",
30
+ "access denied",
31
+ "403 forbidden"
32
+ ];
33
+ async function detectChallenge(hero) {
34
+ const signals = [];
35
+ let type = "none";
36
+ try {
37
+ if (!hero.document) {
38
+ return {
39
+ isChallenge: false,
40
+ type: "none",
41
+ confidence: 0,
42
+ signals: ["No document available"]
43
+ };
44
+ }
45
+ const html = await hero.document.documentElement.outerHTML;
46
+ const htmlLower = html.toLowerCase();
47
+ for (const selector of CHALLENGE_DOM_SELECTORS) {
48
+ if (htmlLower.includes(selector.toLowerCase())) {
49
+ signals.push(`Challenge element: ${selector}`);
50
+ type = "js_challenge";
51
+ }
52
+ }
53
+ for (const pattern of CHALLENGE_TEXT_PATTERNS) {
54
+ if (htmlLower.includes(pattern)) {
55
+ signals.push(`Challenge text: "${pattern}"`);
56
+ type = type === "none" ? "js_challenge" : type;
57
+ }
58
+ }
59
+ if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
60
+ signals.push('Challenge text: "waiting for...to respond"');
61
+ type = type === "none" ? "js_challenge" : type;
62
+ }
63
+ for (const pattern of BLOCKED_SIGNALS) {
64
+ if (htmlLower.includes(pattern)) {
65
+ signals.push(`Blocked: "${pattern}"`);
66
+ type = "blocked";
67
+ break;
68
+ }
69
+ }
70
+ const isChallenge = signals.length > 0;
71
+ const confidence = isChallenge ? 100 : 0;
72
+ return {
73
+ isChallenge,
74
+ type: isChallenge ? type : "none",
75
+ confidence,
76
+ signals
77
+ };
78
+ } catch (error) {
79
+ return {
80
+ isChallenge: false,
81
+ type: "none",
82
+ confidence: 0,
83
+ signals: [`Error during detection: ${error.message}`]
84
+ };
85
+ }
86
+ }
87
+
88
+ // src/cloudflare/handler.ts
89
+ async function waitForChallengeResolution(hero, options) {
90
+ const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
91
+ const startTime = Date.now();
92
+ const log = (msg) => verbose && console.log(` ${msg}`);
93
+ while (Date.now() - startTime < maxWaitMs) {
94
+ const elapsed = Date.now() - startTime;
95
+ try {
96
+ const currentUrl = await hero.url;
97
+ if (currentUrl !== initialUrl) {
98
+ log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
99
+ log(` Waiting for new page to load...`);
100
+ try {
101
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
102
+ log(` DOMContentLoaded`);
103
+ } catch {
104
+ log(` DOMContentLoaded timeout, continuing...`);
105
+ }
106
+ await hero.waitForPaintingStable().catch(() => {
107
+ });
108
+ log(` Page stabilized`);
109
+ return { resolved: true, method: "url_redirect", waitedMs: elapsed };
110
+ }
111
+ } catch {
112
+ }
113
+ const detection = await detectChallenge(hero);
114
+ if (!detection.isChallenge) {
115
+ log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
116
+ log(` Waiting for page to load...`);
117
+ try {
118
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
119
+ log(` DOMContentLoaded`);
120
+ } catch {
121
+ log(` DOMContentLoaded timeout, continuing...`);
122
+ }
123
+ await hero.waitForPaintingStable().catch(() => {
124
+ });
125
+ log(` Page stabilized`);
126
+ return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
127
+ }
128
+ log(
129
+ `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
130
+ );
131
+ await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
132
+ }
133
+ return {
134
+ resolved: false,
135
+ method: "timeout",
136
+ waitedMs: Date.now() - startTime
137
+ };
138
+ }
139
+
140
+ // src/formatters/markdown.ts
141
+ import TurndownService from "turndown";
142
+ var turndownService = new TurndownService({
143
+ headingStyle: "atx",
144
+ hr: "---",
145
+ bulletListMarker: "-",
146
+ codeBlockStyle: "fenced",
147
+ fence: "```",
148
+ emDelimiter: "*",
149
+ strongDelimiter: "**",
150
+ linkStyle: "inlined",
151
+ linkReferenceStyle: "full"
152
+ });
153
+ function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
154
+ const sections = [];
155
+ if (includeMetadata) {
156
+ sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
157
+ }
158
+ if (pages.length > 1) {
159
+ sections.push(createMarkdownTOC(pages));
160
+ }
161
+ sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
162
+ return sections.join("\n\n");
163
+ }
164
+ function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
165
+ const title = website.title || extractDomainFromUrl(baseUrl);
166
+ const description = website.description || "";
167
+ let header = `# Website Scrape: ${title}
168
+
169
+ `;
170
+ header += `**Base URL:** ${baseUrl}
171
+ `;
172
+ header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
173
+ `;
174
+ header += `**Duration:** ${duration}ms
175
+ `;
176
+ header += `**Total pages:** ${totalPages}
177
+ `;
178
+ if (description) {
179
+ header += `**Description:** ${description}
180
+ `;
181
+ }
182
+ if (website.author) {
183
+ header += `**Author:** ${website.author}
184
+ `;
185
+ }
186
+ if (website.language) {
187
+ header += `**Language:** ${website.language}
188
+ `;
189
+ }
190
+ return header;
191
+ }
192
+ function createMarkdownTOC(pages) {
193
+ let toc = "## Table of Contents\n\n";
194
+ pages.forEach((page, index) => {
195
+ const depth = " ".repeat(page.depth);
196
+ const pageNumber = index + 1;
197
+ const title = page.title || `Page ${pageNumber}`;
198
+ const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
199
+ const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
200
+ toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
201
+ `;
202
+ });
203
+ return toc;
204
+ }
205
+ function createMarkdownPage(page, pageNumber) {
206
+ const title = page.title || `Page ${pageNumber}`;
207
+ const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
208
+ const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
209
+ let pageContent = `---
210
+
211
+ `;
212
+ pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
213
+
214
+ `;
215
+ pageContent += `**URL:** ${page.url}
216
+ `;
217
+ pageContent += `**Title:** ${page.title}
218
+ `;
219
+ pageContent += `**Depth:** ${page.depth}
220
+ `;
221
+ pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
222
+
223
+ `;
224
+ pageContent += `---
225
+
226
+ `;
227
+ const markdown = htmlToMarkdown(page.html);
228
+ pageContent += markdown;
229
+ return pageContent;
230
+ }
231
+ function htmlToMarkdown(html) {
232
+ try {
233
+ return turndownService.turndown(html);
234
+ } catch (error) {
235
+ console.warn("Error converting HTML to Markdown:", error);
236
+ return html.replace(/<[^>]*>/g, "").trim();
237
+ }
238
+ }
239
+ function extractDomainFromUrl(url) {
240
+ try {
241
+ return new URL(url).hostname;
242
+ } catch {
243
+ return "Unknown";
244
+ }
245
+ }
246
+
247
+ // src/formatters/html.ts
248
+ function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
249
+ const html = `<!DOCTYPE html>
250
+ <html lang="${website.language || "en"}">
251
+ <head>
252
+ <meta charset="${website.charset || "UTF-8"}">
253
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
254
+ <title>Scrape: ${website.title || extractDomainFromUrl2(baseUrl)}</title>
255
+ ${generateMetaTags(website)}
256
+ <style>
257
+ ${generateCSS()}
258
+ </style>
259
+ </head>
260
+ <body>
261
+ <header class="header">
262
+ <h1>Website Scrape: ${escapeHtml(website.title || extractDomainFromUrl2(baseUrl))}</h1>
263
+ <div class="meta-info">
264
+ <p><strong>Base URL:</strong> <a href="${escapeHtml(
265
+ baseUrl
266
+ )}" target="_blank">${escapeHtml(baseUrl)}</a></p>
267
+ <p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
268
+ <p><strong>Duration:</strong> ${duration}ms</p>
269
+ <p><strong>Total pages:</strong> ${pages.length}</p>
270
+ ${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
271
+ ${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
272
+ ${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
273
+ </div>
274
+ </header>
275
+
276
+ ${pages.length > 1 ? generateTOC(pages) : ""}
277
+
278
+ <main class="content">
279
+ ${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
280
+ </main>
281
+
282
+ <footer class="footer">
283
+ <p>Generated by Reader JS/TS SDK</p>
284
+ </footer>
285
+
286
+ <script>
287
+ ${generateJavaScript()}
288
+ </script>
289
+ </body>
290
+ </html>`;
291
+ return html;
292
+ }
293
+ function generateMetaTags(website) {
294
+ const tags = [];
295
+ if (website.description) {
296
+ tags.push(`<meta name="description" content="${escapeHtml(website.description)}">`);
297
+ }
298
+ if (website.author) {
299
+ tags.push(`<meta name="author" content="${escapeHtml(website.author)}">`);
300
+ }
301
+ if (website.keywords) {
302
+ tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
303
+ }
304
+ if (website.robots) {
305
+ tags.push(`<meta name="robots" content="${escapeHtml(website.robots)}">`);
306
+ }
307
+ if (website.themeColor) {
308
+ tags.push(`<meta name="theme-color" content="${escapeHtml(website.themeColor)}">`);
309
+ }
310
+ if (website.favicon) {
311
+ tags.push(`<link rel="icon" href="${escapeHtml(website.favicon)}">`);
312
+ }
313
+ if (website.canonical) {
314
+ tags.push(`<link rel="canonical" href="${escapeHtml(website.canonical)}">`);
315
+ }
316
+ if (website.openGraph) {
317
+ const og = website.openGraph;
318
+ if (og.title) tags.push(`<meta property="og:title" content="${escapeHtml(og.title)}">`);
319
+ if (og.description)
320
+ tags.push(`<meta property="og:description" content="${escapeHtml(og.description)}">`);
321
+ if (og.type) tags.push(`<meta property="og:type" content="${escapeHtml(og.type)}">`);
322
+ if (og.url) tags.push(`<meta property="og:url" content="${escapeHtml(og.url)}">`);
323
+ if (og.image) tags.push(`<meta property="og:image" content="${escapeHtml(og.image)}">`);
324
+ if (og.siteName)
325
+ tags.push(`<meta property="og:site_name" content="${escapeHtml(og.siteName)}">`);
326
+ if (og.locale) tags.push(`<meta property="og:locale" content="${escapeHtml(og.locale)}">`);
327
+ }
328
+ if (website.twitter) {
329
+ const twitter = website.twitter;
330
+ if (twitter.card) tags.push(`<meta name="twitter:card" content="${escapeHtml(twitter.card)}">`);
331
+ if (twitter.site) tags.push(`<meta name="twitter:site" content="${escapeHtml(twitter.site)}">`);
332
+ if (twitter.creator)
333
+ tags.push(`<meta name="twitter:creator" content="${escapeHtml(twitter.creator)}">`);
334
+ if (twitter.title)
335
+ tags.push(`<meta name="twitter:title" content="${escapeHtml(twitter.title)}">`);
336
+ if (twitter.description)
337
+ tags.push(`<meta name="twitter:description" content="${escapeHtml(twitter.description)}">`);
338
+ if (twitter.image)
339
+ tags.push(`<meta name="twitter:image" content="${escapeHtml(twitter.image)}">`);
340
+ }
341
+ return tags.join("\n ");
342
+ }
343
+ function generateCSS() {
344
+ return `
345
+ * {
346
+ margin: 0;
347
+ padding: 0;
348
+ box-sizing: border-box;
349
+ }
350
+
351
+ body {
352
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
353
+ line-height: 1.6;
354
+ color: #333;
355
+ background-color: #f8f9fa;
356
+ }
357
+
358
+ .header {
359
+ background: white;
360
+ padding: 2rem;
361
+ border-bottom: 1px solid #e9ecef;
362
+ margin-bottom: 2rem;
363
+ }
364
+
365
+ .header h1 {
366
+ color: #2c3e50;
367
+ margin-bottom: 1rem;
368
+ font-size: 2rem;
369
+ }
370
+
371
+ .meta-info {
372
+ display: grid;
373
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
374
+ gap: 0.5rem;
375
+ }
376
+
377
+ .meta-info p {
378
+ margin: 0.25rem 0;
379
+ font-size: 0.9rem;
380
+ color: #6c757d;
381
+ }
382
+
383
+ .toc {
384
+ background: white;
385
+ padding: 1.5rem;
386
+ margin: 2rem 0;
387
+ border-radius: 8px;
388
+ border: 1px solid #e9ecef;
389
+ }
390
+
391
+ .toc h2 {
392
+ color: #2c3e50;
393
+ margin-bottom: 1rem;
394
+ font-size: 1.25rem;
395
+ }
396
+
397
+ .toc ul {
398
+ list-style: none;
399
+ }
400
+
401
+ .toc li {
402
+ margin: 0.5rem 0;
403
+ }
404
+
405
+ .toc a {
406
+ color: #007bff;
407
+ text-decoration: none;
408
+ transition: color 0.2s;
409
+ }
410
+
411
+ .toc a:hover {
412
+ color: #0056b3;
413
+ text-decoration: underline;
414
+ }
415
+
416
+ .content {
417
+ max-width: 800px;
418
+ margin: 0 auto;
419
+ padding: 0 1rem;
420
+ }
421
+
422
+ .page {
423
+ background: white;
424
+ margin: 2rem 0;
425
+ padding: 2rem;
426
+ border-radius: 8px;
427
+ border: 1px solid #e9ecef;
428
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
429
+ }
430
+
431
+ .page-header {
432
+ border-bottom: 2px solid #e9ecef;
433
+ padding-bottom: 1rem;
434
+ margin-bottom: 2rem;
435
+ }
436
+
437
+ .page-header h2 {
438
+ color: #2c3e50;
439
+ margin-bottom: 0.5rem;
440
+ font-size: 1.5rem;
441
+ }
442
+
443
+ .page-meta {
444
+ display: flex;
445
+ flex-wrap: wrap;
446
+ gap: 1rem;
447
+ font-size: 0.9rem;
448
+ color: #6c757d;
449
+ }
450
+
451
+ .page-content {
452
+ line-height: 1.8;
453
+ }
454
+
455
+ .page-content h1, .page-content h2, .page-content h3,
456
+ .page-content h4, .page-content h5, .page-content h6 {
457
+ color: #2c3e50;
458
+ margin: 1.5rem 0 0.5rem 0;
459
+ }
460
+
461
+ .page-content p {
462
+ margin: 1rem 0;
463
+ }
464
+
465
+ .page-content a {
466
+ color: #007bff;
467
+ text-decoration: none;
468
+ }
469
+
470
+ .page-content a:hover {
471
+ text-decoration: underline;
472
+ }
473
+
474
+ .page-content code {
475
+ background: #f8f9fa;
476
+ padding: 0.2rem 0.4rem;
477
+ border-radius: 4px;
478
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
479
+ font-size: 0.9em;
480
+ }
481
+
482
+ .page-content pre {
483
+ background: #f8f9fa;
484
+ padding: 1rem;
485
+ border-radius: 4px;
486
+ overflow-x: auto;
487
+ margin: 1rem 0;
488
+ }
489
+
490
+ .page-content blockquote {
491
+ border-left: 4px solid #007bff;
492
+ padding-left: 1rem;
493
+ margin: 1rem 0;
494
+ color: #6c757d;
495
+ }
496
+
497
+ .footer {
498
+ text-align: center;
499
+ padding: 2rem;
500
+ margin-top: 3rem;
501
+ border-top: 1px solid #e9ecef;
502
+ color: #6c757d;
503
+ font-size: 0.9rem;
504
+ }
505
+
506
+ @media (max-width: 768px) {
507
+ .header {
508
+ padding: 1rem;
509
+ }
510
+
511
+ .header h1 {
512
+ font-size: 1.5rem;
513
+ }
514
+
515
+ .page {
516
+ padding: 1rem;
517
+ }
518
+
519
+ .page-meta {
520
+ flex-direction: column;
521
+ gap: 0.5rem;
522
+ }
523
+ }
524
+ `.trim();
525
+ }
526
+ function generateTOC(pages) {
527
+ const tocItems = pages.map((page, index) => {
528
+ const pageNumber = index + 1;
529
+ const title = page.title || `Page ${pageNumber}`;
530
+ const id = `page-${pageNumber}`;
531
+ return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
532
+ }).join("\n");
533
+ return `
534
+ <nav class="toc">
535
+ <h2>Table of Contents</h2>
536
+ <ul>
537
+ ${tocItems}
538
+ </ul>
539
+ </nav>`;
540
+ }
541
+ function generatePageHTML(page, pageNumber) {
542
+ const id = `page-${pageNumber}`;
543
+ const title = page.title || `Page ${pageNumber}`;
544
+ return `
545
+ <article class="page" id="${id}">
546
+ <div class="page-header">
547
+ <h2>${pageNumber}. ${escapeHtml(title)}</h2>
548
+ <div class="page-meta">
549
+ <span><strong>URL:</strong> <a href="${escapeHtml(
550
+ page.url
551
+ )}" target="_blank">${escapeHtml(page.url)}</a></span>
552
+ <span><strong>Depth:</strong> ${page.depth}</span>
553
+ <span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
554
+ </div>
555
+ </div>
556
+ <div class="page-content">
557
+ ${page.html}
558
+ </div>
559
+ </article>`;
560
+ }
561
+ function generateJavaScript() {
562
+ return `
563
+ // Smooth scrolling for TOC links
564
+ document.querySelectorAll('a[href^="#"]').forEach(anchor => {
565
+ anchor.addEventListener('click', function (e) {
566
+ e.preventDefault();
567
+ const target = document.querySelector(this.getAttribute('href'));
568
+ if (target) {
569
+ target.scrollIntoView({
570
+ behavior: 'smooth',
571
+ block: 'start'
572
+ });
573
+ }
574
+ });
575
+ });
576
+
577
+ // Highlight current section in TOC
578
+ window.addEventListener('scroll', function() {
579
+ const pages = document.querySelectorAll('.page');
580
+ const tocLinks = document.querySelectorAll('.toc a');
581
+
582
+ let currentPage = null;
583
+ pages.forEach(page => {
584
+ const rect = page.getBoundingClientRect();
585
+ if (rect.top <= 100) {
586
+ currentPage = page;
587
+ }
588
+ });
589
+
590
+ tocLinks.forEach(link => {
591
+ link.style.fontWeight = 'normal';
592
+ const target = document.querySelector(link.getAttribute('href'));
593
+ if (target === currentPage) {
594
+ link.style.fontWeight = 'bold';
595
+ }
596
+ });
597
+ });
598
+ `;
599
+ }
600
+ function escapeHtml(text) {
601
+ return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\//g, "&#x2F;");
602
+ }
603
+ function extractDomainFromUrl2(url) {
604
+ try {
605
+ return new URL(url).hostname;
606
+ } catch {
607
+ return "Unknown";
608
+ }
609
+ }
610
+
611
+ // src/formatters/json.ts
612
+ function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
613
+ const jsonResult = {
614
+ metadata: {
615
+ baseUrl,
616
+ totalPages: pages.length,
617
+ scrapedAt,
618
+ duration,
619
+ website
620
+ },
621
+ pages: pages.map((page, index) => ({
622
+ index: index + 1,
623
+ url: page.url,
624
+ title: page.title,
625
+ markdown: page.markdown,
626
+ html: page.html,
627
+ fetchedAt: page.fetchedAt,
628
+ depth: page.depth,
629
+ wordCount: countWords(page.markdown),
630
+ readingTime: estimateReadingTime(page.markdown)
631
+ }))
632
+ };
633
+ return JSON.stringify(jsonResult, null, 2);
634
+ }
635
+ function countWords(markdown) {
636
+ const plainText = markdown.replace(/#{1,6}\s+/g, "").replace(/\*\*(.*?)\*\*/g, "$1").replace(/\*(.*?)\*/g, "$1").replace(/`(.*?)`/g, "$1").replace(/```[\s\S]*?```/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1").replace(/^\s*[-*+]\s+/gm, "").replace(/^\s*\d+\.\s+/gm, "").replace(/^\s*>\s+/gm, "").replace(/\n{3,}/g, "\n\n").trim();
637
+ return plainText.split(/\s+/).filter((word) => word.length > 0).length;
638
+ }
639
+ function estimateReadingTime(markdown) {
640
+ const wordCount = countWords(markdown);
641
+ return Math.ceil(wordCount / 200);
642
+ }
643
+
644
+ // src/formatters/text.ts
645
+ import { parseHTML } from "linkedom";
646
+ function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
647
+ const sections = [];
648
+ if (includeMetadata) {
649
+ sections.push(createTextHeader(baseUrl, scrapedAt, duration, website, pages.length));
650
+ }
651
+ sections.push(...pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1)));
652
+ return sections.join("\n\n");
653
+ }
654
+ function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
655
+ const title = website.title || extractDomainFromUrl3(baseUrl);
656
+ const lines = [];
657
+ lines.push(`=== ${title} ===`);
658
+ lines.push("");
659
+ lines.push(`URL: ${baseUrl}`);
660
+ lines.push(`Scraped: ${new Date(scrapedAt).toLocaleString()}`);
661
+ lines.push(`Duration: ${duration}ms`);
662
+ lines.push(`Pages: ${totalPages}`);
663
+ if (website.description) {
664
+ lines.push(`Description: ${website.description}`);
665
+ }
666
+ if (website.author) {
667
+ lines.push(`Author: ${website.author}`);
668
+ }
669
+ if (website.language) {
670
+ lines.push(`Language: ${website.language}`);
671
+ }
672
+ return lines.join("\n");
673
+ }
674
+ function createTextPage(page, pageNumber, showSeparator) {
675
+ const lines = [];
676
+ if (showSeparator) {
677
+ lines.push("\u2500".repeat(60));
678
+ lines.push(`Page ${pageNumber}: ${page.title || "Untitled"}`);
679
+ lines.push(`URL: ${page.url}`);
680
+ lines.push("\u2500".repeat(60));
681
+ }
682
+ const plainText = htmlToPlainText(page.html);
683
+ lines.push(plainText);
684
+ return lines.join("\n");
685
+ }
686
+ function htmlToPlainText(html) {
687
+ const { document } = parseHTML(html);
688
+ const elementsToRemove = ["script", "style", "noscript", "svg", "canvas", "template"];
689
+ elementsToRemove.forEach((tag) => {
690
+ document.querySelectorAll(tag).forEach((el) => el.remove());
691
+ });
692
+ let text = document.body?.textContent || document.documentElement?.textContent || "";
693
+ text = text.replace(/[ \t]+/g, " ");
694
+ text = text.replace(/\n[ \t]+/g, "\n");
695
+ text = text.replace(/[ \t]+\n/g, "\n");
696
+ text = text.replace(/\n{3,}/g, "\n\n");
697
+ text = text.trim();
698
+ return text;
699
+ }
700
+ function extractDomainFromUrl3(url) {
701
+ try {
702
+ return new URL(url).hostname;
703
+ } catch {
704
+ return "Unknown";
705
+ }
706
+ }
707
+
708
+ // src/utils/content-cleaner.ts
709
+ import { parseHTML as parseHTML2 } from "linkedom";
710
+ var ALWAYS_REMOVE_SELECTORS = [
711
+ // Navigation and menus
712
+ "nav",
713
+ "header nav",
714
+ "footer nav",
715
+ ".nav",
716
+ ".navigation",
717
+ ".menu",
718
+ ".navbar",
719
+ ".sidebar",
720
+ ".aside",
721
+ // Header and footer elements
722
+ "header",
723
+ "footer",
724
+ ".site-header",
725
+ ".page-header",
726
+ ".site-footer",
727
+ ".page-footer",
728
+ // Social media and sharing
729
+ ".social",
730
+ ".share",
731
+ ".sharing",
732
+ ".twitter",
733
+ ".facebook",
734
+ ".linkedin",
735
+ ".instagram",
736
+ // Comments and discussions
737
+ ".comments",
738
+ ".comment",
739
+ ".discussion",
740
+ ".disqus",
741
+ // Forms and interactive elements
742
+ "form",
743
+ "input",
744
+ "button:not([type='submit'])",
745
+ "select",
746
+ "textarea",
747
+ // Scripts and styles
748
+ "script",
749
+ "style",
750
+ "noscript",
751
+ // Hidden elements
752
+ "[hidden]",
753
+ "[style*='display: none']",
754
+ "[style*='display:none']",
755
+ // Common utility classes
756
+ ".cookie",
757
+ ".cookie-banner",
758
+ ".popup",
759
+ ".modal",
760
+ ".overlay",
761
+ ".notification",
762
+ // Breadcrumbs
763
+ ".breadcrumb",
764
+ ".breadcrumbs",
765
+ ".breadcrumb-trail"
766
+ ];
767
+ var AD_SELECTORS = [
768
+ // Ads and promotions
769
+ ".ad",
770
+ ".ads",
771
+ ".advertisement",
772
+ ".promotion",
773
+ ".sponsored",
774
+ "[class*='ad-']",
775
+ "[id*='ad-']",
776
+ "[class*='advert']",
777
+ "[id*='advert']",
778
+ "[class*='banner']",
779
+ "[id*='banner']",
780
+ ".google-ad",
781
+ ".adsense",
782
+ "[data-ad]",
783
+ "[data-ads]",
784
+ "ins.adsbygoogle",
785
+ // Tracking
786
+ "[class*='tracking']",
787
+ "[id*='tracking']",
788
+ "[class*='analytics']",
789
+ "[id*='analytics']"
790
+ ];
791
+ function cleanHtml(html, baseUrl, options = {}) {
792
+ const { removeAds = true, removeBase64Images = true } = options;
793
+ const { document } = parseHTML2(html);
794
+ for (const selector of ALWAYS_REMOVE_SELECTORS) {
795
+ try {
796
+ document.querySelectorAll(selector).forEach((el) => el.remove());
797
+ } catch {
798
+ }
799
+ }
800
+ if (removeAds) {
801
+ for (const selector of AD_SELECTORS) {
802
+ try {
803
+ document.querySelectorAll(selector).forEach((el) => el.remove());
804
+ } catch {
805
+ }
806
+ }
807
+ }
808
+ if (removeBase64Images) {
809
+ removeBase64ImagesFromDocument(document);
810
+ }
811
+ const walker = document.createTreeWalker(
812
+ document,
813
+ 128
814
+ /* NodeFilter.SHOW_COMMENT */
815
+ );
816
+ const comments = [];
817
+ while (walker.nextNode()) {
818
+ comments.push(walker.currentNode);
819
+ }
820
+ comments.forEach((comment) => comment.parentNode?.removeChild(comment));
821
+ convertRelativeUrls(document, baseUrl);
822
+ return document.documentElement?.outerHTML || html;
823
+ }
824
+ function removeBase64ImagesFromDocument(document) {
825
+ document.querySelectorAll("img[src^='data:']").forEach((el) => {
826
+ el.remove();
827
+ });
828
+ document.querySelectorAll("[style*='data:image']").forEach((el) => {
829
+ const style = el.getAttribute("style");
830
+ if (style) {
831
+ const cleanedStyle = style.replace(/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi, "");
832
+ if (cleanedStyle.trim()) {
833
+ el.setAttribute("style", cleanedStyle);
834
+ } else {
835
+ el.removeAttribute("style");
836
+ }
837
+ }
838
+ });
839
+ document.querySelectorAll("source[src^='data:'], source[srcset*='data:']").forEach((el) => {
840
+ el.remove();
841
+ });
842
+ }
843
+ function convertRelativeUrls(document, baseUrl) {
844
+ document.querySelectorAll("[src]").forEach((el) => {
845
+ const src = el.getAttribute("src");
846
+ if (src && !src.startsWith("http") && !src.startsWith("//") && !src.startsWith("data:")) {
847
+ try {
848
+ el.setAttribute("src", new URL(src, baseUrl).toString());
849
+ } catch {
850
+ }
851
+ }
852
+ });
853
+ document.querySelectorAll("[href]").forEach((el) => {
854
+ const href = el.getAttribute("href");
855
+ if (href && !href.startsWith("http") && !href.startsWith("//") && !href.startsWith("#") && !href.startsWith("mailto:") && !href.startsWith("tel:") && !href.startsWith("javascript:")) {
856
+ try {
857
+ el.setAttribute("href", new URL(href, baseUrl).toString());
858
+ } catch {
859
+ }
860
+ }
861
+ });
862
+ }
863
+ function cleanContent(html, baseUrl, options = {}) {
864
+ return cleanHtml(html, baseUrl, options);
865
+ }
866
+
867
+ // src/utils/metadata-extractor.ts
868
+ import { parseHTML as parseHTML3 } from "linkedom";
869
+
870
+ // src/utils/url-helpers.ts
871
+ import { URL as URL2 } from "url";
872
+ import RE2 from "re2";
873
+ function resolveUrl(relative, base) {
874
+ try {
875
+ return new URL2(relative, base).toString();
876
+ } catch {
877
+ return relative;
878
+ }
879
+ }
880
+ function isValidUrl(string) {
881
+ try {
882
+ new URL2(string);
883
+ return true;
884
+ } catch {
885
+ return false;
886
+ }
887
+ }
888
+ function normalizeUrl(url, baseUrl) {
889
+ try {
890
+ let parsedUrl;
891
+ if (url.startsWith("http://") || url.startsWith("https://")) {
892
+ parsedUrl = new URL2(url);
893
+ } else if (baseUrl) {
894
+ parsedUrl = new URL2(url, baseUrl);
895
+ } else {
896
+ throw new Error("Relative URL requires base URL");
897
+ }
898
+ parsedUrl.hash = "";
899
+ return parsedUrl.toString();
900
+ } catch {
901
+ throw new Error(`Invalid URL: ${url}`);
902
+ }
903
+ }
904
+ function extractBaseDomain(url) {
905
+ try {
906
+ const parsedUrl = new URL2(url);
907
+ return parsedUrl.hostname;
908
+ } catch {
909
+ throw new Error(`Invalid URL for domain extraction: ${url}`);
910
+ }
911
+ }
912
+ function getRootDomain(hostname) {
913
+ const parts = hostname.split(".");
914
+ if (parts.length <= 2) {
915
+ return hostname;
916
+ }
917
+ const twoPartTLDs = ["co.uk", "com.au", "co.nz", "com.br", "co.jp", "co.kr", "com.mx", "org.uk"];
918
+ const lastTwo = parts.slice(-2).join(".");
919
+ if (twoPartTLDs.includes(lastTwo)) {
920
+ return parts.slice(-3).join(".");
921
+ }
922
+ return parts.slice(-2).join(".");
923
+ }
924
+ function isSameDomain(url, baseUrl) {
925
+ try {
926
+ const urlDomain = extractBaseDomain(url);
927
+ const baseDomain = extractBaseDomain(baseUrl);
928
+ if (urlDomain === baseDomain) {
929
+ return true;
930
+ }
931
+ const urlRoot = getRootDomain(urlDomain);
932
+ const baseRoot = getRootDomain(baseDomain);
933
+ return urlRoot === baseRoot;
934
+ } catch {
935
+ return false;
936
+ }
937
+ }
938
+ function getUrlKey(url) {
939
+ try {
940
+ const parsedUrl = new URL2(url);
941
+ parsedUrl.search = "";
942
+ return parsedUrl.toString().toLowerCase();
943
+ } catch {
944
+ return url.toLowerCase();
945
+ }
946
+ }
947
+ function matchesPatterns(url, patterns) {
948
+ if (!patterns || patterns.length === 0) {
949
+ return false;
950
+ }
951
+ return patterns.some((pattern) => {
952
+ try {
953
+ const regex = new RE2(pattern, "i");
954
+ return regex.test(url);
955
+ } catch {
956
+ return false;
957
+ }
958
+ });
959
+ }
960
+ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
961
+ if (includePatterns && includePatterns.length > 0) {
962
+ if (!matchesPatterns(url, includePatterns)) {
963
+ return false;
964
+ }
965
+ }
966
+ if (excludePatterns && excludePatterns.length > 0) {
967
+ if (matchesPatterns(url, excludePatterns)) {
968
+ return false;
969
+ }
970
+ }
971
+ return true;
972
+ }
973
+ function isContentUrl(url) {
974
+ const lowerUrl = url.toLowerCase();
975
+ const nonContentPatterns = [
976
+ // Legal and policy pages
977
+ /\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\b/i,
978
+ /\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\b/i,
979
+ /\/(cookie-policy|data-protection|acceptable-use|user-agreement)\b/i,
980
+ /\/(refund|cancellation|shipping|return)-?(policy)?\b/i,
981
+ // Contact and support pages (usually not main content)
982
+ /\/(contact|support|help|faq|feedback)\/?$/i,
983
+ // About pages that are typically boilerplate
984
+ /\/(about-us|careers|jobs|press|investors|team)\/?$/i,
985
+ // Authentication and admin areas
986
+ /\/(admin|login|auth|account|dashboard|profile|settings)\//i,
987
+ // E-commerce utility pages
988
+ /\/(cart|checkout|payment|subscription|wishlist)\//i,
989
+ // File downloads and assets
990
+ /\/(uploads|assets|files|static|media|resources)\//i,
991
+ // API endpoints
992
+ /\/(api|graphql|rest|webhook)\//i
993
+ ];
994
+ if (nonContentPatterns.some((pattern) => pattern.test(lowerUrl))) {
995
+ return false;
996
+ }
997
+ const skipExtensions = [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".exe"];
998
+ if (skipExtensions.some((ext) => lowerUrl.endsWith(ext))) {
999
+ return false;
1000
+ }
1001
+ return true;
1002
+ }
1003
+
1004
+ // src/utils/metadata-extractor.ts
1005
+ function extractMetadata(html, baseUrl) {
1006
+ return extractWebsiteMetadata(html, baseUrl);
1007
+ }
1008
/**
 * Parse an HTML document once and collect all page-level metadata:
 * standard meta tags, <link> elements, Open Graph and Twitter Card data.
 * Every field is null when the corresponding tag is absent.
 */
function extractWebsiteMetadata(html, baseUrl) {
  const { document } = parseHTML3(html);
  // Each field is pulled by a dedicated helper against the parsed document.
  return {
    title: extractTitle(document),
    description: extractMetaContent(document, "description"),
    author: extractMetaContent(document, "author"),
    language: extractLanguage(document),
    charset: extractCharset(document),
    favicon: extractFavicon(document, baseUrl),
    canonical: extractCanonical(document, baseUrl),
    // Prefer the Open Graph image, fall back to the Twitter Card one.
    image: extractMetaContent(document, "og:image") || extractMetaContent(document, "twitter:image"),
    keywords: extractKeywords(document),
    robots: extractMetaContent(document, "robots"),
    themeColor: extractMetaContent(document, "theme-color"),
    openGraph: extractOpenGraph(document),
    twitter: extractTwitterCard(document)
  };
}
1040
/**
 * Page title: prefer the <title> element's text, falling back to og:title.
 * Returns null when neither is present.
 */
function extractTitle(document) {
  const text = document.querySelector("title")?.textContent;
  if (text) {
    return text.trim();
  }
  return extractMetaContent(document, "og:title");
}
1047
+ function extractMetaContent(document, name) {
1048
+ const byName = document.querySelector(`meta[name="${name}"]`);
1049
+ if (byName) {
1050
+ const content = byName.getAttribute("content");
1051
+ if (content) return content.trim();
1052
+ }
1053
+ const byProperty = document.querySelector(`meta[property="${name}"]`);
1054
+ if (byProperty) {
1055
+ const content = byProperty.getAttribute("content");
1056
+ if (content) return content.trim();
1057
+ }
1058
+ return null;
1059
+ }
1060
+ function extractLanguage(document) {
1061
+ const lang = document.documentElement?.getAttribute("lang");
1062
+ return lang?.trim() || null;
1063
+ }
1064
+ function extractCharset(document) {
1065
+ const charsetMeta = document.querySelector("meta[charset]");
1066
+ if (charsetMeta) {
1067
+ const charset = charsetMeta.getAttribute("charset");
1068
+ if (charset) return charset.trim();
1069
+ }
1070
+ const httpEquivMeta = document.querySelector('meta[http-equiv="Content-Type"]');
1071
+ if (httpEquivMeta) {
1072
+ const content = httpEquivMeta.getAttribute("content");
1073
+ if (content) {
1074
+ const charsetMatch = content.match(/charset=([^\s;]+)/i);
1075
+ if (charsetMatch) return charsetMatch[1].trim();
1076
+ }
1077
+ }
1078
+ return null;
1079
+ }
1080
/**
 * Favicon URL, absolutized against baseUrl. Tries the common <link rel>
 * variants in order of specificity; when none declares an icon, falls back
 * to the conventional /favicon.ico location (null if baseUrl is invalid).
 */
function extractFavicon(document, baseUrl) {
  const iconSelectors = [
    'link[rel="icon"]',
    'link[rel="shortcut icon"]',
    'link[rel="apple-touch-icon"]',
    'link[rel*="icon"]'
  ];
  for (const selector of iconSelectors) {
    const href = document.querySelector(selector)?.getAttribute("href");
    if (href) {
      return normalizeUrl(href, baseUrl);
    }
  }
  try {
    return normalizeUrl("/favicon.ico", baseUrl);
  } catch {
    return null;
  }
}
1102
/**
 * Canonical URL from <link rel="canonical">, absolutized against baseUrl;
 * null when the page declares none.
 */
function extractCanonical(document, baseUrl) {
  const href = document.querySelector('link[rel="canonical"]')?.getAttribute("href");
  return href ? normalizeUrl(href, baseUrl) : null;
}
1112
/**
 * Keywords from the comma-separated keywords meta tag as a trimmed,
 * empty-entry-free array; null when the tag is absent.
 */
function extractKeywords(document) {
  const raw = extractMetaContent(document, "keywords");
  if (!raw) {
    return null;
  }
  const keywords = [];
  for (const part of raw.split(",")) {
    const keyword = part.trim();
    if (keyword.length > 0) {
      keywords.push(keyword);
    }
  }
  return keywords;
}
1119
/**
 * Open Graph metadata (og:* meta tags). Returns null rather than an object
 * of all-null fields when the page declares no Open Graph data.
 */
function extractOpenGraph(document) {
  // Result key -> og:* meta property it is read from.
  const propertyByKey = {
    title: "og:title",
    description: "og:description",
    type: "og:type",
    url: "og:url",
    image: "og:image",
    siteName: "og:site_name",
    locale: "og:locale"
  };
  const openGraph = {};
  let hasValue = false;
  for (const [key, property] of Object.entries(propertyByKey)) {
    const value = extractMetaContent(document, property);
    openGraph[key] = value;
    if (value) {
      hasValue = true;
    }
  }
  return hasValue ? openGraph : null;
}
1141
/**
 * Twitter Card metadata (twitter:* meta tags). Returns null rather than an
 * object of all-null fields when the page declares no Twitter Card data.
 */
function extractTwitterCard(document) {
  // Result key -> twitter:* meta property it is read from.
  const propertyByKey = {
    card: "twitter:card",
    site: "twitter:site",
    creator: "twitter:creator",
    title: "twitter:title",
    description: "twitter:description",
    image: "twitter:image"
  };
  const twitter = {};
  let hasValue = false;
  for (const [key, property] of Object.entries(propertyByKey)) {
    const value = extractMetaContent(document, property);
    twitter[key] = value;
    if (value) {
      hasValue = true;
    }
  }
  return hasValue ? twitter : null;
}
1161
+
1162
+ // src/utils/logger.ts
1163
+ import pino from "pino";
1164
/**
 * Create a named pino logger. Level defaults to LOG_LEVEL or "info".
 * Outside production the output is routed through pino-pretty for
 * colorized, human-readable logs; in production raw JSON is emitted.
 */
function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
  const prettyTransport = {
    target: "pino-pretty",
    options: {
      colorize: true,
      translateTime: "SYS:standard",
      ignore: "pid,hostname"
    }
  };
  return pino({
    name,
    level,
    transport: process.env.NODE_ENV !== "production" ? prettyTransport : void 0
  });
}
// Shared default logger for modules that don't create their own.
var logger = createLogger();
1179
+
1180
+ // src/utils/robots-parser.ts
1181
/**
 * Parse a robots.txt document into allow/disallow path lists and an optional
 * crawl delay (converted from seconds to milliseconds). Only directive groups
 * whose User-agent is "*" or matches `userAgent` (case-insensitive) apply.
 */
function parseRobotsTxt(content, userAgent = "*") {
  const rules = {
    disallowedPaths: [],
    allowedPaths: [],
    crawlDelay: null
  };
  // Tracks whether the most recent User-agent line selects us.
  let applies = false;
  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    // Skip blanks and comments.
    if (!line || line.startsWith("#")) continue;
    const sep = line.indexOf(":");
    if (sep < 0) continue;
    const directive = line.slice(0, sep).trim().toLowerCase();
    const value = line.slice(sep + 1).trim();
    if (directive === "user-agent") {
      const agent = value.toLowerCase();
      applies = agent === "*" || agent === userAgent.toLowerCase();
      continue;
    }
    if (!applies) continue;
    if (directive === "disallow" && value) {
      rules.disallowedPaths.push(value);
    } else if (directive === "allow" && value) {
      rules.allowedPaths.push(value);
    } else if (directive === "crawl-delay") {
      const seconds = Number.parseFloat(value);
      if (!Number.isNaN(seconds)) {
        rules.crawlDelay = seconds * 1e3; // store in ms
      }
    }
  }
  return rules;
}
1218
+ function isPathAllowed(path, rules) {
1219
+ const normalizedPath = path.startsWith("/") ? path : "/" + path;
1220
+ for (const allowedPath of rules.allowedPaths) {
1221
+ if (pathMatches(normalizedPath, allowedPath)) {
1222
+ return true;
1223
+ }
1224
+ }
1225
+ for (const disallowedPath of rules.disallowedPaths) {
1226
+ if (pathMatches(normalizedPath, disallowedPath)) {
1227
+ return false;
1228
+ }
1229
+ }
1230
+ return true;
1231
+ }
1232
+ function pathMatches(path, pattern) {
1233
+ if (!pattern) {
1234
+ return false;
1235
+ }
1236
+ let regexPattern = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1237
+ if (regexPattern.endsWith("\\$")) {
1238
+ regexPattern = regexPattern.slice(0, -2) + "$";
1239
+ } else {
1240
+ regexPattern = "^" + regexPattern;
1241
+ }
1242
+ try {
1243
+ const regex = new RegExp(regexPattern);
1244
+ return regex.test(path);
1245
+ } catch {
1246
+ return path.startsWith(pattern);
1247
+ }
1248
+ }
1249
/**
 * Fetch and parse /robots.txt for the given origin. Best-effort: any
 * network, HTTP, or URL failure yields null ("no rules") rather than an
 * exception, so callers can treat null as "everything allowed".
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl).toString();
    const response = await fetch(robotsUrl, {
      headers: {
        "User-Agent": "ReaderEngine/1.0"
      }
    });
    if (!response.ok) {
      return null;
    }
    const body = await response.text();
    return parseRobotsTxt(body, "ReaderEngine");
  } catch {
    return null;
  }
}
1266
/**
 * Check a full URL against robots.txt rules. Null rules (robots.txt
 * missing/unfetchable) and unparseable URLs are both treated as allowed —
 * later stages are responsible for rejecting invalid URLs.
 */
function isUrlAllowed(url, rules) {
  if (!rules) {
    return true;
  }
  try {
    const { pathname, search } = new URL(url);
    return isPathAllowed(pathname + search, rules);
  } catch {
    return true;
  }
}
1277
+
1278
+ // src/types.ts
1279
// Default scrape options; caller-supplied options are spread over these.
var DEFAULT_OPTIONS = {
  urls: [],
  formats: ["markdown"],
  includeMetadata: true,
  timeoutMs: 3e4,
  // 30 s per-page navigation/load timeout
  includePatterns: [],
  excludePatterns: [],
  // Content cleaning defaults
  removeAds: true,
  removeBase64Images: true,
  skipTLSVerification: true,
  // Batch defaults
  batchConcurrency: 1,
  batchTimeoutMs: 3e5,
  // 5 min cap for the whole batch
  maxRetries: 2,
  onProgress: () => {
  },
  // Default no-op progress callback
  // Hero-specific defaults
  verbose: false,
  showChrome: false
};
1301
+
1302
+ // src/scraper.ts
1303
/**
 * Scrapes a batch of URLs through a shared Hero browser pool, handling
 * Cloudflare challenge detection/resolution, robots.txt checks, content
 * cleaning, and output formatting (markdown/html/json/text).
 */
var Scraper = class {
  options;
  pool;
  logger = createLogger("scraper");
  // Per-origin cache of parsed robots.txt rules (null = none / unfetchable).
  robotsCache = /* @__PURE__ */ new Map();
  constructor(options) {
    this.options = {
      ...DEFAULT_OPTIONS,
      ...options
    };
    if (!options.pool) {
      throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
    }
    this.pool = options.pool;
  }
  /**
   * Get robots.txt rules for a URL, cached per domain (origin).
   * Caches null results too, so an unreachable robots.txt is only tried once.
   */
  async getRobotsRules(url) {
    const origin = new URL(url).origin;
    if (!this.robotsCache.has(origin)) {
      const rules = await fetchRobotsTxt(origin);
      this.robotsCache.set(origin, rules);
    }
    return this.robotsCache.get(origin) ?? null;
  }
  /**
   * Scrape all URLs
   *
   * @returns Scrape result with pages and metadata
   */
  async scrape() {
    const startTime = Date.now();
    const results = await this.scrapeWithConcurrency();
    return this.buildScrapeResult(results, startTime);
  }
  /**
   * Scrape URLs with concurrency control (p-limit) and an optional
   * overall batch timeout raced against the combined task promise.
   *
   * NOTE(review): the timeout timer is never cleared when the batch wins the
   * race, so it can keep the process alive up to batchTimeoutMs — consider
   * clearTimeout/unref.
   */
  async scrapeWithConcurrency() {
    const limit = pLimit(this.options.batchConcurrency || 1);
    const tasks = this.options.urls.map(
      (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
    );
    const batchPromise = Promise.all(tasks);
    if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
      const timeoutPromise = new Promise((_, reject) => {
        setTimeout(() => {
          reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
        }, this.options.batchTimeoutMs);
      });
      return Promise.race([batchPromise, timeoutPromise]);
    }
    return batchPromise;
  }
  /**
   * Scrape a single URL with retry logic: up to maxRetries + 1 attempts with
   * exponential backoff (1s, 2s, 4s, ...). Returns { result } on success or
   * { result: null, error } after exhausting attempts.
   */
  async scrapeSingleUrlWithRetry(url, index) {
    const maxRetries = this.options.maxRetries || 2;
    let lastError;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        const result = await this.scrapeSingleUrl(url, index);
        if (result) {
          return { result };
        }
        // scrapeSingleUrl swallows its own errors and returns null.
        lastError = `Failed to scrape ${url}: No content returned`;
      } catch (error) {
        lastError = error.message;
        if (attempt < maxRetries) {
          // Exponential backoff before the next attempt.
          const delay = Math.pow(2, attempt) * 1e3;
          this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }
    this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
    return { result: null, error: lastError };
  }
  /**
   * Wait for the final page to load after any Cloudflare redirects.
   * Cloudflare often does silent redirects even when bypassed; we need to
   * ensure we're on the actual content page before scraping. Waits for the
   * URL to stay stable across consecutive polls, then for painting to settle.
   */
  async waitForFinalPage(hero, originalUrl, verbose) {
    const maxWaitMs = 15e3;
    const startTime = Date.now();
    const log = (msg) => verbose && this.logger.info(msg);
    try {
      await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
    } catch {
      // Load may time out on slow pages; we still continue with polling below.
    }
    let currentUrl = await hero.url;
    // Compare URLs ignoring trailing slashes.
    const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
    const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
    if (urlChanged || currentUrl.includes("__cf_chl")) {
      log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
      let lastUrl = currentUrl;
      let stableCount = 0;
      // Poll every 500ms until the URL has been unchanged for 2 polls in a row.
      while (Date.now() - startTime < maxWaitMs) {
        await new Promise((resolve) => setTimeout(resolve, 500));
        try {
          currentUrl = await hero.url;
          if (currentUrl === lastUrl) {
            stableCount++;
            if (stableCount >= 2) {
              break;
            }
          } else {
            stableCount = 0;
            lastUrl = currentUrl;
            log(`URL changed to: ${currentUrl}`);
          }
        } catch {
          // hero.url can throw mid-navigation; just poll again.
        }
      }
      try {
        await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
      } catch {
      }
    }
    await hero.waitForPaintingStable();
    // Final grace period for late JS-rendered content.
    await new Promise((resolve) => setTimeout(resolve, 2e3));
  }
  /**
   * Scrape a single URL: robots.txt gate, navigation, challenge handling,
   * optional selector wait, content cleaning, and output formatting.
   * Returns the per-URL result object, or null on any error (logged).
   */
  async scrapeSingleUrl(url, index) {
    const startTime = Date.now();
    const robotsRules = await this.getRobotsRules(url);
    if (!isUrlAllowed(url, robotsRules)) {
      throw new Error(`URL blocked by robots.txt: ${url}`);
    }
    try {
      return await this.pool.withBrowser(async (hero) => {
        await hero.goto(url, { timeoutMs: this.options.timeoutMs });
        try {
          await hero.waitForLoad("DomContentLoaded", { timeoutMs: this.options.timeoutMs });
        } catch {
          // Best effort; painting-stable below is the real gate.
        }
        await hero.waitForPaintingStable();
        let hadChallenge = false;
        let challengeType = "none";
        let waitTimeMs = 0;
        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          hadChallenge = true;
          challengeType = detection.type;
          if (this.options.verbose) {
            this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
          }
          // Block until the anti-bot challenge resolves (or give up at 45s).
          const result2 = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: this.options.verbose,
            initialUrl
          });
          waitTimeMs = result2.waitedMs;
          if (!result2.resolved) {
            throw new Error(`Challenge not resolved: ${detection.type}`);
          }
          if (this.options.verbose) {
            this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
          }
        }
        await this.waitForFinalPage(hero, url, this.options.verbose);
        if (this.options.waitForSelector) {
          try {
            await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
              timeoutMs: this.options.timeoutMs
            });
          } catch (error) {
            // Missing selector is non-fatal; scrape whatever rendered.
            this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
          }
        }
        const pageTitle = await hero.document.title;
        const html = await hero.document.documentElement.outerHTML;
        const cleanedHtml = cleanContent(html, url, {
          removeAds: this.options.removeAds,
          removeBase64Images: this.options.removeBase64Images
        });
        const websiteMetadata = extractMetadata(cleanedHtml, url);
        const duration = Date.now() - startTime;
        const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
        const page = {
          url,
          title: pageTitle,
          markdown: "",
          // Will be set by formatter
          html: cleanedHtml,
          fetchedAt: scrapedAt,
          depth: 0,
          hadChallenge,
          challengeType,
          waitTimeMs
        };
        // Only produce the formats the caller asked for.
        const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
          [page],
          url,
          scrapedAt,
          duration,
          websiteMetadata,
          this.options.includeMetadata
        ) : void 0;
        const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
        const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
        const text = this.options.formats.includes("text") ? formatToText(
          [page],
          url,
          scrapedAt,
          duration,
          websiteMetadata,
          this.options.includeMetadata
        ) : void 0;
        if (this.options.onProgress) {
          this.options.onProgress({
            completed: index + 1,
            total: this.options.urls.length,
            currentUrl: url
          });
        }
        // Surface which proxy was used (from URL form or host/port form).
        let proxyMetadata;
        if (this.options.proxy) {
          const proxy = this.options.proxy;
          if (proxy.url) {
            try {
              const proxyUrl = new URL(proxy.url);
              proxyMetadata = {
                host: proxyUrl.hostname,
                port: parseInt(proxyUrl.port, 10) || 80,
                country: proxy.country
              };
            } catch {
              // Unparseable proxy URL: omit proxy metadata.
            }
          } else if (proxy.host && proxy.port) {
            proxyMetadata = {
              host: proxy.host,
              port: proxy.port,
              country: proxy.country
            };
          }
        }
        const result = {
          markdown,
          html: htmlOutput,
          json,
          text,
          metadata: {
            baseUrl: url,
            totalPages: 1,
            scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
            duration,
            website: websiteMetadata,
            proxy: proxyMetadata
          }
        };
        return result;
      });
    } catch (error) {
      this.logger.error(`Failed to scrape ${url}: ${error.message}`);
      // Progress still advances on failure so totals remain consistent.
      if (this.options.onProgress) {
        this.options.onProgress({
          completed: index + 1,
          total: this.options.urls.length,
          currentUrl: url
        });
      }
      return null;
    }
  }
  /**
   * Build final scrape result: partition per-URL outcomes into successes and
   * errors and attach batch-level metadata.
   */
  buildScrapeResult(results, startTime) {
    const successful = results.filter((r) => r.result !== null).map((r) => r.result);
    const errors = [];
    // results is index-aligned with this.options.urls.
    results.forEach((r, index) => {
      if (r.result === null && r.error) {
        errors.push({ url: this.options.urls[index], error: r.error });
      }
    });
    const batchMetadata = {
      totalUrls: this.options.urls.length,
      successfulUrls: successful.length,
      failedUrls: results.filter((r) => r.result === null).length,
      scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
      totalDuration: Date.now() - startTime,
      errors
    };
    return {
      data: successful,
      batchMetadata
    };
  }
};
1600
/**
 * Convenience wrapper: construct a Scraper for the given options and run
 * the whole batch once.
 */
async function scrape(options) {
  return new Scraper(options).scrape();
}
1604
+
1605
+ // src/crawler.ts
1606
+ import { parseHTML as parseHTML4 } from "linkedom";
1607
+
1608
+ // src/utils/rate-limiter.ts
1609
+ import pLimit2 from "p-limit";
1610
+ async function rateLimit(ms) {
1611
+ return new Promise((resolve) => setTimeout(resolve, ms));
1612
+ }
1613
+
1614
+ // src/crawler.ts
1615
/**
 * Breadth-first same-domain crawler. Discovers URLs from a seed page up to a
 * configured depth/page limit, honoring robots.txt and include/exclude
 * patterns, and can optionally scrape every discovered page afterwards.
 */
var Crawler = class {
  options;
  // Keys of pages already fetched (via getUrlKey), to avoid re-visits.
  visited = /* @__PURE__ */ new Set();
  // BFS frontier of { url, depth } entries.
  queue = [];
  // Successfully fetched pages ({ url, title, description }).
  urls = [];
  pool;
  logger = createLogger("crawler");
  // Parsed robots.txt for the seed origin; null when unavailable.
  robotsRules = null;
  constructor(options) {
    if (!options.pool) {
      throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
    }
    this.pool = options.pool;
    this.options = {
      url: options.url,
      depth: options.depth || 1,
      maxPages: options.maxPages || 20,
      scrape: options.scrape || false,
      delayMs: options.delayMs || 1e3,
      timeoutMs: options.timeoutMs,
      includePatterns: options.includePatterns,
      excludePatterns: options.excludePatterns,
      formats: options.formats || ["markdown", "html"],
      scrapeConcurrency: options.scrapeConcurrency || 2,
      proxy: options.proxy,
      userAgent: options.userAgent,
      verbose: options.verbose || false,
      showChrome: options.showChrome || false,
      connectionToCore: options.connectionToCore,
      // Content cleaning options
      removeAds: options.removeAds,
      removeBase64Images: options.removeBase64Images
    };
  }
  /**
   * Start crawling: BFS from the seed URL until the queue empties, maxPages
   * is reached, or the overall timeout elapses. Optionally scrapes all
   * discovered URLs at the end.
   */
  async crawl() {
    const startTime = Date.now();
    this.robotsRules = await fetchRobotsTxt(this.options.url);
    if (this.robotsRules) {
      this.logger.info("Loaded robots.txt rules");
    }
    if (isUrlAllowed(this.options.url, this.robotsRules)) {
      this.queue.push({ url: this.options.url, depth: 0 });
    } else {
      this.logger.warn(`Seed URL blocked by robots.txt: ${this.options.url}`);
    }
    while (this.queue.length > 0 && this.urls.length < this.options.maxPages) {
      if (this.options.timeoutMs && Date.now() - startTime > this.options.timeoutMs) {
        this.logger.warn(`Crawl timed out after ${this.options.timeoutMs}ms`);
        break;
      }
      const item = this.queue.shift();
      const urlKey = getUrlKey(item.url);
      if (this.visited.has(urlKey)) {
        continue;
      }
      // NOTE(review): a failed fetch is not marked visited, so the same URL
      // can be re-enqueued and retried if rediscovered — confirm intended.
      const result = await this.fetchPage(item.url);
      if (result) {
        this.urls.push(result.crawlUrl);
        this.visited.add(urlKey);
        // Only expand links if we have depth budget left.
        if (item.depth < this.options.depth) {
          const links = this.extractLinks(result.html, item.url, item.depth + 1);
          this.queue.push(...links);
        }
      }
      // robots.txt crawl-delay (ms) overrides the configured delay.
      const delay = this.robotsRules?.crawlDelay || this.options.delayMs;
      await rateLimit(delay);
    }
    const metadata = {
      totalUrls: this.urls.length,
      maxDepth: this.options.depth,
      totalDuration: Date.now() - startTime,
      seedUrl: this.options.url
    };
    let scraped;
    if (this.options.scrape) {
      scraped = await this.scrapeDiscoveredUrls();
    }
    return {
      urls: this.urls,
      scraped,
      metadata
    };
  }
  /**
   * Fetch a single page and extract basic info (title, meta description,
   * full HTML). Handles anti-bot challenges; returns null on any failure.
   */
  async fetchPage(url) {
    try {
      return await this.pool.withBrowser(async (hero) => {
        await hero.goto(url, { timeoutMs: 3e4 });
        await hero.waitForPaintingStable();
        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          if (this.options.verbose) {
            this.logger.info(`Challenge detected on ${url}`);
          }
          const result = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: this.options.verbose,
            initialUrl
          });
          if (!result.resolved) {
            throw new Error(`Challenge not resolved`);
          }
        }
        const title = await hero.document.title;
        const html = await hero.document.documentElement.outerHTML;
        let description = null;
        try {
          const metaDesc = await hero.document.querySelector('meta[name="description"]');
          if (metaDesc) {
            description = await metaDesc.getAttribute("content");
          }
        } catch {
          // Description is optional; ignore DOM query failures.
        }
        return {
          crawlUrl: {
            url,
            title: title || "Untitled",
            description
          },
          html
        };
      });
    } catch (error) {
      this.logger.error(`Failed to fetch ${url}: ${error.message}`);
      return null;
    }
  }
  /**
   * Extract links from HTML content using DOM parsing
   * Handles all href formats (single quotes, double quotes, unquoted).
   * Applies, in order: URL resolution/validity, same-domain check, content
   * heuristics, include/exclude patterns, robots.txt, and de-duplication
   * against both visited pages and the pending queue.
   */
  extractLinks(html, baseUrl, depth) {
    const links = [];
    const { document } = parseHTML4(html);
    document.querySelectorAll("a[href]").forEach((anchor) => {
      const href = anchor.getAttribute("href");
      if (!href) return;
      const resolved = resolveUrl(href, baseUrl);
      if (!resolved || !isValidUrl(resolved)) return;
      if (!isSameDomain(resolved, this.options.url)) return;
      if (!isContentUrl(resolved)) return;
      if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
      if (!isUrlAllowed(resolved, this.robotsRules)) return;
      const urlKey = getUrlKey(resolved);
      if (this.visited.has(urlKey) || this.queue.some((q) => getUrlKey(q.url) === urlKey)) {
        return;
      }
      links.push({ url: resolved, depth });
    });
    return links;
  }
  /**
   * Scrape all discovered URLs as one batch, reusing this crawler's pool,
   * proxy, and content-cleaning settings.
   */
  async scrapeDiscoveredUrls() {
    const urls = this.urls.map((u) => u.url);
    return scrape({
      urls,
      formats: this.options.formats,
      batchConcurrency: this.options.scrapeConcurrency,
      proxy: this.options.proxy,
      userAgent: this.options.userAgent,
      verbose: this.options.verbose,
      showChrome: this.options.showChrome,
      pool: this.pool,
      // Content cleaning options
      removeAds: this.options.removeAds,
      removeBase64Images: this.options.removeBase64Images
    });
  }
};
1793
/**
 * Convenience wrapper: construct a Crawler for the given options and run it.
 */
async function crawl(options) {
  return new Crawler(options).crawl();
}
1797
+
1798
+ // src/browser/pool.ts
1799
+ import Hero from "@ulixee/hero";
1800
+
1801
+ // src/proxy/config.ts
1802
/**
 * Build an upstream proxy URL from a proxy config.
 * A fully-formed `config.url` wins; otherwise the URL is assembled from
 * credentials + host + port. Residential configs get a provider-style
 * username of the form customer-<user>_session-<id>_country-<cc>, where a
 * fresh session id requests a sticky IP for this browser instance.
 */
function createProxyUrl(config) {
  if (config.url) {
    return config.url;
  }
  // BUGFIX: credentials may contain URL-reserved characters ("@", ":", "/");
  // encode them so the resulting proxy URL parses correctly.
  const username = encodeURIComponent(config.username);
  const password = encodeURIComponent(config.password);
  if (config.type === "residential") {
    const sessionId = `hero_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    return `http://customer-${username}_session-${sessionId}_country-${config.country || "us"}:${password}@${config.host}:${config.port}`;
  }
  return `http://${username}:${password}@${config.host}:${config.port}`;
}
1812
+
1813
+ // src/browser/hero-config.ts
1814
/**
 * Build the Hero agent configuration used for every browser instance.
 * The key setting is disableMitm: false, which enables TLS/TCP fingerprint
 * emulation — essential for passing Cloudflare and similar anti-bot checks.
 */
function createHeroConfig(options = {}) {
  const config = {
    // Show or hide the Chrome window.
    showChrome: options.showChrome ?? false,
    // CRITICAL: false enables TLS/TCP fingerprint emulation.
    disableMitm: false,
    // Incognito gives each session clean state.
    disableIncognito: false,
    // Required in containerized environments (Docker etc.).
    noChromeSandbox: true,
    // DNS over TLS via Cloudflare (1.1.1.1) mimics real Chrome lookups.
    dnsOverTlsProvider: {
      host: "1.1.1.1",
      servername: "cloudflare-dns.com"
    },
    // Mask the real IP in WebRTC; ipify.org resolves the public address.
    upstreamProxyIpMask: {
      ipLookupService: "https://api.ipify.org?format=json"
    },
    // Locale and timezone presented to the page.
    locale: "en-US",
    timezoneId: "America/New_York",
    // Standard desktop viewport.
    viewport: {
      width: 1920,
      height: 1080
    }
  };
  // Optional overrides are only set when provided.
  if (options.userAgent) {
    config.userAgent = options.userAgent;
  }
  if (options.connectionToCore) {
    config.connectionToCore = options.connectionToCore;
  }
  if (options.proxy) {
    config.upstreamProxyUrl = createProxyUrl(options.proxy);
    config.upstreamProxyUseSystemDns = false;
  }
  return config;
}
1878
+
1879
+ // src/browser/pool.ts
1880
// Defaults for the browser pool; caller config is spread over these.
var DEFAULT_POOL_CONFIG = {
  size: 2,
  // Recycle a browser after this many pages...
  retireAfterPageCount: 100,
  // ...or after this much wall-clock age.
  retireAfterAgeMs: 30 * 60 * 1e3,
  // 30 minutes
  recycleCheckInterval: 60 * 1e3,
  // 1 minute
  healthCheckInterval: 5 * 60 * 1e3,
  // 5 minutes
  maxConsecutiveFailures: 3,
  // Pending-acquire queue limits.
  maxQueueSize: 100,
  queueTimeout: 60 * 1e3
  // 1 minute
};
1894
/**
 * Unique-enough browser instance id: timestamp plus a short random
 * base36 suffix.
 */
function generateId() {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 9);
  return `browser_${timestamp}_${suffix}`;
}
1897
// Fixed-size pool of Hero browser instances with queuing, background
// recycling (by age / request count) and periodic health checks.
var BrowserPool = class {
  // All live instances (idle + busy + recycling + unhealthy).
  instances = [];
  // Idle instances ready to be handed out by acquire().
  available = [];
  // Instances currently checked out to callers.
  inUse = /* @__PURE__ */ new Set();
  // Pending acquire() calls waiting for a browser to free up.
  queue = [];
  config;
  proxy;
  recycleTimer;
  healthTimer;
  totalRequests = 0;
  totalRequestDuration = 0;
  showChrome;
  connectionToCore;
  userAgent;
  verbose;
  logger = createLogger("pool");
  constructor(config = {}, proxy, showChrome = false, connectionToCore, userAgent, verbose = false) {
    this.config = { ...DEFAULT_POOL_CONFIG, ...config };
    this.proxy = proxy;
    this.showChrome = showChrome;
    this.connectionToCore = connectionToCore;
    this.userAgent = userAgent;
    this.verbose = verbose;
  }
  /**
   * Initialize the pool by pre-launching browsers in parallel, then start
   * the background recycling and health-check timers.
   */
  async initialize() {
    if (this.verbose) {
      this.logger.info(`Initializing pool with ${this.config.size} browsers...`);
    }
    const launchPromises = [];
    for (let i = 0; i < this.config.size; i++) {
      launchPromises.push(this.createInstance());
    }
    this.instances = await Promise.all(launchPromises);
    this.available = [...this.instances];
    this.startRecycling();
    this.startHealthChecks();
    if (this.verbose) {
      this.logger.info(`Pool ready: ${this.instances.length} browsers available`);
    }
  }
  /**
   * Shutdown the pool and close all browsers. Rejects queued acquire()
   * calls and cancels their timeout timers.
   */
  async shutdown() {
    if (this.verbose) {
      const stats = this.getStats();
      this.logger.info(
        `Shutting down pool: ${stats.totalRequests} total requests processed, ${Math.round(stats.avgRequestDuration)}ms avg duration`
      );
    }
    if (this.recycleTimer) clearInterval(this.recycleTimer);
    if (this.healthTimer) clearInterval(this.healthTimer);
    for (const item of this.queue) {
      // FIX: cancel each waiter's timeout so no timer fires after shutdown.
      if (item.timer) clearTimeout(item.timer);
      item.reject(new Error("Pool shutting down"));
    }
    this.queue = [];
    const closePromises = this.instances.map((instance) => instance.hero.close().catch(() => {
    }));
    await Promise.all(closePromises);
    if (this.connectionToCore) {
      try {
        await this.connectionToCore.disconnect();
      } catch {
      }
      this.connectionToCore = void 0;
    }
    this.instances = [];
    this.available = [];
    this.inUse.clear();
  }
  /**
   * Acquire a browser from the pool; queues the request when none are idle.
   * @returns the Hero instance (caller must release() it when done)
   */
  async acquire() {
    const instance = this.available.shift();
    if (!instance) {
      if (this.verbose) {
        this.logger.info(`No browsers available, queuing request (queue: ${this.queue.length + 1})`);
      }
      return this.queueRequest();
    }
    instance.status = "busy";
    instance.lastUsed = Date.now();
    this.inUse.add(instance);
    if (this.verbose) {
      this.logger.info(
        `Acquired browser ${instance.id} (available: ${this.available.length}, busy: ${this.inUse.size})`
      );
    }
    return instance.hero;
  }
  /**
   * Release a browser back to the pool. Triggers recycling when the
   * instance has hit its age or request-count limit.
   */
  release(hero) {
    const instance = this.instances.find((i) => i.hero === hero);
    if (!instance) return;
    instance.status = "idle";
    instance.requestCount++;
    this.inUse.delete(instance);
    if (this.verbose) {
      this.logger.info(
        `Released browser ${instance.id} (requests: ${instance.requestCount}, available: ${this.available.length + 1})`
      );
    }
    if (this.shouldRecycle(instance)) {
      if (this.verbose) {
        this.logger.info(`Recycling browser ${instance.id} (age or request limit reached)`);
      }
      this.recycleInstance(instance).catch(() => {
      });
    } else {
      this.available.push(instance);
      this.processQueue();
    }
  }
  /**
   * Execute callback with an auto-managed browser (acquire/release bracketing).
   */
  async withBrowser(callback) {
    const startTime = Date.now();
    const hero = await this.acquire();
    try {
      const result = await callback(hero);
      this.totalRequests++;
      this.totalRequestDuration += Date.now() - startTime;
      return result;
    } finally {
      this.release(hero);
    }
  }
  /**
   * Get pool statistics (counts per state, queue length, request averages).
   */
  getStats() {
    const recycling = this.instances.filter((i) => i.status === "recycling").length;
    const unhealthy = this.instances.filter((i) => i.status === "unhealthy").length;
    return {
      total: this.instances.length,
      available: this.available.length,
      busy: this.inUse.size,
      recycling,
      unhealthy,
      queueLength: this.queue.length,
      totalRequests: this.totalRequests,
      avgRequestDuration: this.totalRequests > 0 ? this.totalRequestDuration / this.totalRequests : 0
    };
  }
  /**
   * Run a health check; reports unhealthy instances, near-full queue and
   * saturation (all browsers busy while requests wait).
   */
  async healthCheck() {
    const issues = [];
    const stats = this.getStats();
    if (stats.unhealthy > 0) {
      issues.push(`${stats.unhealthy} unhealthy instances`);
    }
    if (stats.queueLength > this.config.maxQueueSize * 0.8) {
      issues.push(`Queue near capacity: ${stats.queueLength}/${this.config.maxQueueSize}`);
    }
    if (stats.available === 0 && stats.queueLength > 0) {
      issues.push("Pool saturated - all browsers busy with pending requests");
    }
    return {
      healthy: issues.length === 0,
      issues,
      stats
    };
  }
  // =========================================================================
  // Private methods
  // =========================================================================
  /**
   * Create a new browser instance record wrapping a fresh Hero.
   */
  async createInstance() {
    const heroConfig = createHeroConfig({
      proxy: this.proxy,
      showChrome: this.showChrome,
      connectionToCore: this.connectionToCore,
      userAgent: this.userAgent
    });
    const hero = new Hero(heroConfig);
    return {
      hero,
      id: generateId(),
      createdAt: Date.now(),
      lastUsed: Date.now(),
      requestCount: 0,
      status: "idle"
    };
  }
  /**
   * Check if instance should be recycled (request-count or age limit hit).
   */
  shouldRecycle(instance) {
    const age = Date.now() - instance.createdAt;
    return instance.requestCount >= this.config.retireAfterPageCount || age >= this.config.retireAfterAgeMs;
  }
  /**
   * Recycle an instance (close old, create new).
   * FIX: remove the old instance from `available` first. The background
   * recycler targets idle instances that may still sit in the available
   * list; without this removal, acquire() could hand out a closed browser.
   */
  async recycleInstance(instance) {
    instance.status = "recycling";
    const availableIndex = this.available.indexOf(instance);
    if (availableIndex !== -1) {
      this.available.splice(availableIndex, 1);
    }
    try {
      await instance.hero.close().catch(() => {
      });
      const newInstance = await this.createInstance();
      const index = this.instances.indexOf(instance);
      if (index !== -1) {
        this.instances[index] = newInstance;
      }
      this.available.push(newInstance);
      if (this.verbose) {
        this.logger.info(`Recycled browser: ${instance.id} \u2192 ${newInstance.id}`);
      }
      this.processQueue();
    } catch (error) {
      instance.status = "unhealthy";
      if (this.verbose) {
        this.logger.warn(`Failed to recycle browser ${instance.id}`);
      }
    }
  }
  /**
   * Queue a request when no browsers are available.
   * FIX: the timeout handle is stored on the queue item so it can be
   * cancelled when the item is served (processQueue) or the pool shuts
   * down, instead of leaving a dangling timer with a useless late reject.
   */
  queueRequest() {
    return new Promise((resolve, reject) => {
      if (this.queue.length >= this.config.maxQueueSize) {
        reject(new Error("Queue full"));
        return;
      }
      const item = {
        resolve,
        reject,
        queuedAt: Date.now(),
        timer: void 0
      };
      item.timer = setTimeout(() => {
        const index = this.queue.indexOf(item);
        if (index !== -1) {
          this.queue.splice(index, 1);
          reject(new Error("Queue timeout"));
        }
      }, this.config.queueTimeout);
      this.queue.push(item);
    });
  }
  /**
   * Process queued requests while browsers are available, dropping waiters
   * that have already exceeded the queue timeout.
   */
  processQueue() {
    while (this.queue.length > 0 && this.available.length > 0) {
      const item = this.queue.shift();
      if (item.timer) clearTimeout(item.timer);
      const age = Date.now() - item.queuedAt;
      if (age > this.config.queueTimeout) {
        item.reject(new Error("Queue timeout"));
        continue;
      }
      this.acquire().then(item.resolve).catch(item.reject);
    }
  }
  /**
   * Start background recycling task. unref() so the interval never keeps
   * the process alive on its own.
   */
  startRecycling() {
    this.recycleTimer = setInterval(() => {
      for (const instance of this.instances) {
        if (instance.status === "idle" && this.shouldRecycle(instance)) {
          this.recycleInstance(instance).catch(() => {
          });
        }
      }
    }, this.config.recycleCheckInterval);
    this.recycleTimer.unref();
  }
  /**
   * Start background health checks that log issues when unhealthy.
   */
  startHealthChecks() {
    this.healthTimer = setInterval(async () => {
      const health = await this.healthCheck();
      if (!health.healthy && health.issues.length > 0) {
        console.warn("[BrowserPool] Health issues:", health.issues);
      }
    }, this.config.healthCheckInterval);
    this.healthTimer.unref();
  }
};
2189
+
2190
// src/client.ts
var logger2 = createLogger("client");
// High-level client: owns an in-process HeroCore plus a BrowserPool and
// exposes scrape()/crawl() with lazy initialization and proxy rotation.
var ReaderClient = class {
  heroCore = null;
  pool = null;
  initialized = false;
  // In-flight initialization promise; lets concurrent start() calls share one init.
  initializing = null;
  closed = false;
  options;
  proxyIndex = 0;
  cleanupHandler = null;
  // Handler installed for SIGINT/SIGTERM so it can be removed again.
  signalHandler = null;
  constructor(options = {}) {
    this.options = options;
    const skipTLS = options.skipTLSVerification ?? true;
    if (skipTLS) {
      process.env.MITM_ALLOW_INSECURE = "true";
    }
    this.registerCleanup();
  }
  /**
   * Get the next proxy from the rotation pool (round-robin or random).
   */
  getNextProxy() {
    const { proxies, proxyRotation = "round-robin" } = this.options;
    if (!proxies || proxies.length === 0) {
      return void 0;
    }
    if (proxyRotation === "random") {
      return proxies[Math.floor(Math.random() * proxies.length)];
    }
    const proxy = proxies[this.proxyIndex % proxies.length];
    this.proxyIndex++;
    return proxy;
  }
  /**
   * Initialize HeroCore. Called automatically on first scrape/crawl.
   * Can be called explicitly if you want to pre-warm the client.
   * FIX: `initializing` is reset in a finally block — previously a failed
   * init left a rejected promise behind, so every later start() re-awaited
   * the same error and the client could never retry.
   */
  async start() {
    if (this.closed) {
      throw new Error("ReaderClient has been closed. Create a new instance.");
    }
    if (this.initialized) {
      return;
    }
    if (this.initializing) {
      await this.initializing;
      return;
    }
    this.initializing = this.initializeCore();
    try {
      await this.initializing;
    } finally {
      this.initializing = null;
    }
  }
  /**
   * Internal initialization: start HeroCore, then build and warm the
   * browser pool. On any failure, tears down partial state and rethrows
   * with a friendlier message for known causes (port in use, no Chrome).
   */
  async initializeCore() {
    try {
      if (this.options.verbose) {
        logger2.info("Starting HeroCore...");
      }
      this.heroCore = new HeroCore();
      await this.heroCore.start();
      if (this.options.verbose) {
        logger2.info("HeroCore started successfully");
      }
      if (this.options.verbose) {
        logger2.info("Initializing browser pool...");
      }
      const browserPoolConfig = this.options.browserPool;
      const poolConfig = {
        size: browserPoolConfig?.size ?? 2,
        retireAfterPageCount: browserPoolConfig?.retireAfterPages ?? 100,
        retireAfterAgeMs: (browserPoolConfig?.retireAfterMinutes ?? 30) * 60 * 1e3,
        maxQueueSize: browserPoolConfig?.maxQueueSize ?? 100
      };
      this.pool = new BrowserPool(
        poolConfig,
        void 0,
        // proxy set per-request
        this.options.showChrome,
        this.createConnection(),
        void 0,
        // userAgent
        this.options.verbose
      );
      await this.pool.initialize();
      this.initialized = true;
      if (this.options.verbose) {
        logger2.info("Browser pool initialized successfully");
      }
    } catch (error) {
      // Roll back any partially-created resources before rethrowing.
      if (this.pool) {
        await this.pool.shutdown().catch(() => {
        });
        this.pool = null;
      }
      if (this.heroCore) {
        await this.heroCore.close().catch(() => {
        });
        this.heroCore = null;
      }
      this.initialized = false;
      const message = error.message || String(error);
      if (message.includes("EADDRINUSE")) {
        throw new Error(
          "Failed to start HeroCore: Port already in use. Another instance may be running. Close it or use a different port."
        );
      }
      if (message.includes("chrome") || message.includes("Chrome")) {
        throw new Error(
          "Failed to start HeroCore: Chrome/Chromium not found. Please install Chrome or set CHROME_PATH environment variable."
        );
      }
      throw new Error(`Failed to start HeroCore: ${message}`);
    }
  }
  /**
   * Create an in-process connection to the HeroCore instance via a
   * TransportBridge (no TCP involved).
   */
  createConnection() {
    if (!this.heroCore) {
      throw new Error("HeroCore not initialized. This should not happen.");
    }
    const bridge = new TransportBridge();
    this.heroCore.addConnection(bridge.transportToClient);
    return new ConnectionToHeroCore(bridge.transportToCore);
  }
  /**
   * Ensure client is initialized before an operation.
   */
  async ensureInitialized() {
    if (this.closed) {
      throw new Error("ReaderClient has been closed. Create a new instance.");
    }
    if (!this.initialized) {
      await this.start();
    }
  }
  /**
   * Scrape one or more URLs
   *
   * @param options - Scrape options (urls, formats, etc.)
   * @returns Scrape result with data and metadata
   *
   * @example
   * const result = await reader.scrape({
   *   urls: ['https://example.com'],
   *   formats: ['markdown', 'html'],
   * });
   */
  async scrape(options) {
    await this.ensureInitialized();
    if (!this.pool) {
      throw new Error("Browser pool not initialized. This should not happen.");
    }
    const proxy = options.proxy ?? this.getNextProxy();
    return await scrape({
      ...options,
      proxy,
      showChrome: options.showChrome ?? this.options.showChrome,
      verbose: options.verbose ?? this.options.verbose,
      pool: this.pool
    });
  }
  /**
   * Crawl a website to discover URLs
   *
   * @param options - Crawl options (url, depth, maxPages, etc.)
   * @returns Crawl result with discovered URLs and optional scraped content
   *
   * @example
   * const result = await reader.crawl({
   *   url: 'https://example.com',
   *   depth: 2,
   *   maxPages: 50,
   *   scrape: true,
   * });
   */
  async crawl(options) {
    await this.ensureInitialized();
    if (!this.pool) {
      throw new Error("Browser pool not initialized. This should not happen.");
    }
    const proxy = options.proxy ?? this.getNextProxy();
    return await crawl({
      ...options,
      proxy,
      pool: this.pool
    });
  }
  /**
   * Check if the client is initialized and ready.
   */
  isReady() {
    return this.initialized && !this.closed;
  }
  /**
   * Close the client and release resources
   *
   * Note: This is optional - the client will auto-close on process exit.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.removeCleanupHandlers();
    if (this.pool) {
      if (this.options.verbose) {
        logger2.info("Shutting down browser pool...");
      }
      try {
        await this.pool.shutdown();
      } catch (error) {
        if (this.options.verbose) {
          logger2.warn(`Error shutting down pool: ${error.message}`);
        }
      }
      this.pool = null;
    }
    if (this.heroCore) {
      if (this.options.verbose) {
        logger2.info("Closing HeroCore...");
      }
      try {
        await this.heroCore.close();
        await HeroCore.shutdown();
      } catch (error) {
        if (this.options.verbose) {
          logger2.warn(`Error closing HeroCore: ${error.message}`);
        }
      }
      this.heroCore = null;
    }
    this.initialized = false;
    if (this.options.verbose) {
      logger2.info("ReaderClient closed");
    }
  }
  /**
   * Register cleanup handlers for process exit.
   * FIX: the SIGINT/SIGTERM handler is stored in a field (not anonymous)
   * so removeCleanupHandlers() can detach it — previously those listeners
   * were unremovable and leaked across client instances, each calling
   * process.exit(0).
   */
  registerCleanup() {
    this.cleanupHandler = async () => {
      await this.close();
    };
    this.signalHandler = async () => {
      await this.cleanupHandler?.();
      process.exit(0);
    };
    process.once("beforeExit", this.cleanupHandler);
    process.once("SIGINT", this.signalHandler);
    process.once("SIGTERM", this.signalHandler);
  }
  /**
   * Remove process cleanup handlers (called from close()).
   */
  removeCleanupHandlers() {
    if (this.cleanupHandler) {
      process.removeListener("beforeExit", this.cleanupHandler);
      this.cleanupHandler = null;
    }
    if (this.signalHandler) {
      process.removeListener("SIGINT", this.signalHandler);
      process.removeListener("SIGTERM", this.signalHandler);
      this.signalHandler = null;
    }
  }
};
2457
+
2458
// src/daemon/server.ts
import http from "http";
// Namespaced logger for daemon lifecycle messages.
var logger3 = createLogger("daemon");
// Default TCP port the daemon listens on when none is configured.
var DEFAULT_DAEMON_PORT = 3847;
// Name of the PID/metadata file written to the OS temp directory.
var PID_FILE_NAME = ".reader-daemon.pid";
2463
// Long-lived HTTP daemon wrapping a ReaderClient: accepts JSON-RPC-style
// POST requests (scrape/crawl/status/shutdown) on a local port and records
// its PID in a temp file for discovery by the CLI.
var DaemonServer = class {
  server = null;
  client = null;
  options;
  startTime = 0;
  constructor(options = {}) {
    this.options = {
      port: options.port ?? DEFAULT_DAEMON_PORT,
      poolSize: options.poolSize ?? 5,
      verbose: options.verbose ?? false,
      showChrome: options.showChrome ?? false
    };
  }
  /**
   * Start the daemon server: boot the ReaderClient (browser pool), bind the
   * HTTP server, then persist the PID file.
   * FIX: if the bind fails (e.g. EADDRINUSE), the already-started client is
   * closed and state is reset — previously the browser pool was leaked and
   * a retry incorrectly reported "Daemon is already running".
   */
  async start() {
    if (this.server) {
      throw new Error("Daemon is already running");
    }
    const clientOptions = {
      verbose: this.options.verbose,
      showChrome: this.options.showChrome,
      browserPool: {
        size: this.options.poolSize
      }
    };
    this.client = new ReaderClient(clientOptions);
    await this.client.start();
    this.server = http.createServer(this.handleRequest.bind(this));
    try {
      await new Promise((resolve, reject) => {
        // Attach the error handler before listen() so bind failures are caught.
        this.server.on("error", (error) => {
          if (error.code === "EADDRINUSE") {
            reject(new Error(`Port ${this.options.port} is already in use. Is another daemon running?`));
          } else {
            reject(error);
          }
        });
        this.server.listen(this.options.port, () => {
          this.startTime = Date.now();
          if (this.options.verbose) {
            logger3.info(`Daemon started on port ${this.options.port} with pool size ${this.options.poolSize}`);
          }
          resolve();
        });
      });
    } catch (error) {
      // Roll back: release the browser pool and clear state before rethrowing.
      this.server = null;
      if (this.client) {
        await this.client.close().catch(() => {
        });
        this.client = null;
      }
      throw error;
    }
    await this.writePidFile();
  }
  /**
   * Stop the daemon server: close the HTTP listener, shut down the client
   * (browser pool) and remove the PID file.
   */
  async stop() {
    if (this.server) {
      await new Promise((resolve) => {
        this.server.close(() => resolve());
      });
      this.server = null;
    }
    if (this.client) {
      await this.client.close();
      this.client = null;
    }
    await this.removePidFile();
    if (this.options.verbose) {
      logger3.info("Daemon stopped");
    }
  }
  /**
   * Get the port the daemon is running on.
   */
  getPort() {
    return this.options.port;
  }
  /**
   * Handle incoming HTTP requests. Only `POST /` is accepted; the JSON body
   * must carry an `action` field that selects the handler.
   */
  async handleRequest(req, res) {
    if (req.method !== "POST" || req.url !== "/") {
      res.writeHead(404, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ success: false, error: "Not found" }));
      return;
    }
    let body = "";
    for await (const chunk of req) {
      body += chunk;
    }
    let request;
    try {
      request = JSON.parse(body);
    } catch {
      this.sendResponse(res, 400, { success: false, error: "Invalid JSON" });
      return;
    }
    try {
      switch (request.action) {
        case "scrape":
          await this.handleScrape(res, request.options);
          break;
        case "crawl":
          await this.handleCrawl(res, request.options);
          break;
        case "status":
          this.handleStatus(res);
          break;
        case "shutdown":
          await this.handleShutdown(res);
          break;
        default:
          this.sendResponse(res, 400, { success: false, error: "Unknown action" });
      }
    } catch (error) {
      this.sendResponse(res, 500, { success: false, error: error.message });
    }
  }
  /**
   * Handle a scrape request by delegating to the shared ReaderClient.
   */
  async handleScrape(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.scrape(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle a crawl request by delegating to the shared ReaderClient.
   */
  async handleCrawl(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.crawl(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle a status request: report port, pool size, uptime and PID.
   */
  handleStatus(res) {
    const status = {
      running: true,
      port: this.options.port,
      poolSize: this.options.poolSize,
      uptime: Date.now() - this.startTime,
      pid: process.pid
    };
    this.sendResponse(res, 200, { success: true, data: status });
  }
  /**
   * Handle a shutdown request: acknowledge first, then stop and exit after
   * a short delay so the response can be flushed to the caller.
   */
  async handleShutdown(res) {
    this.sendResponse(res, 200, { success: true, data: { message: "Shutting down" } });
    setTimeout(() => {
      this.stop().then(() => process.exit(0));
    }, 100);
  }
  /**
   * Send a JSON response with the given status code.
   */
  sendResponse(res, statusCode, data) {
    res.writeHead(statusCode, { "Content-Type": "application/json" });
    res.end(JSON.stringify(data));
  }
  /**
   * Write the PID file (pid, port, start time) to the OS temp directory.
   */
  async writePidFile() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const os = await import("os");
    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);
    const data = JSON.stringify({
      pid: process.pid,
      port: this.options.port,
      startedAt: (/* @__PURE__ */ new Date()).toISOString()
    });
    await fs.writeFile(pidFile, data);
  }
  /**
   * Remove the PID file; missing-file errors are deliberately ignored.
   */
  async removePidFile() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const os = await import("os");
    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);
    try {
      await fs.unlink(pidFile);
    } catch {
    }
  }
};
2657
/**
 * Resolve the absolute path of the daemon PID file in the OS temp directory.
 */
async function getPidFilePath() {
  const [path, os] = await Promise.all([import("path"), import("os")]);
  return path.join(os.tmpdir(), PID_FILE_NAME);
}
2662
/**
 * Read the daemon PID file and verify the recorded process is still alive
 * (signal 0 probe). Returns the parsed info, or null when no daemon is
 * running; a stale PID file is removed as a side effect.
 */
async function getDaemonInfo() {
  const fs = await import("fs/promises");
  const pidFile = await getPidFilePath();
  let info;
  try {
    info = JSON.parse(await fs.readFile(pidFile, "utf-8"));
  } catch {
    // Missing or unparsable PID file: no daemon recorded.
    return null;
  }
  try {
    // Signal 0 performs an existence check without delivering a signal.
    process.kill(info.pid, 0);
  } catch {
    // Process is gone: clean up the stale PID file.
    await fs.unlink(pidFile).catch(() => {
    });
    return null;
  }
  return info;
}
2680
+
2681
// src/daemon/client.ts
import http2 from "http";
// Thin HTTP client for the local reader daemon: serializes an action +
// options payload to POST / and unwraps the {success, data|error} envelope.
var DaemonClient = class {
  options;
  constructor(options = {}) {
    const port = options.port ?? DEFAULT_DAEMON_PORT;
    // 10 minutes default
    const timeoutMs = options.timeoutMs ?? 6e5;
    this.options = { port, timeoutMs };
  }
  /**
   * Scrape URLs via daemon.
   */
  async scrape(options) {
    return this.request({ action: "scrape", options });
  }
  /**
   * Crawl URL via daemon.
   */
  async crawl(options) {
    return this.request({ action: "crawl", options });
  }
  /**
   * Get daemon status.
   */
  async status() {
    return this.request({ action: "status" });
  }
  /**
   * Request daemon shutdown.
   */
  async shutdown() {
    await this.request({ action: "shutdown" });
  }
  /**
   * Check if daemon is reachable (status probe, errors mapped to false).
   */
  async isRunning() {
    try {
      await this.status();
      return true;
    } catch {
      return false;
    }
  }
  /**
   * Make an HTTP request to the daemon and resolve with the unwrapped
   * `data` field, rejecting on transport errors, timeouts or a
   * success=false envelope.
   */
  request(body) {
    const payload = JSON.stringify(body);
    const requestOptions = {
      hostname: "127.0.0.1",
      port: this.options.port,
      path: "/",
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(payload)
      },
      timeout: this.options.timeoutMs
    };
    return new Promise((resolve, reject) => {
      const req = http2.request(requestOptions, (res) => {
        const chunks = [];
        res.on("data", (chunk) => {
          chunks.push(chunk);
        });
        res.on("end", () => {
          const raw = chunks.join("");
          try {
            const parsed = JSON.parse(raw);
            if (parsed.success) {
              resolve(parsed.data);
            } else {
              reject(new Error(parsed.error || "Unknown daemon error"));
            }
          } catch {
            reject(new Error(`Failed to parse daemon response: ${raw}`));
          }
        });
      });
      req.on("error", (error) => {
        if (error.code === "ECONNREFUSED") {
          reject(new Error(`Cannot connect to daemon on port ${this.options.port}. Is it running?`));
        } else {
          reject(error);
        }
      });
      req.on("timeout", () => {
        req.destroy();
        reject(new Error(`Request to daemon timed out after ${this.options.timeoutMs}ms`));
      });
      // end(payload) writes the body and finishes the request in one step.
      req.end(payload);
    });
  }
};
2790
/**
 * Probe whether a reader daemon is reachable on the given port
 * (short 5s timeout so CLI startup stays snappy).
 */
async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
  const probe = new DaemonClient({ port, timeoutMs: 5e3 });
  return probe.isRunning();
}
2794
+
2795
// src/cli/index.ts
import { readFileSync, writeFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
// ESM has no __dirname; derive it from import.meta.url for the pkg lookup below.
var __dirname = dirname(fileURLToPath(import.meta.url));
// Read package.json at runtime so --version always matches the installed build.
var pkg = JSON.parse(readFileSync(join(__dirname, "../../package.json"), "utf-8"));
// Root commander program; subcommands are registered on it below.
var program = new Command();
program.name("reader").description(
  "Production-grade web scraping engine for LLMs. Clean markdown output, ready for your agents."
).version(pkg.version);
2805
// CLI `reader start`: launch the long-lived daemon (HTTP server + warm browser pool).
program.command("start").description("Start the reader daemon server").option("-p, --port <n>", `Port to listen on (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("--pool-size <n>", "Browser pool size", "5").option("--show-chrome", "Show browser windows for debugging").option("-v, --verbose", "Enable verbose logging").action(async (options) => {
  const port = parseInt(options.port, 10);
  // Refuse to double-start: probe the port for a live daemon first.
  if (await isDaemonRunning(port)) {
    console.error(`Error: Daemon is already running on port ${port}`);
    process.exit(1);
  }
  const daemon = new DaemonServer({
    port,
    poolSize: parseInt(options.poolSize, 10),
    verbose: options.verbose || false,
    showChrome: options.showChrome || false
  });
  try {
    await daemon.start();
    console.log(`Reader daemon started on port ${port} with pool size ${options.poolSize}`);
    console.log(`Use "npx reader stop" to stop the daemon`);
    // Graceful shutdown on Ctrl-C / kill so the pool and PID file are cleaned up.
    process.on("SIGINT", async () => {
      console.log("\nShutting down daemon...");
      await daemon.stop();
      process.exit(0);
    });
    process.on("SIGTERM", async () => {
      await daemon.stop();
      process.exit(0);
    });
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  }
});
2835
// CLI `reader stop`: ask a running daemon to shut itself down over HTTP.
program.command("stop").description("Stop the running reader daemon").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).action(async (options) => {
  const port = parseInt(options.port, 10);
  const client = new DaemonClient({ port });
  try {
    // A non-running daemon is not an error for `stop`; just report and return.
    if (!await client.isRunning()) {
      console.log("Daemon is not running");
      return;
    }
    await client.shutdown();
    console.log("Daemon stopped");
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  }
});
2850
// CLI `reader status`: check the PID file, then query the live daemon.
// NOTE(review): when the PID file is missing, --port is never consulted —
// a daemon on a custom port without a PID file reports "not running"; confirm
// whether --port should drive a direct probe in that case.
program.command("status").description("Check daemon status").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).action(async (options) => {
  const daemonInfo = await getDaemonInfo();
  if (!daemonInfo) {
    console.log("Daemon is not running");
    return;
  }
  // Explicit --port wins; otherwise use the port recorded in the PID file.
  const port = options.port ? parseInt(options.port, 10) : daemonInfo.port;
  const client = new DaemonClient({ port });
  try {
    const status = await client.status();
    console.log("Daemon is running:");
    console.log(` Port: ${status.port}`);
    console.log(` PID: ${status.pid}`);
    console.log(` Pool size: ${status.poolSize}`);
    console.log(` Uptime: ${Math.round(status.uptime / 1e3)}s`);
  } catch {
    // PID file existed but the HTTP probe failed.
    console.log("Daemon is not running (stale PID file)");
  }
});
2869
// CLI `reader scrape <urls...>`: scrape via a running daemon when one is
// reachable (unless --standalone), otherwise spin up a one-shot ReaderClient.
program.command("scrape <urls...>").description("Scrape one or more URLs").option(
  "-f, --format <formats>",
  "Output formats (comma-separated: markdown,html,json,text)",
  "markdown"
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--no-metadata", "Exclude metadata from output").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").action(async (urls, options) => {
  const port = parseInt(options.port, 10);
  const useStandalone = options.standalone || false;
  let useDaemon = false;
  if (!useStandalone) {
    useDaemon = await isDaemonRunning(port);
    if (options.verbose && useDaemon) {
      console.error(`Using daemon on port ${port}`);
    }
  }
  // Exactly one of these is non-null depending on the daemon probe above.
  const daemonClient = useDaemon ? new DaemonClient({ port }) : null;
  const standaloneClient = !useDaemon ? new ReaderClient({
    verbose: options.verbose || false,
    showChrome: options.showChrome || false
  }) : null;
  try {
    // Validate requested formats before doing any work.
    const formats = options.format.split(",").map((f) => f.trim());
    const validFormats = ["markdown", "html", "json", "text"];
    for (const format of formats) {
      if (!validFormats.includes(format)) {
        console.error(
          `Error: Invalid format "${format}". Valid formats: ${validFormats.join(", ")}`
        );
        process.exit(1);
      }
    }
    if (options.verbose) {
      console.error(`Scraping ${urls.length} URL(s)...`);
      console.error(`Formats: ${formats.join(", ")}`);
    }
    const scrapeOptions = {
      urls,
      formats,
      batchConcurrency: parseInt(options.concurrency, 10),
      timeoutMs: parseInt(options.timeout, 10),
      batchTimeoutMs: parseInt(options.batchTimeout, 10),
      proxy: options.proxy ? { url: options.proxy } : void 0,
      userAgent: options.userAgent,
      // commander maps --no-metadata to options.metadata === false.
      includeMetadata: options.metadata !== false,
      verbose: options.verbose || false,
      showChrome: options.showChrome || false,
      // Progress goes to stderr so stdout stays clean for piped output.
      onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
        console.error(`[${completed}/${total}] ${currentUrl}`);
      } : void 0
    };
    const result = useDaemon ? await daemonClient.scrape(scrapeOptions) : await standaloneClient.scrape(scrapeOptions);
    // Emit one format per site, by fixed precedence: markdown > text > html > json.
    let output = "";
    for (const site of result.data) {
      if (formats.includes("markdown") && site.markdown) {
        output += site.markdown + "\n\n";
      } else if (formats.includes("text") && site.text) {
        output += site.text + "\n\n";
      } else if (formats.includes("html") && site.html) {
        output += site.html + "\n\n";
      } else if (formats.includes("json") && site.json) {
        output += site.json + "\n\n";
      }
    }
    if (options.output) {
      writeFileSync(options.output, output.trim());
      if (options.verbose) {
        console.error(`Output written to ${options.output}`);
      }
    } else {
      console.log(output.trim());
    }
    if (options.verbose) {
      console.error(`
Summary:`);
      console.error(
        ` Successful: ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls}`
      );
      console.error(` Duration: ${result.batchMetadata.totalDuration}ms`);
    }
    // Any failed URL makes the command exit non-zero for scripting.
    if (result.batchMetadata.failedUrls > 0) {
      process.exit(1);
    }
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  } finally {
    // Standalone mode owns its browsers: close them, then exit explicitly so
    // lingering Hero handles cannot keep the process alive.
    if (standaloneClient) {
      await standaloneClient.close();
      process.exit(0);
    }
  }
});
2960
// Register the "crawl" subcommand: discover URLs starting from a seed page,
// optionally scrape each discovered page, and write the result to stdout or
// a file. Uses a running daemon when available unless --standalone is given.
program.command("crawl <url>").description("Crawl a website to discover and optionally scrape pages")
  .option("-d, --depth <n>", "Maximum crawl depth", "1")
  .option("-m, --max-pages <n>", "Maximum pages to discover", "20")
  .option("-s, --scrape", "Also scrape content of discovered pages")
  .option("-f, --format <formats>", "Output formats when scraping (comma-separated)", "markdown")
  .option("-o, --output <file>", "Output file (stdout if omitted)")
  .option("--delay <ms>", "Delay between requests in milliseconds", "1000")
  .option("-t, --timeout <ms>", "Total timeout for crawl operation in milliseconds")
  .option("--include <patterns>", "URL patterns to include (comma-separated regex)")
  .option("--exclude <patterns>", "URL patterns to exclude (comma-separated regex)")
  .option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)")
  .option("--user-agent <string>", "Custom user agent string")
  .option("--show-chrome", "Show browser window for debugging")
  .option("--standalone", "Force standalone mode (bypass daemon)")
  .option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT))
  .option("-v, --verbose", "Enable verbose logging")
  .action(async (url, options) => {
    const port = parseInt(options.port, 10);
    const useStandalone = options.standalone || false;
    // Prefer a running daemon unless the user forced standalone mode.
    let useDaemon = false;
    if (!useStandalone) {
      useDaemon = await isDaemonRunning(port);
      if (options.verbose && useDaemon) {
        console.error(`Using daemon on port ${port}`);
      }
    }
    // Exactly one of these is non-null depending on the chosen mode.
    const daemonClient = useDaemon ? new DaemonClient({ port }) : null;
    const standaloneClient = !useDaemon ? new ReaderClient({
      verbose: options.verbose || false,
      showChrome: options.showChrome || false
    }) : null;
    try {
      if (options.verbose) {
        console.error(`Crawling ${url}...`);
        console.error(`Max depth: ${options.depth}, Max pages: ${options.maxPages}`);
      }
      // Include/exclude are comma-separated regex strings; undefined when omitted.
      const includePatterns = options.include ? options.include.split(",").map((p) => p.trim()) : void 0;
      const excludePatterns = options.exclude ? options.exclude.split(",").map((p) => p.trim()) : void 0;
      const crawlOptions = {
        url,
        depth: parseInt(options.depth, 10),
        maxPages: parseInt(options.maxPages, 10),
        scrape: options.scrape || false,
        delayMs: parseInt(options.delay, 10),
        timeoutMs: options.timeout ? parseInt(options.timeout, 10) : void 0,
        includePatterns,
        excludePatterns,
        proxy: options.proxy ? { url: options.proxy } : void 0,
        userAgent: options.userAgent,
        verbose: options.verbose || false,
        showChrome: options.showChrome || false
      };
      const result = useDaemon ? await daemonClient.crawl(crawlOptions) : await standaloneClient.crawl(crawlOptions);
      let output = "";
      if (options.scrape && result.scraped) {
        // Concatenate each scraped page in the first matching requested format.
        const formats = options.format.split(",").map((f) => f.trim());
        for (const site of result.scraped.data) {
          if (formats.includes("markdown") && site.markdown) {
            output += site.markdown + "\n\n";
          } else if (formats.includes("text") && site.text) {
            output += site.text + "\n\n";
          } else if (formats.includes("html") && site.html) {
            output += site.html + "\n\n";
          } else if (formats.includes("json") && site.json) {
            output += site.json + "\n\n";
          }
        }
      } else {
        // Discovery-only mode: emit the URL list and crawl metadata as JSON.
        output = JSON.stringify(
          {
            urls: result.urls,
            metadata: result.metadata
          },
          null,
          2
        );
      }
      if (options.output) {
        writeFileSync(options.output, output.trim());
        if (options.verbose) {
          console.error(`Output written to ${options.output}`);
        }
      } else {
        console.log(output.trim());
      }
      if (options.verbose) {
        console.error(`
Summary:`);
        console.error(`  Discovered: ${result.urls.length} URLs`);
        console.error(`  Duration: ${result.metadata.totalDuration}ms`);
      }
    } catch (error) {
      // FIX: the original read error.message unconditionally, which prints
      // "Error: undefined" when a non-Error value (string, object) is thrown
      // or a promise rejects with one. Narrow before formatting.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Error: ${message}`);
      process.exit(1);
    } finally {
      // Only standalone mode owns a browser/core that must be shut down.
      // (This finally is skipped when catch already called process.exit.)
      // NOTE(review): process.exit(0) immediately after console.log can
      // truncate large piped stdout output that has not flushed — confirm
      // before relying on big crawl dumps through a pipe.
      if (standaloneClient) {
        await standaloneClient.close();
        process.exit(0);
      }
    }
  });
3045
// CLI entry point: parse process.argv and dispatch to the matching
// subcommand action registered above.
program.parse();
//# sourceMappingURL=index.js.map