@vakra-dev/reader 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1216 @@
1
+ import Hero from '@ulixee/hero';
2
+
3
+ /**
4
+ * Browser instance in the pool
5
+ */
6
+ interface BrowserInstance {
7
+ /** Hero instance */
8
+ hero: Hero;
9
+ /** Unique identifier */
10
+ id: string;
11
+ /** When the instance was created */
12
+ createdAt: number;
13
+ /** When the instance was last used */
14
+ lastUsed: number;
15
+ /** Number of requests handled */
16
+ requestCount: number;
17
+ /** Current status */
18
+ status: "idle" | "busy" | "recycling" | "unhealthy";
19
+ }
20
+ /**
21
+ * Pool configuration
22
+ */
23
+ interface PoolConfig {
24
+ /** Pool size (number of browser instances) */
25
+ size: number;
26
+ /** Retire browser after this many page loads */
27
+ retireAfterPageCount: number;
28
+ /** Retire browser after this age in milliseconds */
29
+ retireAfterAgeMs: number;
30
+ /** How often to check for recycling (ms) */
31
+ recycleCheckInterval: number;
32
+ /** How often to run health checks (ms) */
33
+ healthCheckInterval: number;
34
+ /** Max consecutive failures before marking unhealthy */
35
+ maxConsecutiveFailures: number;
36
+ /** Maximum queue size */
37
+ maxQueueSize: number;
38
+ /** Queue timeout in milliseconds */
39
+ queueTimeout: number;
40
+ }
41
+ /**
42
+ * Pool statistics
43
+ */
44
+ interface PoolStats {
45
+ /** Total instances */
46
+ total: number;
47
+ /** Available instances */
48
+ available: number;
49
+ /** Busy instances */
50
+ busy: number;
51
+ /** Recycling instances */
52
+ recycling: number;
53
+ /** Unhealthy instances */
54
+ unhealthy: number;
55
+ /** Queue length */
56
+ queueLength: number;
57
+ /** Total requests handled */
58
+ totalRequests: number;
59
+ /** Average request duration */
60
+ avgRequestDuration: number;
61
+ }
62
+ /**
63
+ * Health status
64
+ */
65
+ interface HealthStatus {
66
+ /** Overall health */
67
+ healthy: boolean;
68
+ /** Issues found */
69
+ issues: string[];
70
+ /** Stats snapshot */
71
+ stats: PoolStats;
72
+ }
73
+ /**
74
+ * Browser pool interface
75
+ */
76
+ interface IBrowserPool {
77
+ /** Initialize the pool */
78
+ initialize(): Promise<void>;
79
+ /** Shutdown the pool */
80
+ shutdown(): Promise<void>;
81
+ /** Acquire a browser instance */
82
+ acquire(): Promise<Hero>;
83
+ /** Release a browser instance back to the pool */
84
+ release(hero: Hero): void;
85
+ /** Execute callback with auto-managed browser */
86
+ withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T>;
87
+ /** Get pool statistics */
88
+ getStats(): PoolStats;
89
+ /** Run health check */
90
+ healthCheck?(): Promise<HealthStatus>;
91
+ }
92
+
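The IBrowserPool contract above supports both manual acquire/release and the callback-style withBrowser helper. A minimal sketch of both patterns, assuming a pool that implements this interface (the exported BrowserPool class does); the wrapper function is hypothetical:

```ts
import { BrowserPool } from '@vakra-dev/reader';

async function demo(): Promise<void> {
  const pool = new BrowserPool({ size: 2 });
  await pool.initialize();

  // Manual acquire/release: release in finally so the instance returns
  // to the pool even if the page work throws.
  const hero = await pool.acquire();
  try {
    await hero.goto('https://example.com');
  } finally {
    pool.release(hero);
  }

  // Callback style: acquire and release are handled by the pool.
  const title = await pool.withBrowser(async (h) => {
    await h.goto('https://example.com');
    return await h.document.title;
  });
  console.log(title);

  await pool.shutdown();
}
```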
93
+ /**
94
+ * Proxy configuration for Hero
95
+ */
96
+ interface ProxyConfig {
97
+ /** Full proxy URL (takes precedence over other fields) */
98
+ url?: string;
99
+ /** Proxy type */
100
+ type?: "datacenter" | "residential";
101
+ /** Proxy username */
102
+ username?: string;
103
+ /** Proxy password */
104
+ password?: string;
105
+ /** Proxy host */
106
+ host?: string;
107
+ /** Proxy port */
108
+ port?: number;
109
+ /** Country code for residential proxies (e.g., 'us', 'uk') */
110
+ country?: string;
111
+ }
112
+ /**
113
+ * Proxy metadata in scrape results
114
+ */
115
+ interface ProxyMetadata {
116
+ /** Proxy host that was used */
117
+ host: string;
118
+ /** Proxy port that was used */
119
+ port: number;
120
+ /** Country code if geo-targeting was used */
121
+ country?: string;
122
+ }
123
+ /**
124
+ * Browser pool configuration for ReaderClient
125
+ */
126
+ interface BrowserPoolConfig {
127
+ /** Number of browser instances (default: 2) */
128
+ size?: number;
129
+ /** Retire browser after this many page loads (default: 100) */
130
+ retireAfterPages?: number;
131
+ /** Retire browser after this many minutes (default: 30) */
132
+ retireAfterMinutes?: number;
133
+ /** Maximum pending requests in queue (default: 100) */
134
+ maxQueueSize?: number;
135
+ }
136
+ /**
137
+ * Main scraping options interface
138
+ */
139
+ interface ScrapeOptions {
140
+ /** Array of URLs to scrape */
141
+ urls: string[];
142
+ /** Output formats (default: ['markdown']) */
143
+ formats?: Array<"markdown" | "html" | "json" | "text">;
144
+ /** Include URL, title, timestamp (default: true) */
145
+ includeMetadata?: boolean;
146
+ /** Custom user agent string */
147
+ userAgent?: string;
148
+ /** Request timeout in milliseconds (default: 30000) */
149
+ timeoutMs?: number;
150
+ /** URL patterns to include (regex strings) */
151
+ includePatterns?: string[];
152
+ /** URL patterns to exclude (regex strings) */
153
+ excludePatterns?: string[];
154
+ /** Remove ads and tracking elements (default: true) */
155
+ removeAds?: boolean;
156
+ /** Remove base64-encoded images to reduce output size (default: true) */
157
+ removeBase64Images?: boolean;
158
+ /** Skip TLS/SSL certificate verification (default: true) */
159
+ skipTLSVerification?: boolean;
160
+ /** Number of URLs to process in parallel (default: 1 - sequential) */
161
+ batchConcurrency?: number;
162
+ /** Total timeout for the entire batch operation in milliseconds (default: 300000) */
163
+ batchTimeoutMs?: number;
164
+ /** Maximum retry attempts for failed URLs (default: 2) */
165
+ maxRetries?: number;
166
+ /** Progress callback for batch operations */
167
+ onProgress?: (progress: {
168
+ completed: number;
169
+ total: number;
170
+ currentUrl: string;
171
+ }) => void;
172
+ /** Proxy configuration for Hero */
173
+ proxy?: ProxyConfig;
174
+ /** CSS selector to wait for before considering page loaded */
175
+ waitForSelector?: string;
176
+ /** Enable verbose logging (default: false) */
177
+ verbose?: boolean;
178
+ /** Show Chrome window (default: false) */
179
+ showChrome?: boolean;
180
+ /** Connection to Hero Core (for shared Core usage) */
181
+ connectionToCore?: any;
182
+ /** Browser pool configuration (passed from ReaderClient) */
183
+ browserPool?: BrowserPoolConfig;
184
+ /** Browser pool instance (internal, provided by ReaderClient) */
185
+ pool?: IBrowserPool;
186
+ }
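A fully-specified options object for the interface above, as one illustrative sketch (the URLs, patterns, and numeric values are arbitrary examples, not recommended settings):

```ts
import type { ScrapeOptions } from '@vakra-dev/reader';

const options: ScrapeOptions = {
  urls: ['https://example.com', 'https://example.org/docs'],
  formats: ['markdown', 'text'],
  includeMetadata: true,
  timeoutMs: 30_000,
  excludePatterns: ['\\/login', '\\?page=\\d+'], // regex strings matched against URLs
  removeAds: true,
  removeBase64Images: true,
  batchConcurrency: 2,
  batchTimeoutMs: 300_000,
  maxRetries: 2,
  onProgress: ({ completed, total, currentUrl }) => {
    console.log(`[${completed}/${total}] ${currentUrl}`);
  },
};
```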
187
+ /**
188
+ * Website metadata extracted from the base page
189
+ */
190
+ interface WebsiteMetadata {
191
+ /** Basic meta tags */
192
+ title: string | null /** <title> or <meta property="og:title"> */;
193
+ description: string | null /** <meta name="description"> */;
194
+ author: string | null /** <meta name="author"> */;
195
+ language: string | null /** <html lang="..."> */;
196
+ charset: string | null /** <meta charset="..."> */;
197
+ /** Links */
198
+ favicon: string | null /** <link rel="icon"> */;
199
+ image: string | null /** <meta property="og:image"> */;
200
+ canonical: string | null /** <link rel="canonical"> */;
201
+ /** SEO */
202
+ keywords: string[] | null /** <meta name="keywords"> */;
203
+ robots: string | null /** <meta name="robots"> */;
204
+ /** Branding */
205
+ themeColor: string | null /** <meta name="theme-color"> */;
206
+ /** Open Graph */
207
+ openGraph: {
208
+ title: string | null /** <meta property="og:title"> */;
209
+ description: string | null /** <meta property="og:description"> */;
210
+ type: string | null /** <meta property="og:type"> */;
211
+ url: string | null /** <meta property="og:url"> */;
212
+ image: string | null /** <meta property="og:image"> */;
213
+ siteName: string | null /** <meta property="og:site_name"> */;
214
+ locale: string | null /** <meta property="og:locale"> */;
215
+ } | null;
216
+ /** Twitter Card */
217
+ twitter: {
218
+ card: string | null /** <meta name="twitter:card"> */;
219
+ site: string | null /** <meta name="twitter:site"> */;
220
+ creator: string | null /** <meta name="twitter:creator"> */;
221
+ title: string | null /** <meta name="twitter:title"> */;
222
+ description: string | null /** <meta name="twitter:description"> */;
223
+ image: string | null /** <meta name="twitter:image"> */;
224
+ } | null;
225
+ }
226
+ /**
227
+ * Individual page data
228
+ */
229
+ interface Page {
230
+ /** Full URL of the page */
231
+ url: string;
232
+ /** Page title */
233
+ title: string;
234
+ /** Markdown content */
235
+ markdown: string;
236
+ /** HTML content */
237
+ html: string;
238
+ /** When the page was fetched */
239
+ fetchedAt: string;
240
+ /** Crawl depth from base URL */
241
+ depth: number;
242
+ /** Whether a Cloudflare challenge was detected */
243
+ hadChallenge?: boolean;
244
+ /** Type of challenge encountered */
245
+ challengeType?: string;
246
+ /** Time spent waiting for challenge resolution (ms) */
247
+ waitTimeMs?: number;
248
+ }
249
+ /**
250
+ * Individual website scrape result (for backward compatibility)
251
+ */
252
+ interface WebsiteScrapeResult {
253
+ /** Markdown output (present if 'markdown' in formats) */
254
+ markdown?: string;
255
+ /** HTML output (present if 'html' in formats) */
256
+ html?: string;
257
+ /** JSON output (present if 'json' in formats) */
258
+ json?: string;
259
+ /** Plain text output (present if 'text' in formats) */
260
+ text?: string;
261
+ /** Metadata about the scraping operation */
262
+ metadata: {
263
+ /** Base URL that was scraped */
264
+ baseUrl: string;
265
+ /** Total number of pages scraped */
266
+ totalPages: number;
267
+ /** ISO timestamp when scraping started */
268
+ scrapedAt: string;
269
+ /** Duration in milliseconds */
270
+ duration: number;
271
+ /** Website metadata extracted from base page */
272
+ website: WebsiteMetadata;
273
+ /** Proxy used for this request (if proxy pooling was enabled) */
274
+ proxy?: ProxyMetadata;
275
+ };
276
+ }
277
+ /**
278
+ * Batch metadata for multi-URL operations
279
+ */
280
+ interface BatchMetadata {
281
+ /** Total number of URLs provided */
282
+ totalUrls: number;
283
+ /** Number of URLs successfully scraped */
284
+ successfulUrls: number;
285
+ /** Number of URLs that failed */
286
+ failedUrls: number;
287
+ /** ISO timestamp when the batch operation started */
288
+ scrapedAt: string;
289
+ /** Total duration for the entire batch in milliseconds */
290
+ totalDuration: number;
291
+ /** Array of errors for failed URLs */
292
+ errors?: Array<{
293
+ url: string;
294
+ error: string;
295
+ }>;
296
+ }
297
+ /**
298
+ * Main scrape result interface
299
+ */
300
+ interface ScrapeResult {
301
+ /** Array of individual website results */
302
+ data: WebsiteScrapeResult[];
303
+ /** Metadata about the batch operation */
304
+ batchMetadata: BatchMetadata;
305
+ }
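A sketch of consuming a ScrapeResult via the scrape() helper exported further down: per-URL outputs live in data[], while batch-level counts and failures live in batchMetadata. The handling logic here is hypothetical, not part of the package:

```ts
import { scrape } from '@vakra-dev/reader';

const result = await scrape({
  urls: ['https://example.com', 'https://example.org'],
  formats: ['markdown'],
});

for (const site of result.data) {
  console.log(site.metadata.baseUrl, `${site.metadata.totalPages} page(s)`);
  if (site.markdown) console.log(site.markdown.slice(0, 200));
}

const { successfulUrls, failedUrls, errors } = result.batchMetadata;
console.log(`ok=${successfulUrls} failed=${failedUrls}`);
for (const e of errors ?? []) {
  console.error(`failed: ${e.url} -> ${e.error}`);
}
```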
306
+ /**
307
+ * Default scrape options
308
+ */
309
+ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
310
+ proxy?: ProxyConfig;
311
+ waitForSelector?: string;
312
+ connectionToCore?: any;
313
+ userAgent?: string;
314
+ browserPool?: BrowserPoolConfig;
315
+ pool?: IBrowserPool;
316
+ };
317
+ /**
318
+ * Format type guard
319
+ */
320
+ declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
321
+ /**
322
+ * Check if a URL should be crawled based on base domain
323
+ */
324
+ declare function shouldCrawlUrl$1(url: URL, baseDomain: string): boolean;
325
+
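Because isValidFormat is a type guard, it can narrow untrusted strings (for example, a comma-separated CLI flag) to the format union before they reach ScrapeOptions.formats. A small sketch; the flag parsing is hypothetical:

```ts
import { isValidFormat } from '@vakra-dev/reader';

// e.g. a "--formats markdown,html,pdf" flag parsed from argv (hypothetical input)
const requested = 'markdown,html,pdf'.split(',');

const formats = requested.filter(isValidFormat); // ("markdown" | "html" | "json" | "text")[]
const rejected = requested.filter((f) => !isValidFormat(f));

if (rejected.length > 0) {
  console.warn(`Ignoring unsupported formats: ${rejected.join(', ')}`);
}
console.log(formats); // ['markdown', 'html']
```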
326
+ /**
327
+ * Crawl options interface
328
+ */
329
+ interface CrawlOptions {
330
+ /** Single seed URL to start crawling from */
331
+ url: string;
332
+ /** Maximum depth to crawl (default: 1) */
333
+ depth?: number;
334
+ /** Maximum pages to discover (default: 20) */
335
+ maxPages?: number;
336
+ /** Also scrape full content (default: false) */
337
+ scrape?: boolean;
338
+ /** Delay between requests in milliseconds (default: 1000) */
339
+ delayMs?: number;
340
+ /** Total timeout for the entire crawl operation in milliseconds */
341
+ timeoutMs?: number;
342
+ /** URL patterns to include (regex strings) - if set, only matching URLs are crawled */
343
+ includePatterns?: string[];
344
+ /** URL patterns to exclude (regex strings) - matching URLs are skipped */
345
+ excludePatterns?: string[];
346
+ /** Output formats for scraped content (default: ['markdown', 'html']) */
347
+ formats?: Array<"markdown" | "html" | "json" | "text">;
348
+ /** Number of URLs to scrape in parallel (default: 2) */
349
+ scrapeConcurrency?: number;
350
+ /** Remove ads and tracking elements (default: true) */
351
+ removeAds?: boolean;
352
+ /** Remove base64-encoded images to reduce output size (default: true) */
353
+ removeBase64Images?: boolean;
354
+ /** Proxy configuration for Hero */
355
+ proxy?: ProxyConfig;
356
+ /** Custom user agent string */
357
+ userAgent?: string;
358
+ /** Enable verbose logging (default: false) */
359
+ verbose?: boolean;
360
+ /** Show Chrome window (default: false) */
361
+ showChrome?: boolean;
362
+ /** Connection to Hero Core (for shared Core usage) */
363
+ connectionToCore?: any;
364
+ /** Browser pool instance (internal, provided by ReaderClient) */
365
+ pool?: IBrowserPool;
366
+ }
367
+ /**
368
+ * Crawl URL result interface
369
+ */
370
+ interface CrawlUrl {
371
+ /** URL of the page */
372
+ url: string;
373
+ /** Page title */
374
+ title: string;
375
+ /** Page description or null if not found */
376
+ description: string | null;
377
+ }
378
+ /**
379
+ * Crawl result interface
380
+ */
381
+ interface CrawlResult {
382
+ /** Array of discovered URLs with basic info */
383
+ urls: CrawlUrl[];
384
+ /** Full scrape results (only when scrape: true) */
385
+ scraped?: ScrapeResult;
386
+ /** Crawl operation metadata */
387
+ metadata: CrawlMetadata;
388
+ }
389
+ /**
390
+ * Crawl metadata interface
391
+ */
392
+ interface CrawlMetadata {
393
+ /** Total URLs discovered */
394
+ totalUrls: number;
395
+ /** Maximum depth reached */
396
+ maxDepth: number;
397
+ /** Total crawl duration in milliseconds */
398
+ totalDuration: number;
399
+ /** Seed URL that started the crawl */
400
+ seedUrl: string;
401
+ }
402
+
403
+ /**
404
+ * ReaderClient
405
+ *
406
+ * A client wrapper that manages HeroCore lifecycle and provides
407
+ * a simple interface for scraping and crawling.
408
+ *
409
+ * @example
410
+ * const reader = new ReaderClient();
411
+ *
412
+ * const result = await reader.scrape({
413
+ * urls: ['https://example.com'],
414
+ * formats: ['markdown'],
415
+ * });
416
+ *
417
+ * console.log(result.data[0].markdown);
418
+ *
419
+ * // When done (optional - auto-closes on process exit)
420
+ * await reader.close();
421
+ */
422
+
423
+ /**
424
+ * Proxy rotation strategy
425
+ */
426
+ type ProxyRotation = "round-robin" | "random";
427
+ /**
428
+ * Configuration options for ReaderClient
429
+ */
430
+ interface ReaderClientOptions {
431
+ /** Enable verbose logging (default: false) */
432
+ verbose?: boolean;
433
+ /** Show Chrome browser window (default: false) */
434
+ showChrome?: boolean;
435
+ /** Browser pool configuration */
436
+ browserPool?: BrowserPoolConfig;
437
+ /** List of proxies to rotate through */
438
+ proxies?: ProxyConfig[];
439
+ /** Proxy rotation strategy (default: "round-robin") */
440
+ proxyRotation?: ProxyRotation;
441
+ /** Skip TLS/SSL certificate verification (default: true) */
442
+ skipTLSVerification?: boolean;
443
+ }
444
+ /**
445
+ * ReaderClient manages the HeroCore lifecycle and provides
446
+ * scrape/crawl methods with automatic initialization.
447
+ */
448
+ declare class ReaderClient {
449
+ private heroCore;
450
+ private pool;
451
+ private initialized;
452
+ private initializing;
453
+ private closed;
454
+ private options;
455
+ private proxyIndex;
456
+ private cleanupHandler;
457
+ constructor(options?: ReaderClientOptions);
458
+ /**
459
+ * Get the next proxy from the rotation pool
460
+ */
461
+ private getNextProxy;
462
+ /**
463
+ * Initialize HeroCore. Called automatically on first scrape/crawl.
464
+ * Can be called explicitly if you want to pre-warm the client.
465
+ */
466
+ start(): Promise<void>;
467
+ /**
468
+ * Internal initialization logic
469
+ */
470
+ private initializeCore;
471
+ /**
472
+ * Create a connection to the HeroCore instance
473
+ */
474
+ private createConnection;
475
+ /**
476
+ * Ensure client is initialized before operation
477
+ */
478
+ private ensureInitialized;
479
+ /**
480
+ * Scrape one or more URLs
481
+ *
482
+ * @param options - Scrape options (urls, formats, etc.)
483
+ * @returns Scrape result with data and metadata
484
+ *
485
+ * @example
486
+ * const result = await reader.scrape({
487
+ * urls: ['https://example.com'],
488
+ * formats: ['markdown', 'html'],
489
+ * });
490
+ */
491
+ scrape(options: Omit<ScrapeOptions, "connectionToCore" | "pool">): Promise<ScrapeResult>;
492
+ /**
493
+ * Crawl a website to discover URLs
494
+ *
495
+ * @param options - Crawl options (url, depth, maxPages, etc.)
496
+ * @returns Crawl result with discovered URLs and optional scraped content
497
+ *
498
+ * @example
499
+ * const result = await reader.crawl({
500
+ * url: 'https://example.com',
501
+ * depth: 2,
502
+ * maxPages: 50,
503
+ * scrape: true,
504
+ * });
505
+ */
506
+ crawl(options: Omit<CrawlOptions, "connectionToCore" | "pool">): Promise<CrawlResult>;
507
+ /**
508
+ * Check if the client is initialized and ready
509
+ */
510
+ isReady(): boolean;
511
+ /**
512
+ * Close the client and release resources
513
+ *
514
+ * Note: This is optional - the client will auto-close on process exit.
515
+ */
516
+ close(): Promise<void>;
517
+ /**
518
+ * Register cleanup handlers for process exit
519
+ */
520
+ private registerCleanup;
521
+ /**
522
+ * Remove process cleanup handlers
523
+ */
524
+ private removeCleanupHandlers;
525
+ }
526
+
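Putting ReaderClientOptions and the methods above together: a sketch that pre-warms the client, rotates between two proxies, and always releases resources. The proxy hosts and credentials are placeholders:

```ts
import { ReaderClient } from '@vakra-dev/reader';

const reader = new ReaderClient({
  verbose: false,
  browserPool: { size: 3, retireAfterPages: 100 },
  proxies: [
    { type: 'datacenter', host: 'proxy-1.example.com', port: 8080, username: 'user', password: 'pass' },
    { type: 'datacenter', host: 'proxy-2.example.com', port: 8080, username: 'user', password: 'pass' },
  ],
  proxyRotation: 'round-robin',
});

try {
  await reader.start(); // optional pre-warm; scrape()/crawl() also initialize lazily
  const result = await reader.scrape({
    urls: ['https://example.com'],
    formats: ['markdown'],
  });
  console.log(result.data[0]?.markdown?.slice(0, 200));
} finally {
  await reader.close(); // optional; the client also cleans up on process exit
}
```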
527
+ /**
528
+ * Scraper class with built-in concurrency support
529
+ *
530
+ * Features:
531
+ * - Hero-based browser automation
532
+ * - Automatic Cloudflare challenge detection and bypass
533
+ * - Built-in concurrency via browser pool
534
+ * - Progress tracking
535
+ * - Error handling per URL
536
+ *
537
+ * @example
538
+ * const scraper = new Scraper({
539
+ * urls: ['https://example.com', 'https://example.org'],
540
+ * formats: ['markdown', 'html'],
541
+ * batchConcurrency: 2,
542
+ * proxy: { type: 'residential', ... }
543
+ * });
544
+ *
545
+ * const result = await scraper.scrape();
546
+ * console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);
547
+ */
548
+ declare class Scraper {
549
+ private options;
550
+ private pool;
551
+ private logger;
552
+ private robotsCache;
553
+ constructor(options: ScrapeOptions);
554
+ /**
555
+ * Get robots.txt rules for a URL, cached per domain
556
+ */
557
+ private getRobotsRules;
558
+ /**
559
+ * Scrape all URLs
560
+ *
561
+ * @returns Scrape result with pages and metadata
562
+ */
563
+ scrape(): Promise<ScrapeResult>;
564
+ /**
565
+ * Scrape URLs with concurrency control
566
+ */
567
+ private scrapeWithConcurrency;
568
+ /**
569
+ * Scrape a single URL with retry logic
570
+ */
571
+ private scrapeSingleUrlWithRetry;
572
+ /**
573
+ * Wait for the final page to load after any Cloudflare redirects
574
+ * Cloudflare often performs silent redirects even when the challenge is bypassed, so we need to ensure

575
+ * we're on the actual content page before scraping.
576
+ */
577
+ private waitForFinalPage;
578
+ /**
579
+ * Scrape a single URL
580
+ */
581
+ private scrapeSingleUrl;
582
+ /**
583
+ * Build final scrape result
584
+ */
585
+ private buildScrapeResult;
586
+ }
587
+ /**
588
+ * Convenience function to scrape URLs
589
+ *
590
+ * @param options - Scrape options
591
+ * @returns Scrape result
592
+ *
593
+ * @example
594
+ * const result = await scrape({
595
+ * urls: ['https://example.com'],
596
+ * formats: ['markdown']
597
+ * });
598
+ */
599
+ declare function scrape(options: ScrapeOptions): Promise<ScrapeResult>;
600
+
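The same batch controls apply whether you use the Scraper class or the scrape() helper. A sketch with the class, showing concurrency, retries, and progress reporting (the specific values are arbitrary):

```ts
import { Scraper } from '@vakra-dev/reader';

const scraper = new Scraper({
  urls: ['https://example.com', 'https://example.org', 'https://example.net'],
  formats: ['markdown'],
  batchConcurrency: 2, // process two URLs at a time
  maxRetries: 2,       // retry failed URLs up to twice
  onProgress: ({ completed, total, currentUrl }) =>
    console.log(`progress ${completed}/${total}: ${currentUrl}`),
});

const result = await scraper.scrape();
console.log(
  `Scraped ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls} URLs`,
);
```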
601
+ /**
602
+ * Crawler class for discovering and optionally scraping pages
603
+ *
604
+ * Features:
605
+ * - BFS/DFS crawling with depth control
606
+ * - Automatic Cloudflare challenge handling
607
+ * - Link extraction and filtering
608
+ * - Optional full content scraping
609
+ * - URL deduplication
610
+ *
611
+ * @example
612
+ * const crawler = new Crawler({
613
+ * url: 'https://example.com',
614
+ * depth: 2,
615
+ * maxPages: 20,
616
+ * scrape: true
617
+ * });
618
+ *
619
+ * const result = await crawler.crawl();
620
+ * console.log(`Discovered ${result.urls.length} URLs`);
621
+ */
622
+ declare class Crawler {
623
+ private options;
624
+ private visited;
625
+ private queue;
626
+ private urls;
627
+ private pool;
628
+ private logger;
629
+ private robotsRules;
630
+ constructor(options: CrawlOptions);
631
+ /**
632
+ * Start crawling
633
+ */
634
+ crawl(): Promise<CrawlResult>;
635
+ /**
636
+ * Fetch a single page and extract basic info
637
+ */
638
+ private fetchPage;
639
+ /**
640
+ * Extract links from HTML content using DOM parsing
641
+ * Handles all href formats (single quotes, double quotes, unquoted)
642
+ */
643
+ private extractLinks;
644
+ /**
645
+ * Scrape all discovered URLs
646
+ */
647
+ private scrapeDiscoveredUrls;
648
+ }
649
+ /**
650
+ * Convenience function to crawl a website
651
+ *
652
+ * @param options - Crawl options
653
+ * @returns Crawl result
654
+ *
655
+ * @example
656
+ * const result = await crawl({
657
+ * url: 'https://example.com',
658
+ * depth: 2,
659
+ * maxPages: 20,
660
+ * scrape: true
661
+ * });
662
+ */
663
+ declare function crawl(options: CrawlOptions): Promise<CrawlResult>;
664
+
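One pattern the two convenience functions support is crawl-first, scrape-later: discover URLs shallowly, filter them, then scrape only the subset you care about. A hedged sketch; the `/docs/` filter predicate is this example's choice:

```ts
import { crawl, scrape } from '@vakra-dev/reader';

// Discover URLs without scraping full content (scrape defaults to false).
const discovered = await crawl({
  url: 'https://example.com',
  depth: 2,
  maxPages: 50,
});

// Keep only documentation pages, then scrape those.
const docUrls = discovered.urls
  .map((u) => u.url)
  .filter((u) => u.includes('/docs/'));

if (docUrls.length > 0) {
  const result = await scrape({ urls: docUrls, formats: ['markdown'] });
  console.log(`Scraped ${result.batchMetadata.successfulUrls} doc pages`);
}
```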
665
+ /**
666
+ * Daemon Server
667
+ *
668
+ * An HTTP server that wraps ReaderClient, allowing multiple CLI
669
+ * commands to share a single browser pool for efficient scraping.
670
+ *
671
+ * @example
672
+ * // Start daemon
673
+ * const daemon = new DaemonServer({ port: 3847, poolSize: 5 });
674
+ * await daemon.start();
675
+ *
676
+ * // Stop daemon
677
+ * await daemon.stop();
678
+ */
679
+ declare const DEFAULT_DAEMON_PORT = 3847;
680
+ /**
681
+ * Daemon server configuration
682
+ */
683
+ interface DaemonServerOptions {
684
+ /** Port to listen on (default: 3847) */
685
+ port?: number;
686
+ /** Browser pool size (default: 5) */
687
+ poolSize?: number;
688
+ /** Enable verbose logging (default: false) */
689
+ verbose?: boolean;
690
+ /** Show Chrome browser windows (default: false) */
691
+ showChrome?: boolean;
692
+ }
693
+ /**
694
+ * Status response data
695
+ */
696
+ interface DaemonStatus {
697
+ running: true;
698
+ port: number;
699
+ poolSize: number;
700
+ uptime: number;
701
+ pid: number;
702
+ }
703
+ /**
704
+ * Daemon Server
705
+ */
706
+ declare class DaemonServer {
707
+ private server;
708
+ private client;
709
+ private options;
710
+ private startTime;
711
+ constructor(options?: DaemonServerOptions);
712
+ /**
713
+ * Start the daemon server
714
+ */
715
+ start(): Promise<void>;
716
+ /**
717
+ * Stop the daemon server
718
+ */
719
+ stop(): Promise<void>;
720
+ /**
721
+ * Get the port the daemon is running on
722
+ */
723
+ getPort(): number;
724
+ /**
725
+ * Handle incoming HTTP requests
726
+ */
727
+ private handleRequest;
728
+ /**
729
+ * Handle scrape request
730
+ */
731
+ private handleScrape;
732
+ /**
733
+ * Handle crawl request
734
+ */
735
+ private handleCrawl;
736
+ /**
737
+ * Handle status request
738
+ */
739
+ private handleStatus;
740
+ /**
741
+ * Handle shutdown request
742
+ */
743
+ private handleShutdown;
744
+ /**
745
+ * Send JSON response
746
+ */
747
+ private sendResponse;
748
+ /**
749
+ * Write PID file
750
+ */
751
+ private writePidFile;
752
+ /**
753
+ * Remove PID file
754
+ */
755
+ private removePidFile;
756
+ }
757
+ /**
758
+ * Get path to PID file
759
+ */
760
+ declare function getPidFilePath(): Promise<string>;
761
+ /**
762
+ * Check if daemon is running by reading PID file
763
+ */
764
+ declare function getDaemonInfo(): Promise<{
765
+ pid: number;
766
+ port: number;
767
+ startedAt: string;
768
+ } | null>;
769
+
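A sketch of running the daemon and inspecting its PID file afterwards, using the defaults documented above:

```ts
import { DaemonServer, getDaemonInfo, DEFAULT_DAEMON_PORT } from '@vakra-dev/reader';

const daemon = new DaemonServer({ port: DEFAULT_DAEMON_PORT, poolSize: 5 });
await daemon.start();
console.log(`daemon listening on port ${daemon.getPort()}`);

// The PID file lets other processes discover the running daemon.
const info = await getDaemonInfo();
if (info) {
  console.log(`pid=${info.pid} port=${info.port} startedAt=${info.startedAt}`);
}

// Later, e.g. on SIGINT:
await daemon.stop();
```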
770
+ /**
771
+ * Daemon Client
772
+ *
773
+ * A client that connects to the daemon server via HTTP.
774
+ * Used by CLI commands when a daemon is running.
775
+ *
776
+ * @example
777
+ * const client = new DaemonClient({ port: 3847 });
778
+ *
779
+ * const result = await client.scrape({
780
+ * urls: ['https://example.com'],
781
+ * formats: ['markdown'],
782
+ * });
783
+ */
784
+
785
+ /**
786
+ * Daemon client configuration
787
+ */
788
+ interface DaemonClientOptions {
789
+ /** Port the daemon is running on (default: 3847) */
790
+ port?: number;
791
+ /** Request timeout in milliseconds (default: 600000 = 10 minutes) */
792
+ timeoutMs?: number;
793
+ }
794
+ /**
795
+ * Daemon Client
796
+ */
797
+ declare class DaemonClient {
798
+ private options;
799
+ constructor(options?: DaemonClientOptions);
800
+ /**
801
+ * Scrape URLs via daemon
802
+ */
803
+ scrape(options: Omit<ScrapeOptions, "connectionToCore">): Promise<ScrapeResult>;
804
+ /**
805
+ * Crawl URL via daemon
806
+ */
807
+ crawl(options: Omit<CrawlOptions, "connectionToCore">): Promise<CrawlResult>;
808
+ /**
809
+ * Get daemon status
810
+ */
811
+ status(): Promise<DaemonStatus>;
812
+ /**
813
+ * Request daemon shutdown
814
+ */
815
+ shutdown(): Promise<void>;
816
+ /**
817
+ * Check if daemon is reachable
818
+ */
819
+ isRunning(): Promise<boolean>;
820
+ /**
821
+ * Make HTTP request to daemon
822
+ */
823
+ private request;
824
+ }
825
+ /**
826
+ * Check if daemon is running on the specified port
827
+ */
828
+ declare function isDaemonRunning(port?: number): Promise<boolean>;
829
+
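A sketch of the client side: prefer the shared daemon when one is reachable, otherwise fall back to an in-process ReaderClient. The fallback strategy and the wrapper function are this example's choices, not mandated by the package:

```ts
import { DaemonClient, ReaderClient, isDaemonRunning } from '@vakra-dev/reader';
import type { ScrapeResult } from '@vakra-dev/reader';

async function scrapeViaBestTransport(urls: string[]): Promise<ScrapeResult> {
  if (await isDaemonRunning()) {
    // Reuse the daemon's browser pool over HTTP.
    const client = new DaemonClient();
    return client.scrape({ urls, formats: ['markdown'] });
  }
  // No daemon: spin up a local client for this call only.
  const reader = new ReaderClient();
  try {
    return await reader.scrape({ urls, formats: ['markdown'] });
  } finally {
    await reader.close();
  }
}

console.log((await scrapeViaBestTransport(['https://example.com'])).batchMetadata);
```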
830
+ /**
831
+ * Convert pages to consolidated Markdown format
832
+ */
833
+ declare function formatToMarkdown(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
834
+
835
+ /**
836
+ * Convert pages to HTML format with metadata
837
+ */
838
+ declare function formatToHTML(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
839
+
840
+ /**
841
+ * Convert pages to JSON format with metadata
842
+ */
843
+ declare function formatToJson(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
844
+ /**
845
+ * Convert pages to JSON format without HTML (lighter version)
846
+ */
847
+ declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
848
+
849
+ /**
850
+ * Convert pages to plain text format
851
+ *
852
+ * Strips all HTML tags and formatting, preserving only readable text content.
853
+ * Useful for LLM consumption where markdown formatting is not needed.
854
+ */
855
+ declare function formatToText(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;
856
+
857
+ /**
858
+ * Extract comprehensive website metadata from HTML content
859
+ * Uses proper DOM parsing for reliable attribute extraction
860
+ */
861
+ declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata;
862
+
863
+ /**
864
+ * HTML content cleaning utilities using DOM parsing
865
+ */
866
+ /**
867
+ * Content cleaning options
868
+ */
869
+ interface CleaningOptions {
870
+ /** Remove ads and tracking elements (default: true) */
871
+ removeAds?: boolean;
872
+ /** Remove base64-encoded images (default: true) */
873
+ removeBase64Images?: boolean;
874
+ }
875
+ /**
876
+ * Clean HTML content (alias for cleanHtml with options)
877
+ */
878
+ declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;
879
+
880
+ /**
881
+ * URL validation and normalization utilities
882
+ */
883
+ /**
884
+ * Resolve a relative URL against a base URL
885
+ */
886
+ declare function resolveUrl(relative: string, base: string): string;
887
+ /**
888
+ * Validate if a string is a valid URL
889
+ */
890
+ declare function isValidUrl(string: string): boolean;
891
+ /**
892
+ * Check if a URL belongs to the same domain as the base URL
893
+ * Supports subdomains: blog.example.com matches example.com
894
+ */
895
+ declare function isSameDomain(url: string, baseUrl: string): boolean;
896
+ /**
897
+ * Generate a URL key for deduplication
898
+ */
899
+ declare function getUrlKey(url: string): string;
900
+ /**
901
+ * Validate an array of URLs and return validation results
902
+ */
903
+ declare function validateUrls(urls: string[]): {
904
+ isValid: boolean;
905
+ validUrls: string[];
906
+ errors: Array<{
907
+ url: string;
908
+ error: string;
909
+ }>;
910
+ };
911
+ /**
912
+ * Check if a URL should be crawled based on various criteria
913
+ */
914
+ declare function shouldCrawlUrl(url: string, baseUrl: string, maxDepth: number, currentDepth: number, visited: Set<string>): boolean;
915
+
916
+ /**
917
+ * Simple rate limit function
918
+ */
919
+ declare function rateLimit(ms: number): Promise<void>;
920
+
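These helpers compose naturally as a pre-flight step before a batch run: validate user-supplied URLs, deduplicate by key, check domain scope, and pace requests. A hypothetical sketch; how aggressively getUrlKey normalizes URLs is an assumption here:

```ts
import { validateUrls, getUrlKey, isSameDomain, rateLimit } from '@vakra-dev/reader';

const input = [
  'https://example.com/a',
  'https://example.com/a#section', // may normalize to the same key as the first
  'not-a-url',
  'https://blog.example.com/post',
];

const { validUrls, errors } = validateUrls(input);
errors.forEach((e) => console.warn(`skipping ${e.url}: ${e.error}`));

// Deduplicate by URL key.
const unique = [...new Map(validUrls.map((u) => [getUrlKey(u), u])).values()];

// Keep only URLs on (or under) the base domain; subdomains match.
const inScope = unique.filter((u) => isSameDomain(u, 'https://example.com'));

for (const url of inScope) {
  console.log('would scrape', url);
  await rateLimit(1000); // simple politeness delay between requests
}
```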
921
+ /**
922
+ * Browser Pool
923
+ *
924
+ * Manages a pool of Hero browser instances with:
925
+ * - Auto-recycling based on age/request count
926
+ * - Request queuing when pool is full
927
+ * - Health monitoring
928
+ *
929
+ * @example
930
+ * const pool = new BrowserPool({ size: 5 });
931
+ * await pool.initialize();
932
+ *
933
+ * // Use withBrowser for automatic acquire/release
934
+ * await pool.withBrowser(async (hero) => {
935
+ * await hero.goto('https://example.com');
936
+ * const title = await hero.document.title;
937
+ * return title;
938
+ * });
939
+ *
940
+ * await pool.shutdown();
941
+ */
942
+ declare class BrowserPool implements IBrowserPool {
943
+ private instances;
944
+ private available;
945
+ private inUse;
946
+ private queue;
947
+ private config;
948
+ private proxy?;
949
+ private recycleTimer?;
950
+ private healthTimer?;
951
+ private totalRequests;
952
+ private totalRequestDuration;
953
+ private showChrome;
954
+ private connectionToCore?;
955
+ private userAgent?;
956
+ private verbose;
957
+ private logger;
958
+ constructor(config?: Partial<PoolConfig>, proxy?: ProxyConfig, showChrome?: boolean, connectionToCore?: any, userAgent?: string, verbose?: boolean);
959
+ /**
960
+ * Initialize the pool by pre-launching browsers
961
+ */
962
+ initialize(): Promise<void>;
963
+ /**
964
+ * Shutdown the pool and close all browsers
965
+ */
966
+ shutdown(): Promise<void>;
967
+ /**
968
+ * Acquire a browser from the pool
969
+ */
970
+ acquire(): Promise<Hero>;
971
+ /**
972
+ * Release a browser back to the pool
973
+ */
974
+ release(hero: Hero): void;
975
+ /**
976
+ * Execute callback with auto-managed browser
977
+ */
978
+ withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T>;
979
+ /**
980
+ * Get pool statistics
981
+ */
982
+ getStats(): PoolStats;
983
+ /**
984
+ * Run health check
985
+ */
986
+ healthCheck(): Promise<HealthStatus>;
987
+ /**
988
+ * Create a new browser instance
989
+ */
990
+ private createInstance;
991
+ /**
992
+ * Check if instance should be recycled
993
+ */
994
+ private shouldRecycle;
995
+ /**
996
+ * Recycle an instance (close old, create new)
997
+ */
998
+ private recycleInstance;
999
+ /**
1000
+ * Queue a request when no browsers available
1001
+ */
1002
+ private queueRequest;
1003
+ /**
1004
+ * Process queued requests
1005
+ */
1006
+ private processQueue;
1007
+ /**
1008
+ * Start background recycling task
1009
+ */
1010
+ private startRecycling;
1011
+ /**
1012
+ * Start background health checks
1013
+ */
1014
+ private startHealthChecks;
1015
+ }
1016
+
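The class comment above shows acquire/release; getStats() and healthCheck() additionally support monitoring. A sketch of a periodic check (the interval, thresholds, and log format are this example's choices):

```ts
import { BrowserPool } from '@vakra-dev/reader';

const pool = new BrowserPool({ size: 4, maxQueueSize: 50 });
await pool.initialize();

const monitor = setInterval(async () => {
  const health = await pool.healthCheck();
  const s = health.stats;
  console.log(
    `pool: ${s.available} idle / ${s.busy} busy / queue ${s.queueLength}, ` +
      `avg request ${Math.round(s.avgRequestDuration)}ms`,
  );
  if (!health.healthy) {
    console.warn(`pool unhealthy: ${health.issues.join('; ')}`);
  }
}, 30_000);

// ... use pool.withBrowser(...) as usual ...

clearInterval(monitor);
await pool.shutdown();
```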
1017
+ /**
1018
+ * Hero configuration options
1019
+ */
1020
+ interface HeroConfigOptions {
1021
+ /** Proxy configuration */
1022
+ proxy?: ProxyConfig;
1023
+ /** Show Chrome window (default: false) */
1024
+ showChrome?: boolean;
1025
+ /** Custom user agent */
1026
+ userAgent?: string;
1027
+ /** Connection to Core (for in-process Core) */
1028
+ connectionToCore?: any;
1029
+ }
1030
+ /**
1031
+ * Create Hero configuration with optimal anti-bot bypass settings
1032
+ *
1033
+ * Extracted from the proven hero-test implementation.
1034
+ * Includes:
1035
+ * - TLS fingerprint emulation (disableMitm: false)
1036
+ * - DNS over TLS (mimics Chrome)
1037
+ * - WebRTC IP masking
1038
+ * - Proper locale and timezone
1039
+ *
1040
+ * @param options - Configuration options
1041
+ * @returns Hero configuration object
1042
+ */
1043
+ declare function createHeroConfig(options?: HeroConfigOptions): any;
1044
+
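createHeroConfig returns a plain options object (typed as any above), so one straightforward use is passing it to the Hero constructor directly. A hedged sketch; the proxy host and credentials are placeholders taken from the proxy examples further down:

```ts
import Hero from '@ulixee/hero';
import { createHeroConfig } from '@vakra-dev/reader';

const config = createHeroConfig({
  showChrome: false,
  proxy: {
    type: 'residential',
    host: 'geo.iproyal.com',
    port: 12321,
    username: 'customer-abc',
    password: 'secret',
    country: 'us',
  },
});

const hero = new Hero(config);
try {
  await hero.goto('https://example.com');
  console.log(await hero.document.title);
} finally {
  await hero.close();
}
```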
1045
+ /**
1046
+ * Cloudflare challenge detection result
1047
+ */
1048
+ interface ChallengeDetection {
1049
+ /** Whether a challenge was detected */
1050
+ isChallenge: boolean;
1051
+ /** Type of challenge */
1052
+ type: "js_challenge" | "turnstile" | "captcha" | "blocked" | "none";
1053
+ /** Confidence level (0-100) */
1054
+ confidence: number;
1055
+ /** Detection signals found */
1056
+ signals: string[];
1057
+ }
1058
+ /**
1059
+ * Challenge resolution result
1060
+ */
1061
+ interface ChallengeResolutionResult {
1062
+ /** Whether the challenge was resolved */
1063
+ resolved: boolean;
1064
+ /** Method used to detect resolution */
1065
+ method: "url_redirect" | "signals_cleared" | "timeout";
1066
+ /** Time waited in milliseconds */
1067
+ waitedMs: number;
1068
+ }
1069
+ /**
1070
+ * Challenge waiting options
1071
+ */
1072
+ interface ChallengeWaitOptions {
1073
+ /** Maximum time to wait for resolution (default: 45000ms) */
1074
+ maxWaitMs?: number;
1075
+ /** How often to poll for resolution (default: 500ms) */
1076
+ pollIntervalMs?: number;
1077
+ /** Enable verbose logging */
1078
+ verbose?: boolean;
1079
+ /** Initial URL before challenge */
1080
+ initialUrl: string;
1081
+ }
1082
+
1083
+ /**
1084
+ * Detect if current page is a Cloudflare challenge
1085
+ *
1086
+ * Uses a multi-signal approach with only challenge-specific indicators,
1087
+ * avoiding content-length heuristics that could cause false positives.
1088
+ *
1089
+ * @param hero - Hero instance with loaded page
1090
+ * @returns Detection result with confidence score and signals
1091
+ *
1092
+ * @example
1093
+ * const detection = await detectChallenge(hero);
1094
+ * if (detection.isChallenge) {
1095
+ * console.log(`Challenge detected: ${detection.type}`);
1096
+ * console.log(`Signals: ${detection.signals.join(', ')}`);
1097
+ * }
1098
+ */
1099
+ declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
1100
+ /**
1101
+ * Quick check - just returns boolean
1102
+ *
1103
+ * @param hero - Hero instance
1104
+ * @returns True if challenge page detected
1105
+ */
1106
+ declare function isChallengePage(hero: Hero): Promise<boolean>;
1107
+
1108
+ /**
1109
+ * Wait for Cloudflare challenge to resolve
1110
+ *
1111
+ * Uses multiple detection strategies:
1112
+ * 1. URL redirect detection (page redirects after challenge)
1113
+ * 2. Signal polling (challenge-specific elements/text disappear)
1114
+ *
1115
+ * @param hero - Hero instance with challenge page loaded
1116
+ * @param options - Waiting options
1117
+ * @returns Resolution result with method and time waited
1118
+ *
1119
+ * @example
1120
+ * const result = await waitForChallengeResolution(hero, {
1121
+ * maxWaitMs: 45000,
1122
+ * pollIntervalMs: 500,
1123
+ * verbose: true,
1124
+ * initialUrl: 'https://example.com'
1125
+ * });
1126
+ *
1127
+ * if (result.resolved) {
1128
+ * console.log(`Challenge resolved via ${result.method} in ${result.waitedMs}ms`);
1129
+ * }
1130
+ */
1131
+ declare function waitForChallengeResolution(hero: Hero, options: ChallengeWaitOptions): Promise<ChallengeResolutionResult>;
1132
+ /**
1133
+ * Wait for a specific CSS selector to appear
1134
+ *
1135
+ * Useful when you know exactly which element should appear after the challenge.
1136
+ *
1137
+ * @param hero - Hero instance
1138
+ * @param selector - CSS selector to wait for
1139
+ * @param maxWaitMs - Maximum time to wait
1140
+ * @param verbose - Enable logging
1141
+ * @returns Whether selector was found and time waited
1142
+ *
1143
+ * @example
1144
+ * const result = await waitForSelector(hero, '.content', 30000, true);
1145
+ * if (result.found) {
1146
+ * console.log(`Content appeared after ${result.waitedMs}ms`);
1147
+ * }
1148
+ */
1149
+ declare function waitForSelector(hero: Hero, selector: string, maxWaitMs: number, verbose?: boolean): Promise<{
1150
+ found: boolean;
1151
+ waitedMs: number;
1152
+ }>;
1153
+ /**
1154
+ * Handle Cloudflare challenge with automatic detection and waiting
1155
+ *
1156
+ * High-level function that combines detection and resolution.
1157
+ *
1158
+ * @param hero - Hero instance
1159
+ * @param options - Wait options (without initialUrl)
1160
+ * @returns Resolution result
1161
+ *
1162
+ * @example
1163
+ * await hero.goto('https://example.com');
1164
+ * const result = await handleChallenge(hero, { verbose: true });
1165
+ * if (result.resolved) {
1166
+ * // Challenge passed, continue scraping
1167
+ * }
1168
+ */
1169
+ declare function handleChallenge(hero: Hero, options?: Omit<ChallengeWaitOptions, "initialUrl">): Promise<ChallengeResolutionResult>;
1170
+
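The detection and waiting helpers above are designed to be chained after navigation: goto, handle the challenge, then optionally wait for a known selector. A sketch; the 'main article' selector is an assumption about the target site:

```ts
import Hero from '@ulixee/hero';
import { handleChallenge, waitForSelector } from '@vakra-dev/reader';

const hero = new Hero();
try {
  await hero.goto('https://example.com');

  // Detect a Cloudflare interstitial and wait (up to 45s) for it to clear.
  const challenge = await handleChallenge(hero, { maxWaitMs: 45_000, verbose: true });
  if (!challenge.resolved) {
    console.warn(`challenge not resolved after ${challenge.waitedMs}ms (method: ${challenge.method})`);
  }

  // Optionally wait for a site-specific element before extracting content.
  const { found, waitedMs } = await waitForSelector(hero, 'main article', 30_000);
  if (found) console.log(`content ready after ${waitedMs}ms`);
} finally {
  await hero.close();
}
```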
1171
+ /**
1172
+ * Create proxy URL from configuration
1173
+ *
1174
+ * Supports both datacenter and residential proxies.
1175
+ * For residential proxies (e.g., IPRoyal), generates a sticky session ID.
1176
+ *
1177
+ * @param config - Proxy configuration
1178
+ * @returns Formatted proxy URL
1179
+ *
1180
+ * @example
1181
+ * // Datacenter proxy
1182
+ * createProxyUrl({
1183
+ * type: 'datacenter',
1184
+ * username: 'user',
1185
+ * password: 'pass',
1186
+ * host: 'proxy.example.com',
1187
+ * port: 8080
1188
+ * })
1189
+ * // Returns: "http://user:pass@proxy.example.com:8080"
1190
+ *
1191
+ * @example
1192
+ * // Residential proxy with sticky session
1193
+ * createProxyUrl({
1194
+ * type: 'residential',
1195
+ * username: 'customer-abc',
1196
+ * password: 'secret',
1197
+ * host: 'geo.iproyal.com',
1198
+ * port: 12321,
1199
+ * country: 'us'
1200
+ * })
1201
+ * // Returns: "http://customer-abc_session-hero_123_abc456_country-us:secret@geo.iproyal.com:12321"
1202
+ */
1203
+ declare function createProxyUrl(config: ProxyConfig): string;
1204
+ /**
1205
+ * Parse proxy URL into ProxyConfig
1206
+ *
1207
+ * @param url - Proxy URL string
1208
+ * @returns Parsed proxy configuration
1209
+ *
1210
+ * @example
1211
+ * parseProxyUrl("http://user:pass@proxy.example.com:8080")
1212
+ * // Returns: { username: 'user', password: 'pass', host: 'proxy.example.com', port: 8080 }
1213
+ */
1214
+ declare function parseProxyUrl(url: string): ProxyConfig;
1215
+
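Per the examples above, createProxyUrl and parseProxyUrl are rough inverses for the datacenter case (the residential form adds a generated session segment, so it does not round-trip exactly). A small sketch using the documented datacenter example:

```ts
import { createProxyUrl, parseProxyUrl } from '@vakra-dev/reader';
import type { ProxyConfig } from '@vakra-dev/reader';

const datacenter: ProxyConfig = {
  type: 'datacenter',
  username: 'user',
  password: 'pass',
  host: 'proxy.example.com',
  port: 8080,
};

const url = createProxyUrl(datacenter);
console.log(url); // "http://user:pass@proxy.example.com:8080"

const parsed = parseProxyUrl(url);
console.log(parsed.host, parsed.port); // "proxy.example.com" 8080

// Note: a full URL in ProxyConfig.url takes precedence over the individual fields.
```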
1216
+ export { type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, type ScrapeOptions, type ScrapeResult, Scraper, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToJson, formatToJsonLite, formatToMarkdown, formatToText, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector };