@vakra-dev/reader 0.0.3 → 0.1.1

This diff shows the changes between package versions as published to one of the supported public registries. It is provided for informational purposes only and reflects the content of those versions exactly as they appear in their respective registries.
package/dist/index.d.ts CHANGED
@@ -90,6 +90,20 @@ interface IBrowserPool {
90
90
  healthCheck?(): Promise<HealthStatus>;
91
91
  }
92
92
 
93
+ /**
94
+ * Engine types for multi-engine scraping architecture
95
+ *
96
+ * Engine stack (in order of preference):
97
+ * 1. http - Native fetch, fastest, no browser
98
+ * 2. tlsclient - TLS fingerprinting via got-scraping
99
+ * 3. hero - Full browser with JavaScript execution
100
+ */
101
+
102
+ /**
103
+ * Available engine names
104
+ */
105
+ type EngineName = "http" | "tlsclient" | "hero";
106
+
93
107
  /**
94
108
  * Proxy configuration for Hero
95
109
  */
@@ -143,6 +157,8 @@ interface ScrapeOptions {
143
157
  formats?: Array<"markdown" | "html">;
144
158
  /** Custom user agent string */
145
159
  userAgent?: string;
160
+ /** Custom headers for requests */
161
+ headers?: Record<string, string>;
146
162
  /** Request timeout in milliseconds (default: 30000) */
147
163
  timeoutMs?: number;
148
164
  /** URL patterns to include (regex strings) */
@@ -187,6 +203,12 @@ interface ScrapeOptions {
187
203
  browserPool?: BrowserPoolConfig;
188
204
  /** Browser pool instance (internal, provided by ReaderClient) */
189
205
  pool?: IBrowserPool;
206
+ /** Engines to use in order (default: ['http', 'tlsclient', 'hero']) */
207
+ engines?: EngineName[];
208
+ /** Skip specific engines (e.g., ['http'] to skip native fetch) */
209
+ skipEngines?: EngineName[];
210
+ /** Force a specific engine, skipping the cascade */
211
+ forceEngine?: EngineName;
190
212
  }
191
213
  /**
192
214
  * Website metadata extracted from the base page
@@ -306,13 +328,17 @@ interface ScrapeResult {
306
328
  /**
307
329
  * Default scrape options
308
330
  */
309
- declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
331
+ declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "headers" | "browserPool" | "pool" | "engines" | "skipEngines" | "forceEngine"> & {
310
332
  proxy?: ProxyConfig;
311
333
  waitForSelector?: string;
312
334
  connectionToCore?: any;
313
335
  userAgent?: string;
336
+ headers?: Record<string, string>;
314
337
  browserPool?: BrowserPoolConfig;
315
338
  pool?: IBrowserPool;
339
+ engines?: EngineName[];
340
+ skipEngines?: EngineName[];
341
+ forceEngine?: EngineName;
316
342
  };
317
343
  /**
318
344
  * Format type guard
@@ -547,7 +573,6 @@ declare class ReaderClient {
547
573
  */
548
574
  declare class Scraper {
549
575
  private options;
550
- private pool;
551
576
  private logger;
552
577
  private robotsCache;
553
578
  constructor(options: ScrapeOptions);
@@ -570,13 +595,7 @@ declare class Scraper {
570
595
  */
571
596
  private scrapeSingleUrlWithRetry;
572
597
  /**
573
- * Wait for the final page to load after any Cloudflare redirects
574
- * Cloudflare often does silent redirects even when bypassed, we need to ensure
575
- * we're on the actual content page before scraping.
576
- */
577
- private waitForFinalPage;
578
- /**
579
- * Scrape a single URL
598
+ * Scrape a single URL using the engine orchestrator
580
599
  */
581
600
  private scrapeSingleUrl;
582
601
  /**
@@ -832,6 +851,8 @@ declare function isDaemonRunning(port?: number): Promise<boolean>;
832
851
  *
833
852
  * Simple conversion without any headers, metadata, or formatting wrappers.
834
853
  * Returns clean markdown content ready for LLM consumption.
854
+ *
855
+ * Uses supermarkdown (Rust-based) for high-performance conversion.
835
856
  */
836
857
  declare function htmlToMarkdown(html: string): string;
837
858
  /**