@vakra-dev/reader 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/dist/cli/index.js +937 -316
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +28 -9
- package/dist/index.js +968 -351
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
|
@@ -90,6 +90,20 @@ interface IBrowserPool {
|
|
|
90
90
|
healthCheck?(): Promise<HealthStatus>;
|
|
91
91
|
}
|
|
92
92
|
|
|
93
|
+
/**
|
|
94
|
+
* Engine types for multi-engine scraping architecture
|
|
95
|
+
*
|
|
96
|
+
* Engine stack (in order of preference):
|
|
97
|
+
* 1. http - Native fetch, fastest, no browser
|
|
98
|
+
* 2. tlsclient - TLS fingerprinting via got-scraping
|
|
99
|
+
* 3. hero - Full browser with JavaScript execution
|
|
100
|
+
*/
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Available engine names
|
|
104
|
+
*/
|
|
105
|
+
type EngineName = "http" | "tlsclient" | "hero";
|
|
106
|
+
|
|
93
107
|
/**
|
|
94
108
|
* Proxy configuration for Hero
|
|
95
109
|
*/
|
|
@@ -143,6 +157,8 @@ interface ScrapeOptions {
|
|
|
143
157
|
formats?: Array<"markdown" | "html">;
|
|
144
158
|
/** Custom user agent string */
|
|
145
159
|
userAgent?: string;
|
|
160
|
+
/** Custom headers for requests */
|
|
161
|
+
headers?: Record<string, string>;
|
|
146
162
|
/** Request timeout in milliseconds (default: 30000) */
|
|
147
163
|
timeoutMs?: number;
|
|
148
164
|
/** URL patterns to include (regex strings) */
|
|
@@ -187,6 +203,12 @@ interface ScrapeOptions {
|
|
|
187
203
|
browserPool?: BrowserPoolConfig;
|
|
188
204
|
/** Browser pool instance (internal, provided by ReaderClient) */
|
|
189
205
|
pool?: IBrowserPool;
|
|
206
|
+
/** Engines to use in order (default: ['http', 'tlsclient', 'hero']) */
|
|
207
|
+
engines?: EngineName[];
|
|
208
|
+
/** Skip specific engines (e.g., ['http'] to skip native fetch) */
|
|
209
|
+
skipEngines?: EngineName[];
|
|
210
|
+
/** Force a specific engine, skipping the cascade */
|
|
211
|
+
forceEngine?: EngineName;
|
|
190
212
|
}
|
|
191
213
|
/**
|
|
192
214
|
* Website metadata extracted from the base page
|
|
@@ -306,13 +328,17 @@ interface ScrapeResult {
|
|
|
306
328
|
/**
|
|
307
329
|
* Default scrape options
|
|
308
330
|
*/
|
|
309
|
-
declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
|
|
331
|
+
declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "headers" | "browserPool" | "pool" | "engines" | "skipEngines" | "forceEngine"> & {
|
|
310
332
|
proxy?: ProxyConfig;
|
|
311
333
|
waitForSelector?: string;
|
|
312
334
|
connectionToCore?: any;
|
|
313
335
|
userAgent?: string;
|
|
336
|
+
headers?: Record<string, string>;
|
|
314
337
|
browserPool?: BrowserPoolConfig;
|
|
315
338
|
pool?: IBrowserPool;
|
|
339
|
+
engines?: EngineName[];
|
|
340
|
+
skipEngines?: EngineName[];
|
|
341
|
+
forceEngine?: EngineName;
|
|
316
342
|
};
|
|
317
343
|
/**
|
|
318
344
|
* Format type guard
|
|
@@ -547,7 +573,6 @@ declare class ReaderClient {
|
|
|
547
573
|
*/
|
|
548
574
|
declare class Scraper {
|
|
549
575
|
private options;
|
|
550
|
-
private pool;
|
|
551
576
|
private logger;
|
|
552
577
|
private robotsCache;
|
|
553
578
|
constructor(options: ScrapeOptions);
|
|
@@ -570,13 +595,7 @@ declare class Scraper {
|
|
|
570
595
|
*/
|
|
571
596
|
private scrapeSingleUrlWithRetry;
|
|
572
597
|
/**
|
|
573
|
-
*
|
|
574
|
-
* Cloudflare often does silent redirects even when bypassed, we need to ensure
|
|
575
|
-
* we're on the actual content page before scraping.
|
|
576
|
-
*/
|
|
577
|
-
private waitForFinalPage;
|
|
578
|
-
/**
|
|
579
|
-
* Scrape a single URL
|
|
598
|
+
* Scrape a single URL using the engine orchestrator
|
|
580
599
|
*/
|
|
581
600
|
private scrapeSingleUrl;
|
|
582
601
|
/**
|