@vakra-dev/reader 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +205 -0
- package/README.md +658 -0
- package/dist/cli/index.js +3046 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/index.d.ts +1216 -0
- package/dist/index.js +3073 -0
- package/dist/index.js.map +1 -0
- package/package.json +87 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,1216 @@
import Hero from '@ulixee/hero';

/**
 * Browser instance in the pool
 */
interface BrowserInstance {
    /** Hero instance */
    hero: Hero;
    /** Unique identifier */
    id: string;
    /** When the instance was created */
    createdAt: number;
    /** When the instance was last used */
    lastUsed: number;
    /** Number of requests handled */
    requestCount: number;
    /** Current status */
    status: "idle" | "busy" | "recycling" | "unhealthy";
}
/**
 * Pool configuration
 */
interface PoolConfig {
    /** Pool size (number of browser instances) */
    size: number;
    /** Retire browser after this many page loads */
    retireAfterPageCount: number;
    /** Retire browser after this age in milliseconds */
    retireAfterAgeMs: number;
    /** How often to check for recycling (ms) */
    recycleCheckInterval: number;
    /** How often to run health checks (ms) */
    healthCheckInterval: number;
    /** Max consecutive failures before marking unhealthy */
    maxConsecutiveFailures: number;
    /** Maximum queue size */
    maxQueueSize: number;
    /** Queue timeout in milliseconds */
    queueTimeout: number;
}
/**
 * Pool statistics
 */
interface PoolStats {
    /** Total instances */
    total: number;
    /** Available instances */
    available: number;
    /** Busy instances */
    busy: number;
    /** Recycling instances */
    recycling: number;
    /** Unhealthy instances */
    unhealthy: number;
    /** Queue length */
    queueLength: number;
    /** Total requests handled */
    totalRequests: number;
    /** Average request duration */
    avgRequestDuration: number;
}
/**
 * Health status
 */
interface HealthStatus {
    /** Overall health */
    healthy: boolean;
    /** Issues found */
    issues: string[];
    /** Stats snapshot */
    stats: PoolStats;
}
/**
 * Browser pool interface
 */
interface IBrowserPool {
    /** Initialize the pool */
    initialize(): Promise<void>;
    /** Shutdown the pool */
    shutdown(): Promise<void>;
    /** Acquire a browser instance */
    acquire(): Promise<Hero>;
    /** Release a browser instance back to the pool */
    release(hero: Hero): void;
    /** Execute callback with auto-managed browser */
    withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T>;
    /** Get pool statistics */
    getStats(): PoolStats;
    /** Run health check */
    healthCheck?(): Promise<HealthStatus>;
}

/**
 * Proxy configuration for Hero
 */
interface ProxyConfig {
    /** Full proxy URL (takes precedence over other fields) */
    url?: string;
    /** Proxy type */
    type?: "datacenter" | "residential";
    /** Proxy username */
    username?: string;
    /** Proxy password */
    password?: string;
    /** Proxy host */
    host?: string;
    /** Proxy port */
    port?: number;
    /** Country code for residential proxies (e.g., 'us', 'uk') */
    country?: string;
}
/**
 * Proxy metadata in scrape results
 */
interface ProxyMetadata {
    /** Proxy host that was used */
    host: string;
    /** Proxy port that was used */
    port: number;
    /** Country code if geo-targeting was used */
    country?: string;
}
/**
 * Browser pool configuration for ReaderClient
 */
interface BrowserPoolConfig {
    /** Number of browser instances (default: 2) */
    size?: number;
    /** Retire browser after this many page loads (default: 100) */
    retireAfterPages?: number;
    /** Retire browser after this many minutes (default: 30) */
    retireAfterMinutes?: number;
    /** Maximum pending requests in queue (default: 100) */
    maxQueueSize?: number;
}
/**
 * Main scraping options interface
 */
interface ScrapeOptions {
    /** Array of URLs to scrape */
    urls: string[];
    /** Output formats (default: ['markdown']) */
    formats?: Array<"markdown" | "html" | "json" | "text">;
    /** Include URL, title, timestamp (default: true) */
    includeMetadata?: boolean;
    /** Custom user agent string */
    userAgent?: string;
    /** Request timeout in milliseconds (default: 30000) */
    timeoutMs?: number;
    /** URL patterns to include (regex strings) */
    includePatterns?: string[];
    /** URL patterns to exclude (regex strings) */
    excludePatterns?: string[];
    /** Remove ads and tracking elements (default: true) */
    removeAds?: boolean;
    /** Remove base64-encoded images to reduce output size (default: true) */
    removeBase64Images?: boolean;
    /** Skip TLS/SSL certificate verification (default: true) */
    skipTLSVerification?: boolean;
    /** Number of URLs to process in parallel (default: 1 - sequential) */
    batchConcurrency?: number;
    /** Total timeout for the entire batch operation in milliseconds (default: 300000) */
    batchTimeoutMs?: number;
    /** Maximum retry attempts for failed URLs (default: 2) */
    maxRetries?: number;
    /** Progress callback for batch operations */
    onProgress?: (progress: {
        completed: number;
        total: number;
        currentUrl: string;
    }) => void;
    /** Proxy configuration for Hero */
    proxy?: ProxyConfig;
    /** CSS selector to wait for before considering page loaded */
    waitForSelector?: string;
    /** Enable verbose logging (default: false) */
    verbose?: boolean;
    /** Show Chrome window (default: false) */
    showChrome?: boolean;
    /** Connection to Hero Core (for shared Core usage) */
    connectionToCore?: any;
    /** Browser pool configuration (passed from ReaderClient) */
    browserPool?: BrowserPoolConfig;
    /** Browser pool instance (internal, provided by ReaderClient) */
    pool?: IBrowserPool;
}
/**
 * Website metadata extracted from the base page
 */
interface WebsiteMetadata {
    /** Basic meta tags */
    title: string | null /** <title> or <meta property="og:title"> */;
    description: string | null /** <meta name="description"> */;
    author: string | null /** <meta name="author"> */;
    language: string | null /** <html lang="..."> */;
    charset: string | null /** <meta charset="..."> */;
    /** Links */
    favicon: string | null /** <link rel="icon"> */;
    image: string | null /** <meta property="og:image"> */;
    canonical: string | null /** <link rel="canonical"> */;
    /** SEO */
    keywords: string[] | null /** <meta name="keywords"> */;
    robots: string | null /** <meta name="robots"> */;
    /** Branding */
    themeColor: string | null /** <meta name="theme-color"> */;
    /** Open Graph */
    openGraph: {
        title: string | null /** <meta property="og:title"> */;
        description: string | null /** <meta property="og:description"> */;
        type: string | null /** <meta property="og:type"> */;
        url: string | null /** <meta property="og:url"> */;
        image: string | null /** <meta property="og:image"> */;
        siteName: string | null /** <meta property="og:site_name"> */;
        locale: string | null /** <meta property="og:locale"> */;
    } | null;
    /** Twitter Card */
    twitter: {
        card: string | null /** <meta name="twitter:card"> */;
        site: string | null /** <meta name="twitter:site"> */;
        creator: string | null /** <meta name="twitter:creator"> */;
        title: string | null /** <meta name="twitter:title"> */;
        description: string | null /** <meta name="twitter:description"> */;
        image: string | null /** <meta name="twitter:image"> */;
    } | null;
}
/**
 * Individual page data
 */
interface Page {
    /** Full URL of the page */
    url: string;
    /** Page title */
    title: string;
    /** Markdown content */
    markdown: string;
    /** HTML content */
    html: string;
    /** When the page was fetched */
    fetchedAt: string;
    /** Crawl depth from base URL */
    depth: number;
    /** Whether a Cloudflare challenge was detected */
    hadChallenge?: boolean;
    /** Type of challenge encountered */
    challengeType?: string;
    /** Time spent waiting for challenge resolution (ms) */
    waitTimeMs?: number;
}
/**
 * Individual website scrape result (for backward compatibility)
 */
interface WebsiteScrapeResult {
    /** Markdown output (present if 'markdown' in formats) */
    markdown?: string;
    /** HTML output (present if 'html' in formats) */
    html?: string;
    /** JSON output (present if 'json' in formats) */
    json?: string;
    /** Plain text output (present if 'text' in formats) */
    text?: string;
    /** Metadata about the scraping operation */
    metadata: {
        /** Base URL that was scraped */
        baseUrl: string;
        /** Total number of pages scraped */
        totalPages: number;
        /** ISO timestamp when scraping started */
        scrapedAt: string;
        /** Duration in milliseconds */
        duration: number;
        /** Website metadata extracted from base page */
        website: WebsiteMetadata;
        /** Proxy used for this request (if proxy pooling was enabled) */
        proxy?: ProxyMetadata;
    };
}
/**
 * Batch metadata for multi-URL operations
 */
interface BatchMetadata {
    /** Total number of URLs provided */
    totalUrls: number;
    /** Number of URLs successfully scraped */
    successfulUrls: number;
    /** Number of URLs that failed */
    failedUrls: number;
    /** ISO timestamp when the batch operation started */
    scrapedAt: string;
    /** Total duration for the entire batch in milliseconds */
    totalDuration: number;
    /** Array of errors for failed URLs */
    errors?: Array<{
        url: string;
        error: string;
    }>;
}
/**
 * Main scrape result interface
 */
interface ScrapeResult {
    /** Array of individual website results */
    data: WebsiteScrapeResult[];
    /** Metadata about the batch operation */
    batchMetadata: BatchMetadata;
}
/**
 * Default scrape options
 */
declare const DEFAULT_OPTIONS: Omit<Required<ScrapeOptions>, "proxy" | "waitForSelector" | "connectionToCore" | "userAgent" | "browserPool" | "pool"> & {
    proxy?: ProxyConfig;
    waitForSelector?: string;
    connectionToCore?: any;
    userAgent?: string;
    browserPool?: BrowserPoolConfig;
    pool?: IBrowserPool;
};
/**
 * Format type guard
 */
declare function isValidFormat(format: string): format is "markdown" | "html" | "json" | "text";
/**
 * Check if a URL should be crawled based on base domain
 */
declare function shouldCrawlUrl$1(url: URL, baseDomain: string): boolean;

/**
 * Crawl options interface
 */
interface CrawlOptions {
    /** Single seed URL to start crawling from */
    url: string;
    /** Maximum depth to crawl (default: 1) */
    depth?: number;
    /** Maximum pages to discover (default: 20) */
    maxPages?: number;
    /** Also scrape full content (default: false) */
    scrape?: boolean;
    /** Delay between requests in milliseconds (default: 1000) */
    delayMs?: number;
    /** Total timeout for the entire crawl operation in milliseconds */
    timeoutMs?: number;
    /** URL patterns to include (regex strings) - if set, only matching URLs are crawled */
    includePatterns?: string[];
    /** URL patterns to exclude (regex strings) - matching URLs are skipped */
    excludePatterns?: string[];
    /** Output formats for scraped content (default: ['markdown', 'html']) */
    formats?: Array<"markdown" | "html" | "json" | "text">;
    /** Number of URLs to scrape in parallel (default: 2) */
    scrapeConcurrency?: number;
    /** Remove ads and tracking elements (default: true) */
    removeAds?: boolean;
    /** Remove base64-encoded images to reduce output size (default: true) */
    removeBase64Images?: boolean;
    /** Proxy configuration for Hero */
    proxy?: ProxyConfig;
    /** Custom user agent string */
    userAgent?: string;
    /** Enable verbose logging (default: false) */
    verbose?: boolean;
    /** Show Chrome window (default: false) */
    showChrome?: boolean;
    /** Connection to Hero Core (for shared Core usage) */
    connectionToCore?: any;
    /** Browser pool instance (internal, provided by ReaderClient) */
    pool?: IBrowserPool;
}
/**
 * Crawl URL result interface
 */
interface CrawlUrl {
    /** URL of the page */
    url: string;
    /** Page title */
    title: string;
    /** Page description or null if not found */
    description: string | null;
}
/**
 * Crawl result interface
 */
interface CrawlResult {
    /** Array of discovered URLs with basic info */
    urls: CrawlUrl[];
    /** Full scrape results (only when scrape: true) */
    scraped?: ScrapeResult;
    /** Crawl operation metadata */
    metadata: CrawlMetadata;
}
/**
 * Crawl metadata interface
 */
interface CrawlMetadata {
    /** Total URLs discovered */
    totalUrls: number;
    /** Maximum depth reached */
    maxDepth: number;
    /** Total crawl duration in milliseconds */
    totalDuration: number;
    /** Seed URL that started the crawl */
    seedUrl: string;
}

/**
 * ReaderClient
 *
 * A client wrapper that manages HeroCore lifecycle and provides
 * a simple interface for scraping and crawling.
 *
 * @example
 * const reader = new ReaderClient();
 *
 * const result = await reader.scrape({
 *   urls: ['https://example.com'],
 *   formats: ['markdown'],
 * });
 *
 * console.log(result.data[0].markdown);
 *
 * // When done (optional - auto-closes on process exit)
 * await reader.close();
 */

/**
 * Proxy rotation strategy
 */
type ProxyRotation = "round-robin" | "random";
/**
 * Configuration options for ReaderClient
 */
interface ReaderClientOptions {
    /** Enable verbose logging (default: false) */
    verbose?: boolean;
    /** Show Chrome browser window (default: false) */
    showChrome?: boolean;
    /** Browser pool configuration */
    browserPool?: BrowserPoolConfig;
    /** List of proxies to rotate through */
    proxies?: ProxyConfig[];
    /** Proxy rotation strategy (default: "round-robin") */
    proxyRotation?: ProxyRotation;
    /** Skip TLS/SSL certificate verification (default: true) */
    skipTLSVerification?: boolean;
}
/**
 * ReaderClient manages the HeroCore lifecycle and provides
 * scrape/crawl methods with automatic initialization.
 */
declare class ReaderClient {
    private heroCore;
    private pool;
    private initialized;
    private initializing;
    private closed;
    private options;
    private proxyIndex;
    private cleanupHandler;
    constructor(options?: ReaderClientOptions);
    /**
     * Get the next proxy from the rotation pool
     */
    private getNextProxy;
    /**
     * Initialize HeroCore. Called automatically on first scrape/crawl.
     * Can be called explicitly if you want to pre-warm the client.
     */
    start(): Promise<void>;
    /**
     * Internal initialization logic
     */
    private initializeCore;
    /**
     * Create a connection to the HeroCore instance
     */
    private createConnection;
    /**
     * Ensure client is initialized before operation
     */
    private ensureInitialized;
    /**
     * Scrape one or more URLs
     *
     * @param options - Scrape options (urls, formats, etc.)
     * @returns Scrape result with data and metadata
     *
     * @example
     * const result = await reader.scrape({
     *   urls: ['https://example.com'],
     *   formats: ['markdown', 'html'],
     * });
     */
    scrape(options: Omit<ScrapeOptions, "connectionToCore" | "pool">): Promise<ScrapeResult>;
    /**
     * Crawl a website to discover URLs
     *
     * @param options - Crawl options (url, depth, maxPages, etc.)
     * @returns Crawl result with discovered URLs and optional scraped content
     *
     * @example
     * const result = await reader.crawl({
     *   url: 'https://example.com',
     *   depth: 2,
     *   maxPages: 50,
     *   scrape: true,
     * });
     */
    crawl(options: Omit<CrawlOptions, "connectionToCore" | "pool">): Promise<CrawlResult>;
    /**
     * Check if the client is initialized and ready
     */
    isReady(): boolean;
    /**
     * Close the client and release resources
     *
     * Note: This is optional - the client will auto-close on process exit.
     */
    close(): Promise<void>;
    /**
     * Register cleanup handlers for process exit
     */
    private registerCleanup;
    /**
     * Remove process cleanup handlers
     */
    private removeCleanupHandlers;
}

/**
 * Scraper class with built-in concurrency support
 *
 * Features:
 * - Hero-based browser automation
 * - Automatic Cloudflare challenge detection and bypass
 * - Built-in concurrency via browser pool
 * - Progress tracking
 * - Error handling per URL
 *
 * @example
 * const scraper = new Scraper({
 *   urls: ['https://example.com', 'https://example.org'],
 *   formats: ['markdown', 'html'],
 *   batchConcurrency: 2,
 *   proxy: { type: 'residential', ... }
 * });
 *
 * const result = await scraper.scrape();
 * console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);
 */
declare class Scraper {
    private options;
    private pool;
    private logger;
    private robotsCache;
    constructor(options: ScrapeOptions);
    /**
     * Get robots.txt rules for a URL, cached per domain
     */
    private getRobotsRules;
    /**
     * Scrape all URLs
     *
     * @returns Scrape result with pages and metadata
     */
    scrape(): Promise<ScrapeResult>;
    /**
     * Scrape URLs with concurrency control
     */
    private scrapeWithConcurrency;
    /**
     * Scrape a single URL with retry logic
     */
    private scrapeSingleUrlWithRetry;
    /**
     * Wait for the final page to load after any Cloudflare redirects
     * Cloudflare often does silent redirects even when bypassed, we need to ensure
     * we're on the actual content page before scraping.
     */
    private waitForFinalPage;
    /**
     * Scrape a single URL
     */
    private scrapeSingleUrl;
    /**
     * Build final scrape result
     */
    private buildScrapeResult;
}
/**
 * Convenience function to scrape URLs
 *
 * @param options - Scrape options
 * @returns Scrape result
 *
 * @example
 * const result = await scrape({
 *   urls: ['https://example.com'],
 *   formats: ['markdown']
 * });
 */
declare function scrape(options: ScrapeOptions): Promise<ScrapeResult>;

/**
 * Crawler class for discovering and optionally scraping pages
 *
 * Features:
 * - BFS/DFS crawling with depth control
 * - Automatic Cloudflare challenge handling
 * - Link extraction and filtering
 * - Optional full content scraping
 * - URL deduplication
 *
 * @example
 * const crawler = new Crawler({
 *   url: 'https://example.com',
 *   depth: 2,
 *   maxPages: 20,
 *   scrape: true
 * });
 *
 * const result = await crawler.crawl();
 * console.log(`Discovered ${result.urls.length} URLs`);
 */
declare class Crawler {
    private options;
    private visited;
    private queue;
    private urls;
    private pool;
    private logger;
    private robotsRules;
    constructor(options: CrawlOptions);
    /**
     * Start crawling
     */
    crawl(): Promise<CrawlResult>;
    /**
     * Fetch a single page and extract basic info
     */
    private fetchPage;
    /**
     * Extract links from HTML content using DOM parsing
     * Handles all href formats (single quotes, double quotes, unquoted)
     */
    private extractLinks;
    /**
     * Scrape all discovered URLs
     */
    private scrapeDiscoveredUrls;
}
/**
 * Convenience function to crawl a website
 *
 * @param options - Crawl options
 * @returns Crawl result
 *
 * @example
 * const result = await crawl({
 *   url: 'https://example.com',
 *   depth: 2,
 *   maxPages: 20,
 *   scrape: true
 * });
 */
declare function crawl(options: CrawlOptions): Promise<CrawlResult>;

/**
 * Daemon Server
 *
 * An HTTP server that wraps ReaderClient, allowing multiple CLI
 * commands to share a single browser pool for efficient scraping.
 *
 * @example
 * // Start daemon
 * const daemon = new DaemonServer({ port: 3847, poolSize: 5 });
 * await daemon.start();
 *
 * // Stop daemon
 * await daemon.stop();
 */
declare const DEFAULT_DAEMON_PORT = 3847;
/**
 * Daemon server configuration
 */
interface DaemonServerOptions {
    /** Port to listen on (default: 3847) */
    port?: number;
    /** Browser pool size (default: 5) */
    poolSize?: number;
    /** Enable verbose logging (default: false) */
    verbose?: boolean;
    /** Show Chrome browser windows (default: false) */
    showChrome?: boolean;
}
/**
 * Status response data
 */
interface DaemonStatus {
    running: true;
    port: number;
    poolSize: number;
    uptime: number;
    pid: number;
}
/**
 * Daemon Server
 */
declare class DaemonServer {
    private server;
    private client;
    private options;
    private startTime;
    constructor(options?: DaemonServerOptions);
    /**
     * Start the daemon server
     */
    start(): Promise<void>;
    /**
     * Stop the daemon server
     */
    stop(): Promise<void>;
    /**
     * Get the port the daemon is running on
     */
    getPort(): number;
    /**
     * Handle incoming HTTP requests
     */
    private handleRequest;
    /**
     * Handle scrape request
     */
    private handleScrape;
    /**
     * Handle crawl request
     */
    private handleCrawl;
    /**
     * Handle status request
     */
    private handleStatus;
    /**
     * Handle shutdown request
     */
    private handleShutdown;
    /**
     * Send JSON response
     */
    private sendResponse;
    /**
     * Write PID file
     */
    private writePidFile;
    /**
     * Remove PID file
     */
    private removePidFile;
}
/**
 * Get path to PID file
 */
declare function getPidFilePath(): Promise<string>;
/**
 * Check if daemon is running by reading PID file
 */
declare function getDaemonInfo(): Promise<{
    pid: number;
    port: number;
    startedAt: string;
} | null>;

/**
 * Daemon Client
 *
 * A client that connects to the daemon server via HTTP.
 * Used by CLI commands when a daemon is running.
 *
 * @example
 * const client = new DaemonClient({ port: 3847 });
 *
 * const result = await client.scrape({
 *   urls: ['https://example.com'],
 *   formats: ['markdown'],
 * });
 */

/**
 * Daemon client configuration
 */
interface DaemonClientOptions {
    /** Port the daemon is running on (default: 3847) */
    port?: number;
    /** Request timeout in milliseconds (default: 600000 = 10 minutes) */
    timeoutMs?: number;
}
/**
 * Daemon Client
 */
declare class DaemonClient {
    private options;
    constructor(options?: DaemonClientOptions);
    /**
     * Scrape URLs via daemon
     */
    scrape(options: Omit<ScrapeOptions, "connectionToCore">): Promise<ScrapeResult>;
    /**
     * Crawl URL via daemon
     */
    crawl(options: Omit<CrawlOptions, "connectionToCore">): Promise<CrawlResult>;
    /**
     * Get daemon status
     */
    status(): Promise<DaemonStatus>;
    /**
     * Request daemon shutdown
     */
    shutdown(): Promise<void>;
    /**
     * Check if daemon is reachable
     */
    isRunning(): Promise<boolean>;
    /**
     * Make HTTP request to daemon
     */
    private request;
}
/**
 * Check if daemon is running on the specified port
 */
declare function isDaemonRunning(port?: number): Promise<boolean>;

/**
 * Convert pages to consolidated Markdown format
 */
declare function formatToMarkdown(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;

/**
 * Convert pages to HTML format with metadata
 */
declare function formatToHTML(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;

/**
 * Convert pages to JSON format with metadata
 */
declare function formatToJson(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;
/**
 * Convert pages to JSON format without HTML (lighter version)
 */
declare function formatToJsonLite(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata): string;

/**
 * Convert pages to plain text format
 *
 * Strips all HTML tags and formatting, preserving only readable text content.
 * Useful for LLM consumption where markdown formatting is not needed.
 */
declare function formatToText(pages: Page[], baseUrl: string, scrapedAt: string, duration: number, website: WebsiteMetadata, includeMetadata?: boolean): string;

/**
 * Extract comprehensive website metadata from HTML content
 * Uses proper DOM parsing for reliable attribute extraction
 */
declare function extractMetadata(html: string, baseUrl: string): WebsiteMetadata;

/**
 * HTML content cleaning utilities using DOM parsing
 */
/**
 * Content cleaning options
 */
interface CleaningOptions {
    /** Remove ads and tracking elements (default: true) */
    removeAds?: boolean;
    /** Remove base64-encoded images (default: true) */
    removeBase64Images?: boolean;
}
/**
 * Clean HTML content (alias for cleanHtml with options)
 */
declare function cleanContent(html: string, baseUrl: string, options?: CleaningOptions): string;

/**
 * URL validation and normalization utilities
 */
/**
 * Resolve a relative URL against a base URL
 */
declare function resolveUrl(relative: string, base: string): string;
/**
 * Validate if a string is a valid URL
 */
declare function isValidUrl(string: string): boolean;
/**
 * Check if a URL belongs to the same domain as the base URL
 * Supports subdomains: blog.example.com matches example.com
 */
declare function isSameDomain(url: string, baseUrl: string): boolean;
/**
 * Generate a URL key for deduplication
 */
declare function getUrlKey(url: string): string;
/**
 * Validate an array of URLs and return validation results
 */
declare function validateUrls(urls: string[]): {
    isValid: boolean;
    validUrls: string[];
    errors: Array<{
        url: string;
        error: string;
    }>;
};
/**
 * Check if a URL should be crawled based on various criteria
 */
declare function shouldCrawlUrl(url: string, baseUrl: string, maxDepth: number, currentDepth: number, visited: Set<string>): boolean;

/**
 * Simple rate limit function
 */
declare function rateLimit(ms: number): Promise<void>;

/**
 * Browser Pool
 *
 * Manages a pool of Hero browser instances with:
 * - Auto-recycling based on age/request count
 * - Request queuing when pool is full
 * - Health monitoring
 *
 * @example
 * const pool = new BrowserPool({ size: 5 });
 * await pool.initialize();
 *
 * // Use withBrowser for automatic acquire/release
 * await pool.withBrowser(async (hero) => {
 *   await hero.goto('https://example.com');
 *   const title = await hero.document.title;
 *   return title;
 * });
 *
 * await pool.shutdown();
 */
declare class BrowserPool implements IBrowserPool {
    private instances;
    private available;
    private inUse;
    private queue;
    private config;
    private proxy?;
    private recycleTimer?;
    private healthTimer?;
    private totalRequests;
    private totalRequestDuration;
    private showChrome;
    private connectionToCore?;
    private userAgent?;
    private verbose;
    private logger;
    constructor(config?: Partial<PoolConfig>, proxy?: ProxyConfig, showChrome?: boolean, connectionToCore?: any, userAgent?: string, verbose?: boolean);
    /**
     * Initialize the pool by pre-launching browsers
     */
    initialize(): Promise<void>;
    /**
     * Shutdown the pool and close all browsers
     */
    shutdown(): Promise<void>;
    /**
     * Acquire a browser from the pool
     */
    acquire(): Promise<Hero>;
    /**
     * Release a browser back to the pool
     */
    release(hero: Hero): void;
    /**
     * Execute callback with auto-managed browser
     */
    withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T>;
    /**
     * Get pool statistics
     */
    getStats(): PoolStats;
    /**
     * Run health check
     */
    healthCheck(): Promise<HealthStatus>;
    /**
     * Create a new browser instance
     */
    private createInstance;
    /**
     * Check if instance should be recycled
     */
    private shouldRecycle;
    /**
     * Recycle an instance (close old, create new)
     */
    private recycleInstance;
    /**
     * Queue a request when no browsers available
     */
    private queueRequest;
    /**
     * Process queued requests
     */
    private processQueue;
    /**
     * Start background recycling task
     */
    private startRecycling;
    /**
     * Start background health checks
     */
    private startHealthChecks;
}

/**
 * Hero configuration options
 */
interface HeroConfigOptions {
    /** Proxy configuration */
    proxy?: ProxyConfig;
    /** Show Chrome window (default: false) */
    showChrome?: boolean;
    /** Custom user agent */
    userAgent?: string;
    /** Connection to Core (for in-process Core) */
    connectionToCore?: any;
}
/**
 * Create Hero configuration with optimal anti-bot bypass settings
 *
 * Extracted from proven hero-test implementation.
 * Includes:
 * - TLS fingerprint emulation (disableMitm: false)
 * - DNS over TLS (mimics Chrome)
 * - WebRTC IP masking
 * - Proper locale and timezone
 *
 * @param options - Configuration options
 * @returns Hero configuration object
 */
declare function createHeroConfig(options?: HeroConfigOptions): any;

/**
 * Cloudflare challenge detection result
 */
interface ChallengeDetection {
    /** Whether a challenge was detected */
    isChallenge: boolean;
    /** Type of challenge */
    type: "js_challenge" | "turnstile" | "captcha" | "blocked" | "none";
    /** Confidence level (0-100) */
    confidence: number;
    /** Detection signals found */
    signals: string[];
}
/**
 * Challenge resolution result
 */
interface ChallengeResolutionResult {
    /** Whether the challenge was resolved */
    resolved: boolean;
    /** Method used to detect resolution */
    method: "url_redirect" | "signals_cleared" | "timeout";
    /** Time waited in milliseconds */
    waitedMs: number;
}
/**
 * Challenge waiting options
 */
interface ChallengeWaitOptions {
    /** Maximum time to wait for resolution (default: 45000ms) */
    maxWaitMs?: number;
    /** How often to poll for resolution (default: 500ms) */
    pollIntervalMs?: number;
    /** Enable verbose logging */
    verbose?: boolean;
    /** Initial URL before challenge */
    initialUrl: string;
}

/**
 * Detect if current page is a Cloudflare challenge
 *
 * Uses multi-signal approach with ONLY challenge-specific indicators.
 * No content length heuristics to avoid false positives.
 *
 * @param hero - Hero instance with loaded page
 * @returns Detection result with confidence score and signals
 *
 * @example
 * const detection = await detectChallenge(hero);
 * if (detection.isChallenge) {
 *   console.log(`Challenge detected: ${detection.type}`);
 *   console.log(`Signals: ${detection.signals.join(', ')}`);
 * }
 */
declare function detectChallenge(hero: Hero): Promise<ChallengeDetection>;
/**
 * Quick check - just returns boolean
 *
 * @param hero - Hero instance
 * @returns True if challenge page detected
 */
declare function isChallengePage(hero: Hero): Promise<boolean>;

/**
 * Wait for Cloudflare challenge to resolve
 *
 * Uses multiple detection strategies:
 * 1. URL redirect detection (page redirects after challenge)
 * 2. Signal polling (challenge-specific elements/text disappear)
 *
 * @param hero - Hero instance with challenge page loaded
 * @param options - Waiting options
 * @returns Resolution result with method and time waited
 *
 * @example
 * const result = await waitForChallengeResolution(hero, {
 *   maxWaitMs: 45000,
 *   pollIntervalMs: 500,
 *   verbose: true,
 *   initialUrl: 'https://example.com'
 * });
 *
 * if (result.resolved) {
 *   console.log(`Challenge resolved via ${result.method} in ${result.waitedMs}ms`);
 * }
 */
declare function waitForChallengeResolution(hero: Hero, options: ChallengeWaitOptions): Promise<ChallengeResolutionResult>;
/**
 * Wait for a specific CSS selector to appear
 *
 * Useful when you know exactly what element should appear after challenge.
 *
 * @param hero - Hero instance
 * @param selector - CSS selector to wait for
 * @param maxWaitMs - Maximum time to wait
 * @param verbose - Enable logging
 * @returns Whether selector was found and time waited
 *
 * @example
 * const result = await waitForSelector(hero, '.content', 30000, true);
 * if (result.found) {
 *   console.log(`Content appeared after ${result.waitedMs}ms`);
 * }
 */
declare function waitForSelector(hero: Hero, selector: string, maxWaitMs: number, verbose?: boolean): Promise<{
    found: boolean;
    waitedMs: number;
}>;
/**
 * Handle Cloudflare challenge with automatic detection and waiting
 *
 * High-level function that combines detection and resolution.
 *
 * @param hero - Hero instance
 * @param options - Wait options (without initialUrl)
 * @returns Resolution result
 *
 * @example
 * await hero.goto('https://example.com');
 * const result = await handleChallenge(hero, { verbose: true });
 * if (result.resolved) {
 *   // Challenge passed, continue scraping
 * }
 */
declare function handleChallenge(hero: Hero, options?: Omit<ChallengeWaitOptions, "initialUrl">): Promise<ChallengeResolutionResult>;

/**
 * Create proxy URL from configuration
 *
 * Supports both datacenter and residential proxies.
 * For residential proxies (e.g., IPRoyal), generates a sticky session ID.
 *
 * @param config - Proxy configuration
 * @returns Formatted proxy URL
 *
 * @example
 * // Datacenter proxy
 * createProxyUrl({
 *   type: 'datacenter',
 *   username: 'user',
 *   password: 'pass',
 *   host: 'proxy.example.com',
 *   port: 8080
 * })
 * // Returns: "http://user:pass@proxy.example.com:8080"
 *
 * @example
 * // Residential proxy with sticky session
 * createProxyUrl({
 *   type: 'residential',
 *   username: 'customer-abc',
 *   password: 'secret',
 *   host: 'geo.iproyal.com',
 *   port: 12321,
 *   country: 'us'
 * })
 * // Returns: "http://customer-abc_session-hero_123_abc456_country-us:secret@geo.iproyal.com:12321"
 */
declare function createProxyUrl(config: ProxyConfig): string;
/**
 * Parse proxy URL into ProxyConfig
 *
 * @param url - Proxy URL string
 * @returns Parsed proxy configuration
 *
 * @example
 * parseProxyUrl("http://user:pass@proxy.example.com:8080")
 * // Returns: { username: 'user', password: 'pass', host: 'proxy.example.com', port: 8080 }
 */
declare function parseProxyUrl(url: string): ProxyConfig;

export { type BatchMetadata, type BrowserInstance, BrowserPool, type BrowserPoolConfig, type ChallengeDetection, type ChallengeResolutionResult, type ChallengeWaitOptions, type CrawlMetadata, type CrawlOptions, type CrawlResult, type CrawlUrl, Crawler, DEFAULT_DAEMON_PORT, DEFAULT_OPTIONS, DaemonClient, type DaemonClientOptions, DaemonServer, type DaemonServerOptions, type DaemonStatus, type HealthStatus, BrowserPool as HeroBrowserPool, type IBrowserPool, type Page, type PoolConfig, type PoolStats, type ProxyConfig, type ProxyMetadata, type ProxyRotation, ReaderClient, type ReaderClientOptions, type ScrapeOptions, type ScrapeResult, Scraper, type WebsiteMetadata, type WebsiteScrapeResult, cleanContent, crawl, createHeroConfig, createProxyUrl, detectChallenge, extractMetadata, formatToHTML, formatToJson, formatToJsonLite, formatToMarkdown, formatToText, getDaemonInfo, getPidFilePath, getUrlKey, handleChallenge, isChallengePage, isDaemonRunning, isSameDomain, isValidFormat, isValidUrl, parseProxyUrl, rateLimit, resolveUrl, scrape, shouldCrawlUrl, shouldCrawlUrl$1 as shouldCrawlUrlFn, validateUrls, waitForChallengeResolution, waitForSelector };
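
For orientation, here is a minimal usage sketch written against the declarations above. It assumes the package root export of @vakra-dev/reader resolves to this dist/index.js; the URLs and option values are illustrative only, not taken from the package.

import { ReaderClient } from '@vakra-dev/reader';

async function main(): Promise<void> {
    // Create a client; browser pool sizing and proxies are optional per ReaderClientOptions.
    const reader = new ReaderClient({
        verbose: true,
        browserPool: { size: 2, retireAfterPages: 100 },
    });

    // Scrape one or more URLs to markdown (ScrapeOptions.urls accepts a batch).
    const result = await reader.scrape({
        urls: ['https://example.com'],
        formats: ['markdown'],
    });
    console.log(`Scraped ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls} URLs`);
    console.log(result.data[0]?.markdown);

    // Discover links from a seed URL without scraping full content.
    const crawled = await reader.crawl({
        url: 'https://example.com',
        depth: 1,
        maxPages: 10,
    });
    for (const entry of crawled.urls) {
        console.log(`${entry.url} - ${entry.title}`);
    }

    // Optional: release the Hero Core and browser pool explicitly.
    await reader.close();
}

main().catch((error) => {
    console.error(error);
    process.exit(1);
});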