rezo 1.0.43 → 1.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/dist/adapters/entries/curl.d.ts +115 -0
  2. package/dist/adapters/entries/fetch.d.ts +115 -0
  3. package/dist/adapters/entries/http.d.ts +115 -0
  4. package/dist/adapters/entries/http2.d.ts +115 -0
  5. package/dist/adapters/entries/react-native.d.ts +115 -0
  6. package/dist/adapters/entries/xhr.d.ts +115 -0
  7. package/dist/adapters/fetch.cjs +18 -0
  8. package/dist/adapters/fetch.js +18 -0
  9. package/dist/adapters/http.cjs +18 -0
  10. package/dist/adapters/http.js +18 -0
  11. package/dist/adapters/http2.cjs +18 -0
  12. package/dist/adapters/http2.js +18 -0
  13. package/dist/adapters/index.cjs +6 -6
  14. package/dist/adapters/xhr.cjs +19 -0
  15. package/dist/adapters/xhr.js +19 -0
  16. package/dist/cache/index.cjs +9 -15
  17. package/dist/cache/index.js +0 -3
  18. package/dist/core/hooks.cjs +4 -2
  19. package/dist/core/hooks.js +4 -2
  20. package/dist/crawler/addon/decodo/index.cjs +1 -0
  21. package/dist/crawler/addon/decodo/index.js +1 -0
  22. package/dist/crawler/crawler-options.cjs +1 -0
  23. package/dist/crawler/crawler-options.js +1 -0
  24. package/dist/{plugin → crawler}/crawler.cjs +392 -32
  25. package/dist/{plugin → crawler}/crawler.js +392 -32
  26. package/dist/crawler/index.cjs +40 -0
  27. package/dist/{plugin → crawler}/index.js +4 -2
  28. package/dist/crawler/plugin/file-cacher.cjs +19 -0
  29. package/dist/crawler/plugin/file-cacher.js +19 -0
  30. package/dist/crawler/plugin/index.cjs +1 -0
  31. package/dist/crawler/plugin/index.js +1 -0
  32. package/dist/crawler/plugin/navigation-history.cjs +43 -0
  33. package/dist/crawler/plugin/navigation-history.js +43 -0
  34. package/dist/crawler/plugin/robots-txt.cjs +2 -0
  35. package/dist/crawler/plugin/robots-txt.js +2 -0
  36. package/dist/crawler/plugin/url-store.cjs +18 -0
  37. package/dist/crawler/plugin/url-store.js +18 -0
  38. package/dist/crawler.d.ts +430 -172
  39. package/dist/entries/crawler.cjs +5 -5
  40. package/dist/entries/crawler.js +2 -2
  41. package/dist/index.cjs +27 -27
  42. package/dist/index.d.ts +115 -0
  43. package/dist/internal/agents/index.cjs +10 -10
  44. package/dist/platform/browser.d.ts +115 -0
  45. package/dist/platform/bun.d.ts +115 -0
  46. package/dist/platform/deno.d.ts +115 -0
  47. package/dist/platform/node.d.ts +115 -0
  48. package/dist/platform/react-native.d.ts +115 -0
  49. package/dist/platform/worker.d.ts +115 -0
  50. package/dist/proxy/index.cjs +5 -5
  51. package/dist/proxy/index.js +1 -1
  52. package/dist/queue/index.cjs +8 -8
  53. package/dist/responses/universal/index.cjs +11 -11
  54. package/dist/utils/rate-limit-wait.cjs +217 -0
  55. package/dist/utils/rate-limit-wait.js +208 -0
  56. package/package.json +2 -6
  57. package/dist/cache/file-cacher.cjs +0 -270
  58. package/dist/cache/file-cacher.js +0 -267
  59. package/dist/cache/navigation-history.cjs +0 -298
  60. package/dist/cache/navigation-history.js +0 -296
  61. package/dist/cache/url-store.cjs +0 -294
  62. package/dist/cache/url-store.js +0 -291
  63. package/dist/plugin/addon/decodo/index.cjs +0 -1
  64. package/dist/plugin/addon/decodo/index.js +0 -1
  65. package/dist/plugin/crawler-options.cjs +0 -1
  66. package/dist/plugin/crawler-options.js +0 -1
  67. package/dist/plugin/index.cjs +0 -36
  68. package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
  69. package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
  70. package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
  71. package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
  72. package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
  73. package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
  74. package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
  75. package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
  76. package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
  77. package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
  78. package/dist/{plugin → crawler}/scraper.cjs +0 -0
  79. package/dist/{plugin → crawler}/scraper.js +0 -0
package/dist/crawler.d.ts CHANGED
@@ -6,43 +6,24 @@ import { SecureContext, TLSSocket } from 'node:tls';
 import { Cookie as TouchCookie, CookieJar as TouchCookieJar, CreateCookieOptions } from 'tough-cookie';
 
 /**
- * FileCacher - Cross-runtime SQLite-based file caching system
+ * CrawlerCache - High-performance SQLite-based response caching for web crawlers
  *
- * Provides persistent key-value storage with namespace support, TTL expiration,
- * and optional zstd compression for efficient data storage.
+ * Optimized specifically for crawler workloads with:
+ * - WAL mode for high-throughput concurrent reads/writes
+ * - Batch operations for efficient bulk storage
+ * - Domain-based namespacing for organized cache management
+ * - Optional zstd compression for storage efficiency
  *
  * @module cache/file-cacher
  * @author Rezo HTTP Client Library
- *
- * @example
- * ```typescript
- * import { FileCacher } from 'rezo';
- *
- * // Create a file cacher instance
- * const cacher = await FileCacher.create({
- *   cacheDir: './cache',
- *   ttl: 3600000, // 1 hour
- *   compression: true,
- *   encryptNamespace: true
- * });
- *
- * // Store and retrieve data
- * await cacher.set('user:123', { name: 'John' }, 3600000, 'users');
- * const user = await cacher.get('user:123', 'users');
- *
- * // Check existence and cleanup
- * const exists = await cacher.has('user:123', 'users');
- * await cacher.delete('user:123', 'users');
- * await cacher.close();
- * ```
  */
 /**
- * Configuration options for FileCacher
+ * Configuration options for CrawlerCache
  */
 export interface FileCacherOptions {
     /**
      * Directory path for storing cache databases
-     * @default './cache'
+     * @default '/tmp/rezo-crawler/cache'
      */
     cacheDir?: string;
     /**
@@ -51,23 +32,18 @@ export interface FileCacherOptions {
      */
     ttl?: number;
     /**
-     * Enable zstd compression for stored values
+     * Enable zstd compression for stored values (Node.js 22.15+)
      * Reduces storage size but adds CPU overhead
      * @default false
      */
     compression?: boolean;
     /**
-     * Enable soft delete (mark as deleted instead of removing)
-     * @default false
-     */
-    softDelete?: boolean;
-    /**
-     * Hash namespace names for privacy/security
+     * Hash namespace names for privacy
      * @default false
      */
     encryptNamespace?: boolean;
     /**
-     * Maximum number of entries per namespace (0 = unlimited)
+     * Maximum entries per namespace (0 = unlimited)
      * @default 0
      */
     maxEntries?: number;
@@ -77,155 +53,63 @@ declare class FileCacher {
     private readonly options;
     private readonly cacheDir;
     private closed;
-    /**
-     * Private constructor - use FileCacher.create() instead
-     */
    private constructor();
    /**
     * Create a new FileCacher instance
-     *
-     * @param options - Configuration options
-     * @returns Promise resolving to initialized FileCacher instance
-     *
-     * @example
-     * ```typescript
-     * const cacher = await FileCacher.create({
-     *   cacheDir: './my-cache',
-     *   ttl: 3600000,
-     *   compression: true
-     * });
-     * ```
     */
    static create(options?: FileCacherOptions): Promise<FileCacher>;
    /**
-     * Get or create database for a namespace
+     * Get or create optimized database for a namespace (domain)
     */
    private getDatabase;
    /**
-     * Store a value in the cache
-     *
-     * @param key - Unique key for the cached item
-     * @param value - Value to cache (will be JSON serialized)
-     * @param ttl - Time-to-live in milliseconds (uses default if not specified)
-     * @param namespace - Namespace for isolation (default: 'default')
-     * @returns Promise resolving when stored
-     *
-     * @example
-     * ```typescript
-     * // Store with default TTL
-     * await cacher.set('key1', { data: 'value' });
-     *
-     * // Store with custom TTL and namespace
-     * await cacher.set('key2', responseData, 3600000, 'api-responses');
-     * ```
+     * Store a response in the cache
     */
    set<T = any>(key: string, value: T, ttl?: number, namespace?: string): Promise<void>;
    /**
-     * Retrieve a value from the cache
-     *
-     * @param key - Key of the cached item
-     * @param namespace - Namespace to search in (default: 'default')
-     * @returns Promise resolving to cached value or null if not found/expired
-     *
-     * @example
-     * ```typescript
-     * const data = await cacher.get<MyType>('key1', 'my-namespace');
-     * if (data) {
-     *   console.log('Cache hit:', data);
-     * }
-     * ```
+     * Store multiple responses in a single transaction (batch operation)
+     */
+    setMany<T = any>(entries: Array<{
+        key: string;
+        value: T;
+        ttl?: number;
+    }>, namespace?: string): Promise<void>;
+    /**
+     * Retrieve a cached response
     */
    get<T = any>(key: string, namespace?: string): Promise<T | null>;
    /**
-     * Check if a key exists in the cache and is not expired
-     *
-     * @param key - Key to check
-     * @param namespace - Namespace to search in (default: 'default')
-     * @returns Promise resolving to true if key exists and is valid
-     *
-     * @example
-     * ```typescript
-     * if (await cacher.has('key1', 'my-namespace')) {
-     *   const data = await cacher.get('key1', 'my-namespace');
-     * }
-     * ```
+     * Check if a key exists and is not expired
     */
    has(key: string, namespace?: string): Promise<boolean>;
+    /**
+     * Check multiple keys at once (batch operation)
+     */
+    hasMany(keys: string[], namespace?: string): Promise<Set<string>>;
    /**
     * Delete a key from the cache
-     *
-     * @param key - Key to delete
-     * @param namespace - Namespace to delete from (default: 'default')
-     * @returns Promise resolving to true if key was deleted
-     *
-     * @example
-     * ```typescript
-     * await cacher.delete('obsolete-key', 'my-namespace');
-     * ```
     */
    delete(key: string, namespace?: string): Promise<boolean>;
    /**
     * Clear all entries in a namespace
-     *
-     * @param namespace - Namespace to clear (default: 'default')
-     * @returns Promise resolving when cleared
-     *
-     * @example
-     * ```typescript
-     * // Clear all cached data for a domain
-     * await cacher.clear('example.com');
-     * ```
     */
    clear(namespace?: string): Promise<void>;
    /**
-     * Remove all expired entries from a namespace
-     *
-     * @param namespace - Namespace to cleanup (default: 'default')
-     * @returns Promise resolving to number of entries removed
-     *
-     * @example
-     * ```typescript
-     * const removed = await cacher.cleanup('my-namespace');
-     * console.log(`Removed ${removed} expired entries`);
-     * ```
+     * Remove all expired entries
     */
    cleanup(namespace?: string): Promise<number>;
    /**
-     * Get statistics for a namespace
-     *
-     * @param namespace - Namespace to get stats for (default: 'default')
-     * @returns Promise resolving to cache statistics
-     *
-     * @example
-     * ```typescript
-     * const stats = await cacher.stats('my-namespace');
-     * console.log(`${stats.count} entries, ${stats.size} bytes`);
-     * ```
+     * Get cache statistics for a namespace
     */
    stats(namespace?: string): Promise<{
        count: number;
        expired: number;
-        deleted: number;
    }>;
    /**
-     * Close all database connections and release resources
-     *
-     * @returns Promise resolving when all connections are closed
-     *
-     * @example
-     * ```typescript
-     * // Always close when done
-     * await cacher.close();
-     * ```
+     * Close all database connections
     */
    close(): Promise<void>;
-    /**
-     * Check if the cacher has been closed
-     */
    get isClosed(): boolean;
-    /**
-     * Get the cache directory path
-     */
    get directory(): string;
 }
 export interface CrawlSession {
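The headline additions in this hunk are the batch operations `setMany` and `hasMany`. Below is a minimal usage sketch built only from the signatures above, assuming `FileCacher` is still importable from `rezo` as in the removed example:

```typescript
import { FileCacher } from 'rezo'; // import path as in the removed example; assumed unchanged

async function warmCache(): Promise<void> {
  const cacher = await FileCacher.create({
    cacheDir: '/tmp/rezo-crawler/cache', // new documented default
    ttl: 3_600_000, // 1 hour
    compression: true,
  });

  // One transaction for the whole batch, namespaced by domain.
  await cacher.setMany(
    [
      { key: 'https://example.com/a', value: { status: 200 } },
      { key: 'https://example.com/b', value: { status: 200 }, ttl: 600_000 },
    ],
    'example.com',
  );

  // hasMany returns the subset of keys that exist and are unexpired.
  const cached = await cacher.hasMany(
    ['https://example.com/a', 'https://example.com/c'],
    'example.com',
  );
  console.log(cached.has('https://example.com/a')); // true
  console.log(cached.has('https://example.com/c')); // false

  await cacher.close();
}
```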
@@ -1644,6 +1528,35 @@ export type OnTimeoutHook = (event: TimeoutEvent, config: RezoConfig) => void;
  * Use for cleanup, logging
  */
 export type OnAbortHook = (event: AbortEvent, config: RezoConfig) => void;
+/**
+ * Rate limit wait event data - fired when waiting due to rate limiting
+ */
+export interface RateLimitWaitEvent {
+    /** HTTP status code that triggered the wait (e.g., 429, 503) */
+    status: number;
+    /** Time to wait in milliseconds */
+    waitTime: number;
+    /** Current wait attempt number (1-indexed) */
+    attempt: number;
+    /** Maximum wait attempts configured */
+    maxAttempts: number;
+    /** Where the wait time was extracted from */
+    source: "header" | "body" | "function" | "default";
+    /** The header or body path used (if applicable) */
+    sourcePath?: string;
+    /** URL being requested */
+    url: string;
+    /** HTTP method of the request */
+    method: string;
+    /** Timestamp when the wait started */
+    timestamp: number;
+}
+/**
+ * Hook called when rate limit wait occurs
+ * Informational only - cannot abort the wait
+ * Use for logging, monitoring, alerting
+ */
+export type OnRateLimitWaitHook = (event: RateLimitWaitEvent, config: RezoConfig) => void | Promise<void>;
 /**
  * Hook called before a proxy is selected
  * Can return a specific proxy to override selection
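Since `OnRateLimitWaitHook` is informational only (it cannot abort the wait), its natural use is structured logging or metrics. A sketch typed against the declarations above; how hooks get registered into `RezoHooks.onRateLimitWait` is not shown in this diff, and the type-only export path is an assumption:

```typescript
import type { RateLimitWaitEvent } from 'rezo'; // export path assumed

// Log every rate-limit pause with where the wait time came from.
const logRateLimitWait = (event: RateLimitWaitEvent): void => {
  const via = event.sourcePath ? `${event.source} (${event.sourcePath})` : event.source;
  console.warn(
    `[rate-limit] ${event.method} ${event.url} returned ${event.status}; ` +
      `waiting ${event.waitTime}ms (attempt ${event.attempt}/${event.maxAttempts}, wait time via ${via})`,
  );
};
```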
@@ -1724,6 +1637,7 @@ export interface RezoHooks {
     onTls: OnTlsHook[];
     onTimeout: OnTimeoutHook[];
     onAbort: OnAbortHook[];
+    onRateLimitWait: OnRateLimitWaitHook[];
 }
 /**
  * Configuration object that encapsulates comprehensive request execution metadata and response processing information.
@@ -2549,6 +2463,91 @@ export interface RezoRequestConfig<D = any> {
         /** Weather to stop or continue retry when certain condition is met*/
         condition?: (error: RezoError) => boolean | Promise<boolean>;
     };
+    /**
+     * Rate limit wait configuration - wait and retry when receiving rate limit responses.
+     *
+     * This feature runs BEFORE the retry system. When a rate-limiting status code is received,
+     * the client will wait for the specified time and automatically retry the request.
+     *
+     * **Basic Usage:**
+     * - `waitOnStatus: true` - Enable waiting on 429 status (default behavior)
+     * - `waitOnStatus: [429, 503]` - Enable waiting on specific status codes
+     *
+     * **Wait Time Sources:**
+     * - `'retry-after'` - Use standard Retry-After header (default)
+     * - `{ header: 'X-RateLimit-Reset' }` - Use custom header
+     * - `{ body: 'retry_after' }` - Extract from JSON response body
+     * - Custom function for complex logic
+     *
+     * @example
+     * ```typescript
+     * // Wait on 429 using Retry-After header
+     * await rezo.get(url, { waitOnStatus: true });
+     *
+     * // Wait on 429 using custom header
+     * await rezo.get(url, {
+     *   waitOnStatus: true,
+     *   waitTimeSource: { header: 'X-RateLimit-Reset' }
+     * });
+     *
+     * // Wait on 429 extracting time from JSON body
+     * await rezo.get(url, {
+     *   waitOnStatus: true,
+     *   waitTimeSource: { body: 'data.retry_after' }
+     * });
+     *
+     * // Custom function for complex APIs
+     * await rezo.get(url, {
+     *   waitOnStatus: [429, 503],
+     *   waitTimeSource: (response) => {
+     *     const reset = response.headers.get('x-ratelimit-reset');
+     *     return reset ? parseInt(reset) - Math.floor(Date.now() / 1000) : null;
+     *   }
+     * });
+     * ```
+     */
+    waitOnStatus?: boolean | number[];
+    /**
+     * Where to extract the wait time from when rate-limited.
+     *
+     * - `'retry-after'` - Standard Retry-After header (default)
+     * - `{ header: string }` - Custom header name (e.g., 'X-RateLimit-Reset')
+     * - `{ body: string }` - JSON path in response body (e.g., 'data.retry_after', 'wait_seconds')
+     * - Function - Custom logic receiving the response, return seconds to wait or null
+     *
+     * @default 'retry-after'
+     */
+    waitTimeSource?: "retry-after" | {
+        header: string;
+    } | {
+        body: string;
+    } | ((response: {
+        status: number;
+        headers: RezoHeaders;
+        data?: any;
+    }) => number | null);
+    /**
+     * Maximum time to wait for rate limit in milliseconds.
+     * If the extracted wait time exceeds this, the request will fail instead of waiting.
+     * Set to 0 for unlimited wait time.
+     *
+     * @default 60000 (60 seconds)
+     */
+    maxWaitTime?: number;
+    /**
+     * Default wait time in milliseconds if the wait time source returns nothing.
+     * Used as fallback when Retry-After header or body path is not present.
+     *
+     * @default 1000 (1 second)
+     */
+    defaultWaitTime?: number;
+    /**
+     * Maximum number of wait attempts before giving up.
+     * After this many waits, the request will proceed to retry logic or fail.
+     *
+     * @default 3
+     */
+    maxWaitAttempts?: number;
     /** Whether to use a secure context for HTTPS requests */
     useSecureContext?: boolean;
     /** Custom secure context for TLS connections */
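The inline examples above cover `waitOnStatus` and `waitTimeSource`; the three tuning knobs combine with them as sketched below (run inside an async context; the default-export import style and endpoint are assumptions):

```typescript
import rezo from 'rezo'; // default-export import style assumed

const response = await rezo.get('https://api.example.com/items', {
  waitOnStatus: [429, 503],                // wait on both throttling statuses
  waitTimeSource: { body: 'retry_after' }, // read seconds-to-wait from the JSON body
  defaultWaitTime: 2_000,  // fallback when the body path is absent
  maxWaitTime: 30_000,     // fail rather than sleep longer than 30s
  maxWaitAttempts: 5,      // after 5 waits, fall through to retry logic or fail
});
```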
@@ -6232,17 +6231,25 @@ declare class Decodo {
     /**
      * Create a new Decodo client instance
      *
-     * @param config - Decodo API configuration
-     * @throws Error if username or password is missing
+     * @param config - Decodo API configuration (supports username/password OR token auth)
+     * @throws Error if authentication credentials are missing
      *
      * @example
      * ```typescript
+     * // Username/password authentication
      * const decodo = new Decodo({
      *   username: 'user',
      *   password: 'password',
      *   headless: 'html',
      *   country: 'US'
      * });
+     *
+     * // Token authentication (alternative)
+     * const decodo = new Decodo({
+     *   token: 'your_api_token',
+     *   headless: 'html',
+     *   country: 'US'
+     * });
      * ```
      */
     constructor(config: DecodoConfig);
@@ -6472,6 +6479,42 @@ export interface ICrawlerOptions {
     } | {
         enable: false;
     } | undefined | false;
+    /** Decodo proxy service configuration for specific domains or global use */
+    decodo?: {
+        enable: true;
+        labs: [
+            {
+                domain: Domain;
+                isGlobal?: boolean;
+                options: DecodoOptions;
+                queueOptions: queueOptions$1;
+            }
+        ];
+    } | {
+        enable: false;
+    } | undefined | false;
+    /** Maximum crawl depth from start URL (0 = unlimited, default: 0) */
+    maxDepth?: number;
+    /** Maximum total URLs to crawl (0 = unlimited, default: 0) */
+    maxUrls?: number;
+    /** Maximum response size in bytes to process (0 = unlimited, default: 0) */
+    maxResponseSize?: number;
+    /** Respect robots.txt rules (default: false) */
+    respectRobotsTxt?: boolean;
+    /** Follow rel="nofollow" links (default: false - ignores nofollow links) */
+    followNofollow?: boolean;
+    /** Enable automatic throttling based on server response times (default: true) */
+    autoThrottle?: boolean;
+    /** Target request delay in ms for AutoThrottle (default: 1000) */
+    autoThrottleTargetDelay?: number;
+    /** Minimum delay between requests in ms (default: 100) */
+    autoThrottleMinDelay?: number;
+    /** Maximum delay between requests in ms (default: 60000) */
+    autoThrottleMaxDelay?: number;
+    /** Maximum time to wait on 429 response in ms (default: 1800000 = 30 min) */
+    maxWaitOn429?: number;
+    /** Always wait on 429 regardless of time, shows warning (default: false) */
+    alwaysWaitOn429?: boolean;
 }
 /**
  * Advanced web crawler configuration class with support for domain-specific settings
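Taken together, the new limit and throttling fields describe a bounded, polite crawl. A configuration sketch, assuming the `Crawler` constructor accepts these `ICrawlerOptions` fields directly and that `Crawler` is exported from `rezo`:

```typescript
import { Crawler } from 'rezo'; // export path assumed

const crawler = new Crawler({
  maxDepth: 3,                      // stay within 3 links of the start URL
  maxUrls: 5_000,                   // hard cap on total pages
  maxResponseSize: 5 * 1024 * 1024, // skip bodies over 5 MB
  respectRobotsTxt: true,
  followNofollow: false,            // default: ignore rel="nofollow" links
  autoThrottle: true,               // adapt delay to server response times
  autoThrottleTargetDelay: 1_000,
  autoThrottleMinDelay: 100,
  autoThrottleMaxDelay: 60_000,
  maxWaitOn429: 1_800_000,          // give up after a 30-minute 429 wait
});
```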
@@ -6548,6 +6591,28 @@ export declare class CrawlerOptions {
     throwFatalError?: boolean;
     /** Enable debug logging */
     debug?: boolean;
+    /** Maximum crawl depth from start URL (0 = unlimited) */
+    maxDepth: number;
+    /** Maximum total URLs to crawl (0 = unlimited) */
+    maxUrls: number;
+    /** Maximum response size in bytes to process (0 = unlimited) */
+    maxResponseSize: number;
+    /** Respect robots.txt rules */
+    respectRobotsTxt: boolean;
+    /** Follow rel="nofollow" links */
+    followNofollow: boolean;
+    /** Enable automatic throttling based on server response times */
+    autoThrottle: boolean;
+    /** Target request delay in ms for AutoThrottle */
+    autoThrottleTargetDelay: number;
+    /** Minimum delay between requests in ms */
+    autoThrottleMinDelay: number;
+    /** Maximum delay between requests in ms */
+    autoThrottleMaxDelay: number;
+    /** Maximum time to wait on 429 response in ms */
+    maxWaitOn429: number;
+    /** Always wait on 429 regardless of time */
+    alwaysWaitOn429: boolean;
     /** Internal storage for Oxylabs configurations with domain mapping */
     oxylabs: {
         domain?: Domain;
@@ -6929,13 +6994,44 @@ export interface EmailDiscoveryEvent {
     discoveredAt: string;
     timestamp: Date;
 }
+interface RedirectEvent$1 {
+    originalUrl: string;
+    finalUrl: string;
+    redirectCount: number;
+    statusCode: number;
+}
+/**
+ * Export format options
+ */
+export type ExportFormat = "json" | "jsonl" | "csv";
 /**
- * Generic handler function type for crawler event callbacks.
- * All crawler event handlers must return a Promise<void>.
+ * Handler with element bound to `this` context.
+ * Use `function` syntax (not arrow functions) to access `this`.
  *
- * @template T - The type of element or data passed to the handler
+ * @example
+ * ```typescript
+ * crawler.onText('h1', async function(text) {
+ *   console.log(text, this.tagName); // `this` is the element
+ * });
+ * ```
  */
-export type CrawlerHandler<T = any> = (element: T) => Promise<void>;
+export type ElementBoundHandler<TValue, TElement = Element> = (this: TElement, value: TValue) => Promise<void>;
+/**
+ * Handler for attribute extraction with element bound to `this`.
+ * Receives both the attribute value and attribute name.
+ */
+export type AttributeHandler = (this: Element, value: string, attributeName: string) => Promise<void>;
+/**
+ * Crawl statistics
+ */
+export interface CrawlStats {
+    urlsVisited: number;
+    urlsQueued: number;
+    urlsFailed: number;
+    startTime: number;
+    endTime?: number;
+    currentDepth: number;
+}
 /**
  * A powerful web crawler that provides event-driven HTML parsing and data extraction.
  * Supports caching, proxy rotation, retry mechanisms, and email lead discovery.
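The switch from `CrawlerHandler` to `ElementBoundHandler` makes the matched element available as `this`, which only works with `function` expressions; arrow functions still compile but silently lose the element context. A sketch using only the signatures declared above:

```typescript
declare const crawler: Crawler;

// `function` expression: `this` is the matched element.
crawler.onText('h1', async function (text) {
  console.log(text.trim(), this.tagName);
});

// Arrow function: compiles, but its lexical `this` is not the element,
// so the element context is unavailable inside the handler.
crawler.onText('h1', async (text) => {
  console.log(text.trim());
});
```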
@@ -6992,6 +7088,25 @@ export declare class Crawler {
     /** Adapter-specific request executor */
     private adapterExecutor;
     private adapterType;
+    /** Track pending execute() calls for proper done() behavior */
+    private pendingExecutions;
+    /** robots.txt parser and validator */
+    private robotsTxt;
+    /** AutoThrottle: track response times per domain for adaptive rate limiting */
+    private domainResponseTimes;
+    private domainCurrentDelay;
+    /** Crawl statistics */
+    private crawlStats;
+    /** URL depth tracking for maxDepth limit */
+    private urlDepthMap;
+    /** Lifecycle event handlers */
+    private startHandlers;
+    private finishHandlers;
+    private redirectHandlers;
+    /** Data collection for export */
+    private collectedData;
+    /** Flag to track if crawl has started */
+    private crawlStarted;
     /**
      * Creates a new Crawler instance with the specified configuration.
      *
@@ -7160,6 +7275,54 @@ export declare class Crawler {
      * ```
      */
     onEmailLeads(handler: (emails: string[]) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called before crawling starts.
+     * Useful for initialization, logging, or setup tasks.
+     *
+     * @param handler - Function to call before crawling begins
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onStart(async () => {
+     *   console.log('Crawl session started');
+     *   await initializeDatabase();
+     * });
+     * ```
+     */
+    onStart(handler: () => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when crawling finishes.
+     * Receives crawl statistics including URLs visited, failed, and timing.
+     *
+     * @param handler - Function to call when crawling completes
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onFinish(async (stats) => {
+     *   console.log(`Crawl completed: ${stats.urlsVisited} URLs in ${stats.endTime - stats.startTime}ms`);
+     *   await generateReport(stats);
+     * });
+     * ```
+     */
+    onFinish(handler: (stats: CrawlStats) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when a redirect is followed.
+     * Provides information about the original URL, final URL, and redirect count.
+     *
+     * @param handler - Function to handle redirect events
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onRedirect(async (event) => {
+     *   console.log(`Redirect: ${event.originalUrl} -> ${event.finalUrl}`);
+     *   trackRedirects(event);
+     * });
+     * ```
+     */
+    onRedirect(handler: (event: RedirectEvent$1) => Promise<void>): Crawler;
     /**
      * Registers a handler for raw response data.
      * Triggered for all responses, providing access to the raw Buffer data.
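The three lifecycle hooks chain like the existing handler registrations. A sketch built only from the signatures above:

```typescript
declare const crawler: Crawler;

crawler
  .onStart(async () => {
    console.log('crawl starting');
  })
  .onRedirect(async (event) => {
    console.log(`redirect (${event.statusCode}): ${event.originalUrl} -> ${event.finalUrl}`);
  })
  .onFinish(async (stats) => {
    // endTime is optional, so fall back to "now" for an in-flight snapshot.
    const elapsed = (stats.endTime ?? Date.now()) - stats.startTime;
    console.log(`${stats.urlsVisited} visited, ${stats.urlsFailed} failed in ${elapsed}ms`);
  });
```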
@@ -7255,21 +7418,23 @@ export declare class Crawler {
     /**
      * Registers a handler for href attributes from anchor and link elements.
      * Automatically resolves relative URLs to absolute URLs.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
-     * @param handler - Function to handle href URLs as strings
+     * @param handler - Function receiving href string, with `this` bound to the element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
-     * crawler.onHref(async (href) => {
+     * crawler.onHref(async function(href) {
      *   console.log('Found URL:', href);
+     *   console.log('Link text:', this.textContent); // `this` is the anchor/link element
      *   if (href.includes('/api/')) {
      *     await crawler.visit(href);
      *   }
      * });
      * ```
      */
-    onHref(handler: (href: string) => Promise<void>): Crawler;
+    onHref(handler: ElementBoundHandler<string, HTMLAnchorElement | HTMLLinkElement>): Crawler;
     /**
      * Registers a handler for elements matching a CSS selector.
      * Provides fine-grained control over which elements to process.
@@ -7311,55 +7476,57 @@ export declare class Crawler {
     /**
      * Registers a handler for HTML element attributes.
      * Can extract specific attributes from all elements or from elements matching a selector.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
      * @param attribute - The attribute name to extract
-     * @param handler - Function to handle attribute values
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @overload
      * @param selection - CSS selector to filter elements
      * @param attribute - The attribute name to extract
-     * @param handler - Function to handle attribute values
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
      * // Extract all 'data-id' attributes
-     * crawler.onAttribute('data-id', async (value) => {
-     *   console.log('Found data-id:', value);
+     * crawler.onAttribute('data-id', async function(value, attrName) {
+     *   console.log('Found', attrName, ':', value, 'on:', this.tagName);
      * });
      *
      * // Extract 'src' attributes from images only
-     * crawler.onAttribute('img', 'src', async (src) => {
-     *   console.log('Image source:', src);
+     * crawler.onAttribute('img', 'src', async function(value) {
+     *   console.log('Image source:', value, 'alt:', this.getAttribute('alt'));
      * });
      * ```
      */
-    onAttribute(attribute: string, handler: CrawlerHandler<string>): Crawler;
-    onAttribute(selection: string, attribute: string, handler: CrawlerHandler<string>): Crawler;
+    onAttribute(attribute: string, handler: AttributeHandler): Crawler;
+    onAttribute(selection: string, attribute: string, handler: AttributeHandler): Crawler;
     /**
      * Registers a handler for text content of elements matching a CSS selector.
      * Extracts and processes the textContent of matching elements.
+     * Use `function` syntax (not arrow) to access `this` as the element.
     *
     * @param selection - CSS selector to match elements
-     * @param handler - Function to handle extracted text content
+     * @param handler - Function receiving text string, with `this` bound to element
     * @returns The crawler instance for method chaining
     *
     * @example
     * ```typescript
-     * // Extract all heading text
-     * crawler.onText('h1, h2, h3', async (text) => {
-     *   console.log('Heading:', text.trim());
+     * // Extract all heading text with element context
+     * crawler.onText('h1, h2, h3', async function(text) {
+     *   console.log('Heading:', text.trim(), 'Tag:', this.tagName);
     * });
     *
-     * // Extract product prices
-     * crawler.onText('.price', async (price) => {
-     *   const numericPrice = parseFloat(price.replace(/[^\d.]/g, ''));
-     *   console.log('Price value:', numericPrice);
+     * // Extract product prices with element context
+     * crawler.onText('.price', async function(text) {
+     *   const numericPrice = parseFloat(text.replace(/[^\d.]/g, ''));
+     *   console.log('Price:', numericPrice, 'Product:', this.closest('.product')?.id);
     * });
     * ```
     */
-    onText(selection: string, handler: CrawlerHandler<string>): Crawler;
+    onText(selection: string, handler: ElementBoundHandler<string>): Crawler;
    private _onBody;
    private _onAttribute;
    private _onText;
@@ -7374,6 +7541,86 @@ export declare class Crawler {
     private _onEmailLeads;
     private _onRawResponse;
     private _onResponse;
+    /**
+     * Calculate adaptive delay based on server response times (AutoThrottle)
+     */
+    private calculateAutoThrottleDelay;
+    /**
+     * Get current AutoThrottle delay for a domain
+     */
+    private getAutoThrottleDelay;
+    /**
+     * Handle 429 Too Many Requests response with Retry-After header parsing
+     */
+    private handle429Response;
+    /**
+     * Check if URL passes all crawl limit checks
+     */
+    private checkCrawlLimits;
+    /**
+     * Check if a link should be followed based on nofollow rules
+     */
+    private shouldFollowLink;
+    /**
+     * Check response size against maxResponseSize limit
+     */
+    private checkResponseSize;
+    /**
+     * Collect data for later export
+     *
+     * @param data - Data to collect (will be added to export buffer)
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onDocument(async (doc) => {
+     *   crawler.collect({
+     *     title: doc.title,
+     *     url: doc.URL,
+     *     h1: doc.querySelector('h1')?.textContent
+     *   });
+     * });
+     * ```
+     */
+    collect(data: any): Crawler;
+    /**
+     * Get all collected data
+     */
+    getCollectedData(): any[];
+    /**
+     * Clear collected data
+     */
+    clearCollectedData(): Crawler;
+    /**
+     * Export collected data to a file
+     *
+     * @param filePath - Output file path
+     * @param format - Export format: 'json', 'jsonl', or 'csv'
+     *
+     * @example
+     * ```typescript
+     * await crawler.waitForAll();
+     * await crawler.exportData('./output.json', 'json');
+     * await crawler.exportData('./output.csv', 'csv');
+     * ```
+     */
+    exportData(filePath: string, format?: ExportFormat): Promise<void>;
+    /**
+     * Get current crawl statistics
+     */
+    getStats(): CrawlStats;
+    /**
+     * Trigger onStart handlers (called once on first visit)
+     */
+    private triggerStartHandlers;
+    /**
+     * Trigger onFinish handlers
+     */
+    private triggerFinishHandlers;
+    /**
+     * Trigger onRedirect handlers
+     */
+    private triggerRedirectHandlers;
     private buildUrl;
     /**
      * Visits a URL and processes it according to registered event handlers.
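`collect`, `exportData`, and `getStats` form a small extract-and-export pipeline. A sketch combining them with `onDocument`, `visit`, and `waitForAll` as used in the inline examples above (run inside an async context):

```typescript
declare const crawler: Crawler;

// One record per parsed document goes into the export buffer.
crawler.onDocument(async (doc) => {
  crawler.collect({ url: doc.URL, title: doc.title });
});

await crawler.visit('https://example.com');
await crawler.waitForAll();                          // let the queue drain, as in the inline example
await crawler.exportData('./pages.jsonl', 'jsonl');  // one JSON object per line
console.log(crawler.getStats());                     // CrawlStats snapshot
```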
@@ -7489,6 +7736,17 @@ export declare class Crawler {
      */
     done(): Promise<void>;
     close(): Promise<void>;
+    /**
+     * Destroys the crawler instance and releases all resources.
+     * Clears all queued tasks, closes caches, and cleans up event handlers.
+     * @returns Promise that resolves when destruction is complete
+     * @example
+     * ```typescript
+     * await crawler.destroy();
+     * // Crawler is now fully cleaned up
+     * ```
+     */
+    destroy(): Promise<void>;
 }
 
 export {};
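Per the declaration above, `destroy()` goes beyond `close()`: it also clears queued tasks and event handlers. A teardown sketch using only `visit`, `done`, and `destroy` as declared:

```typescript
declare const crawler: Crawler;

async function run(): Promise<void> {
  try {
    await crawler.visit('https://example.com');
    await crawler.done();    // wait for outstanding work to finish
  } finally {
    await crawler.destroy(); // release caches, queues, and handlers
  }
}
```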