rezo 1.0.42 → 1.0.44

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/dist/adapters/curl.cjs +131 -29
  2. package/dist/adapters/curl.js +131 -29
  3. package/dist/adapters/entries/curl.d.ts +65 -0
  4. package/dist/adapters/entries/fetch.d.ts +65 -0
  5. package/dist/adapters/entries/http.d.ts +65 -0
  6. package/dist/adapters/entries/http2.d.ts +65 -0
  7. package/dist/adapters/entries/react-native.d.ts +65 -0
  8. package/dist/adapters/entries/xhr.d.ts +65 -0
  9. package/dist/adapters/http2.cjs +209 -22
  10. package/dist/adapters/http2.js +209 -22
  11. package/dist/adapters/index.cjs +6 -6
  12. package/dist/cache/index.cjs +9 -13
  13. package/dist/cache/index.js +0 -2
  14. package/dist/core/rezo.cjs +7 -0
  15. package/dist/core/rezo.js +7 -0
  16. package/dist/crawler/addon/decodo/index.cjs +1 -0
  17. package/dist/crawler/addon/decodo/index.js +1 -0
  18. package/dist/crawler/crawler-options.cjs +1 -0
  19. package/dist/crawler/crawler-options.js +1 -0
  20. package/dist/crawler/crawler.cjs +1070 -0
  21. package/dist/crawler/crawler.js +1068 -0
  22. package/dist/crawler/index.cjs +40 -0
  23. package/dist/{plugin → crawler}/index.js +4 -2
  24. package/dist/crawler/plugin/file-cacher.cjs +19 -0
  25. package/dist/crawler/plugin/file-cacher.js +19 -0
  26. package/dist/crawler/plugin/index.cjs +1 -0
  27. package/dist/crawler/plugin/index.js +1 -0
  28. package/dist/crawler/plugin/navigation-history.cjs +43 -0
  29. package/dist/crawler/plugin/navigation-history.js +43 -0
  30. package/dist/crawler/plugin/robots-txt.cjs +2 -0
  31. package/dist/crawler/plugin/robots-txt.js +2 -0
  32. package/dist/crawler/plugin/url-store.cjs +18 -0
  33. package/dist/crawler/plugin/url-store.js +18 -0
  34. package/dist/crawler.d.ts +511 -183
  35. package/dist/entries/crawler.cjs +5 -5
  36. package/dist/entries/crawler.js +2 -2
  37. package/dist/index.cjs +27 -24
  38. package/dist/index.d.ts +73 -0
  39. package/dist/index.js +1 -0
  40. package/dist/internal/agents/base.cjs +113 -0
  41. package/dist/internal/agents/base.js +110 -0
  42. package/dist/internal/agents/http-proxy.cjs +89 -0
  43. package/dist/internal/agents/http-proxy.js +86 -0
  44. package/dist/internal/agents/https-proxy.cjs +176 -0
  45. package/dist/internal/agents/https-proxy.js +173 -0
  46. package/dist/internal/agents/index.cjs +10 -0
  47. package/dist/internal/agents/index.js +5 -0
  48. package/dist/internal/agents/socks-client.cjs +571 -0
  49. package/dist/internal/agents/socks-client.js +567 -0
  50. package/dist/internal/agents/socks-proxy.cjs +75 -0
  51. package/dist/internal/agents/socks-proxy.js +72 -0
  52. package/dist/platform/browser.d.ts +65 -0
  53. package/dist/platform/bun.d.ts +65 -0
  54. package/dist/platform/deno.d.ts +65 -0
  55. package/dist/platform/node.d.ts +65 -0
  56. package/dist/platform/react-native.d.ts +65 -0
  57. package/dist/platform/worker.d.ts +65 -0
  58. package/dist/proxy/index.cjs +18 -16
  59. package/dist/proxy/index.js +17 -12
  60. package/dist/queue/index.cjs +8 -8
  61. package/dist/responses/buildError.cjs +11 -2
  62. package/dist/responses/buildError.js +11 -2
  63. package/dist/responses/universal/index.cjs +11 -11
  64. package/dist/utils/curl.cjs +317 -0
  65. package/dist/utils/curl.js +314 -0
  66. package/package.json +2 -6
  67. package/dist/cache/file-cacher.cjs +0 -264
  68. package/dist/cache/file-cacher.js +0 -261
  69. package/dist/cache/url-store.cjs +0 -288
  70. package/dist/cache/url-store.js +0 -285
  71. package/dist/plugin/addon/decodo/index.cjs +0 -1
  72. package/dist/plugin/addon/decodo/index.js +0 -1
  73. package/dist/plugin/crawler-options.cjs +0 -1
  74. package/dist/plugin/crawler-options.js +0 -1
  75. package/dist/plugin/crawler.cjs +0 -519
  76. package/dist/plugin/crawler.js +0 -517
  77. package/dist/plugin/index.cjs +0 -36
  78. /package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
  79. /package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
  80. /package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
  81. /package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
  82. /package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
  83. /package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
  84. /package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
  85. /package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
  86. /package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
  87. /package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
  88. /package/dist/{plugin → crawler}/scraper.cjs +0 -0
  89. /package/dist/{plugin → crawler}/scraper.js +0 -0
package/dist/crawler.d.ts CHANGED
@@ -6,43 +6,24 @@ import { SecureContext, TLSSocket } from 'node:tls';
6
6
  import { Cookie as TouchCookie, CookieJar as TouchCookieJar, CreateCookieOptions } from 'tough-cookie';
7
7
 
8
8
  /**
9
- * FileCacher - Cross-runtime SQLite-based file caching system
9
+ * CrawlerCache - High-performance SQLite-based response caching for web crawlers
10
10
  *
11
- * Provides persistent key-value storage with namespace support, TTL expiration,
12
- * and optional zstd compression for efficient data storage.
11
+ * Optimized specifically for crawler workloads with:
12
+ * - WAL mode for high-throughput concurrent reads/writes
13
+ * - Batch operations for efficient bulk storage
14
+ * - Domain-based namespacing for organized cache management
15
+ * - Optional zstd compression for storage efficiency
13
16
  *
14
17
  * @module cache/file-cacher
15
18
  * @author Rezo HTTP Client Library
16
- *
17
- * @example
18
- * ```typescript
19
- * import { FileCacher } from 'rezo';
20
- *
21
- * // Create a file cacher instance
22
- * const cacher = await FileCacher.create({
23
- * cacheDir: './cache',
24
- * ttl: 3600000, // 1 hour
25
- * compression: true,
26
- * encryptNamespace: true
27
- * });
28
- *
29
- * // Store and retrieve data
30
- * await cacher.set('user:123', { name: 'John' }, 3600000, 'users');
31
- * const user = await cacher.get('user:123', 'users');
32
- *
33
- * // Check existence and cleanup
34
- * const exists = await cacher.has('user:123', 'users');
35
- * await cacher.delete('user:123', 'users');
36
- * await cacher.close();
37
- * ```
38
19
  */
39
20
  /**
40
- * Configuration options for FileCacher
21
+ * Configuration options for CrawlerCache
41
22
  */
42
23
  export interface FileCacherOptions {
43
24
  /**
44
25
  * Directory path for storing cache databases
45
- * @default './cache'
26
+ * @default '/tmp/rezo-crawler/cache'
46
27
  */
47
28
  cacheDir?: string;
48
29
  /**
@@ -51,23 +32,18 @@ export interface FileCacherOptions {
51
32
  */
52
33
  ttl?: number;
53
34
  /**
54
- * Enable zstd compression for stored values
35
+ * Enable zstd compression for stored values (Node.js 22.15+)
55
36
  * Reduces storage size but adds CPU overhead
56
37
  * @default false
57
38
  */
58
39
  compression?: boolean;
59
40
  /**
60
- * Enable soft delete (mark as deleted instead of removing)
61
- * @default false
62
- */
63
- softDelete?: boolean;
64
- /**
65
- * Hash namespace names for privacy/security
41
+ * Hash namespace names for privacy
66
42
  * @default false
67
43
  */
68
44
  encryptNamespace?: boolean;
69
45
  /**
70
- * Maximum number of entries per namespace (0 = unlimited)
46
+ * Maximum entries per namespace (0 = unlimited)
71
47
  * @default 0
72
48
  */
73
49
  maxEntries?: number;
@@ -77,157 +53,76 @@ declare class FileCacher {
77
53
  private readonly options;
78
54
  private readonly cacheDir;
79
55
  private closed;
80
- /**
81
- * Private constructor - use FileCacher.create() instead
82
- */
83
56
  private constructor();
84
57
  /**
85
58
  * Create a new FileCacher instance
86
- *
87
- * @param options - Configuration options
88
- * @returns Promise resolving to initialized FileCacher instance
89
- *
90
- * @example
91
- * ```typescript
92
- * const cacher = await FileCacher.create({
93
- * cacheDir: './my-cache',
94
- * ttl: 3600000,
95
- * compression: true
96
- * });
97
- * ```
98
59
  */
99
60
  static create(options?: FileCacherOptions): Promise<FileCacher>;
100
61
  /**
101
- * Get or create database for a namespace
62
+ * Get or create optimized database for a namespace (domain)
102
63
  */
103
64
  private getDatabase;
104
65
  /**
105
- * Store a value in the cache
106
- *
107
- * @param key - Unique key for the cached item
108
- * @param value - Value to cache (will be JSON serialized)
109
- * @param ttl - Time-to-live in milliseconds (uses default if not specified)
110
- * @param namespace - Namespace for isolation (default: 'default')
111
- * @returns Promise resolving when stored
112
- *
113
- * @example
114
- * ```typescript
115
- * // Store with default TTL
116
- * await cacher.set('key1', { data: 'value' });
117
- *
118
- * // Store with custom TTL and namespace
119
- * await cacher.set('key2', responseData, 3600000, 'api-responses');
120
- * ```
66
+ * Store a response in the cache
121
67
  */
122
68
  set<T = any>(key: string, value: T, ttl?: number, namespace?: string): Promise<void>;
123
69
  /**
124
- * Retrieve a value from the cache
125
- *
126
- * @param key - Key of the cached item
127
- * @param namespace - Namespace to search in (default: 'default')
128
- * @returns Promise resolving to cached value or null if not found/expired
129
- *
130
- * @example
131
- * ```typescript
132
- * const data = await cacher.get<MyType>('key1', 'my-namespace');
133
- * if (data) {
134
- * console.log('Cache hit:', data);
135
- * }
136
- * ```
70
+ * Store multiple responses in a single transaction (batch operation)
71
+ */
72
+ setMany<T = any>(entries: Array<{
73
+ key: string;
74
+ value: T;
75
+ ttl?: number;
76
+ }>, namespace?: string): Promise<void>;
77
+ /**
78
+ * Retrieve a cached response
137
79
  */
138
80
  get<T = any>(key: string, namespace?: string): Promise<T | null>;
139
81
  /**
140
- * Check if a key exists in the cache and is not expired
141
- *
142
- * @param key - Key to check
143
- * @param namespace - Namespace to search in (default: 'default')
144
- * @returns Promise resolving to true if key exists and is valid
145
- *
146
- * @example
147
- * ```typescript
148
- * if (await cacher.has('key1', 'my-namespace')) {
149
- * const data = await cacher.get('key1', 'my-namespace');
150
- * }
151
- * ```
82
+ * Check if a key exists and is not expired
152
83
  */
153
84
  has(key: string, namespace?: string): Promise<boolean>;
85
+ /**
86
+ * Check multiple keys at once (batch operation)
87
+ */
88
+ hasMany(keys: string[], namespace?: string): Promise<Set<string>>;
154
89
  /**
155
90
  * Delete a key from the cache
156
- *
157
- * @param key - Key to delete
158
- * @param namespace - Namespace to delete from (default: 'default')
159
- * @returns Promise resolving to true if key was deleted
160
- *
161
- * @example
162
- * ```typescript
163
- * await cacher.delete('obsolete-key', 'my-namespace');
164
- * ```
165
91
  */
166
92
  delete(key: string, namespace?: string): Promise<boolean>;
167
93
  /**
168
94
  * Clear all entries in a namespace
169
- *
170
- * @param namespace - Namespace to clear (default: 'default')
171
- * @returns Promise resolving when cleared
172
- *
173
- * @example
174
- * ```typescript
175
- * // Clear all cached data for a domain
176
- * await cacher.clear('example.com');
177
- * ```
178
95
  */
179
96
  clear(namespace?: string): Promise<void>;
180
97
  /**
181
- * Remove all expired entries from a namespace
182
- *
183
- * @param namespace - Namespace to cleanup (default: 'default')
184
- * @returns Promise resolving to number of entries removed
185
- *
186
- * @example
187
- * ```typescript
188
- * const removed = await cacher.cleanup('my-namespace');
189
- * console.log(`Removed ${removed} expired entries`);
190
- * ```
98
+ * Remove all expired entries
191
99
  */
192
100
  cleanup(namespace?: string): Promise<number>;
193
101
  /**
194
- * Get statistics for a namespace
195
- *
196
- * @param namespace - Namespace to get stats for (default: 'default')
197
- * @returns Promise resolving to cache statistics
198
- *
199
- * @example
200
- * ```typescript
201
- * const stats = await cacher.stats('my-namespace');
202
- * console.log(`${stats.count} entries, ${stats.size} bytes`);
203
- * ```
102
+ * Get cache statistics for a namespace
204
103
  */
205
104
  stats(namespace?: string): Promise<{
206
105
  count: number;
207
106
  expired: number;
208
- deleted: number;
209
107
  }>;
210
108
  /**
211
- * Close all database connections and release resources
212
- *
213
- * @returns Promise resolving when all connections are closed
214
- *
215
- * @example
216
- * ```typescript
217
- * // Always close when done
218
- * await cacher.close();
219
- * ```
109
+ * Close all database connections
220
110
  */
221
111
  close(): Promise<void>;
222
- /**
223
- * Check if the cacher has been closed
224
- */
225
112
  get isClosed(): boolean;
226
- /**
227
- * Get the cache directory path
228
- */
229
113
  get directory(): string;
230
114
  }
115
+ export interface CrawlSession {
116
+ sessionId: string;
117
+ baseUrl: string;
118
+ startedAt: number;
119
+ lastActivityAt: number;
120
+ status: "running" | "paused" | "completed" | "failed";
121
+ urlsVisited: number;
122
+ urlsQueued: number;
123
+ urlsFailed: number;
124
+ metadata?: string;
125
+ }
231
126
  export interface RezoHttpHeaders {
232
127
  accept?: string | undefined;
233
128
  "accept-encoding"?: string | undefined;
@@ -4464,6 +4359,71 @@ declare class Rezo {
4464
4359
  * @see {@link cookieJar} - Access the underlying RezoCookieJar for more control
4465
4360
  */
4466
4361
  clearCookies(): void;
4362
+ /**
4363
+ * Convert a Rezo request configuration to a cURL command string.
4364
+ *
4365
+ * Generates a valid cURL command that can be executed in a terminal to
4366
+ * reproduce the same HTTP request. Useful for:
4367
+ * - Debugging and sharing requests
4368
+ * - Documentation and examples
4369
+ * - Testing requests outside of Node.js
4370
+ * - Exporting requests to other tools
4371
+ *
4372
+ * @param config - Request configuration object
4373
+ * @returns A cURL command string
4374
+ *
4375
+ * @example
4376
+ * ```typescript
4377
+ * const curl = Rezo.toCurl({
4378
+ * url: 'https://api.example.com/users',
4379
+ * method: 'POST',
4380
+ * headers: { 'Content-Type': 'application/json' },
4381
+ * body: { name: 'John', email: 'john@example.com' }
4382
+ * });
4383
+ * // Output: curl -X POST -H 'content-type: application/json' --data-raw '{"name":"John","email":"john@example.com"}' -L --compressed 'https://api.example.com/users'
4384
+ * ```
4385
+ */
4386
+ static toCurl(config: RezoRequestConfig | RezoRequestOptions): string;
4387
+ /**
4388
+ * Parse a cURL command string into a Rezo request configuration.
4389
+ *
4390
+ * Converts a cURL command into a configuration object that can be
4391
+ * passed directly to Rezo request methods. Useful for:
4392
+ * - Importing requests from browser DevTools
4393
+ * - Converting curl examples from API documentation
4394
+ * - Migrating scripts from curl to Rezo
4395
+ *
4396
+ * Supports common cURL options:
4397
+ * - `-X, --request` - HTTP method
4398
+ * - `-H, --header` - Request headers
4399
+ * - `-d, --data, --data-raw, --data-binary` - Request body
4400
+ * - `-u, --user` - Basic authentication
4401
+ * - `-x, --proxy` - Proxy configuration
4402
+ * - `--socks5, --socks4` - SOCKS proxy
4403
+ * - `-L, --location` - Follow redirects
4404
+ * - `--max-redirs` - Maximum redirects
4405
+ * - `--max-time` - Request timeout
4406
+ * - `-k, --insecure` - Skip TLS verification
4407
+ * - `-A, --user-agent` - User agent header
4408
+ *
4409
+ * @param curlCommand - A cURL command string
4410
+ * @returns A request configuration object
4411
+ *
4412
+ * @example
4413
+ * ```typescript
4414
+ * // From browser DevTools "Copy as cURL"
4415
+ * const config = Rezo.fromCurl(`
4416
+ * curl 'https://api.example.com/data' \\
4417
+ * -H 'Authorization: Bearer token123' \\
4418
+ * -H 'Content-Type: application/json'
4419
+ * `);
4420
+ *
4421
+ * // Use with Rezo
4422
+ * const rezo = new Rezo();
4423
+ * const response = await rezo.request(config);
4424
+ * ```
4425
+ */
4426
+ static fromCurl(curlCommand: string): RezoRequestOptions;
4467
4427
  }
4468
4428
  /**
4469
4429
  * Rezo HTTP Client - Core Types
@@ -6156,17 +6116,25 @@ declare class Decodo {
6156
6116
  /**
6157
6117
  * Create a new Decodo client instance
6158
6118
  *
6159
- * @param config - Decodo API configuration
6160
- * @throws Error if username or password is missing
6119
+ * @param config - Decodo API configuration (supports username/password OR token auth)
6120
+ * @throws Error if authentication credentials are missing
6161
6121
  *
6162
6122
  * @example
6163
6123
  * ```typescript
6124
+ * // Username/password authentication
6164
6125
  * const decodo = new Decodo({
6165
6126
  * username: 'user',
6166
6127
  * password: 'password',
6167
6128
  * headless: 'html',
6168
6129
  * country: 'US'
6169
6130
  * });
6131
+ *
6132
+ * // Token authentication (alternative)
6133
+ * const decodo = new Decodo({
6134
+ * token: 'your_api_token',
6135
+ * headless: 'html',
6136
+ * country: 'US'
6137
+ * });
6170
6138
  * ```
6171
6139
  */
6172
6140
  constructor(config: DecodoConfig);
@@ -6284,6 +6252,15 @@ declare class Decodo {
6284
6252
  * const regexDomain: Domain = '^(sub|api)\.example\.com$';
6285
6253
  */
6286
6254
  export type Domain = string[] | string | RegExp;
6255
+ /**
6256
+ * Supported HTTP adapter types for crawler requests
6257
+ * @description
6258
+ * - 'http': Standard Node.js HTTP/HTTPS adapter (default)
6259
+ * - 'http2': HTTP/2 adapter with session pooling
6260
+ * - 'curl': cURL adapter for maximum compatibility
6261
+ * - 'fetch': Browser-compatible Fetch API adapter
6262
+ */
6263
+ export type CrawlerAdapterType = "http" | "http2" | "curl" | "fetch";
6287
6264
  /**
6288
6265
  * Configuration interface for the CrawlerOptions class
6289
6266
  * @description Defines all available options for configuring web crawler behavior,
@@ -6292,6 +6269,12 @@ export type Domain = string[] | string | RegExp;
6292
6269
  export interface ICrawlerOptions {
6293
6270
  /** Base URL for the crawler - the starting point for crawling operations */
6294
6271
  baseUrl: string;
6272
+ /** HTTP adapter to use for requests (default: 'http') */
6273
+ adapter?: CrawlerAdapterType;
6274
+ /** Enable navigation history for resumable crawling (default: false) */
6275
+ enableNavigationHistory?: boolean;
6276
+ /** Session ID for navigation history - allows resuming specific crawl sessions */
6277
+ sessionId?: string;
6295
6278
  /** Whether to reject unauthorized SSL certificates (default: true) */
6296
6279
  rejectUnauthorized?: boolean;
6297
6280
  /** Custom user agent string for HTTP requests */
@@ -6381,6 +6364,42 @@ export interface ICrawlerOptions {
6381
6364
  } | {
6382
6365
  enable: false;
6383
6366
  } | undefined | false;
6367
+ /** Decodo proxy service configuration for specific domains or global use */
6368
+ decodo?: {
6369
+ enable: true;
6370
+ labs: [
6371
+ {
6372
+ domain: Domain;
6373
+ isGlobal?: boolean;
6374
+ options: DecodoOptions;
6375
+ queueOptions: queueOptions$1;
6376
+ }
6377
+ ];
6378
+ } | {
6379
+ enable: false;
6380
+ } | undefined | false;
6381
+ /** Maximum crawl depth from start URL (0 = unlimited, default: 0) */
6382
+ maxDepth?: number;
6383
+ /** Maximum total URLs to crawl (0 = unlimited, default: 0) */
6384
+ maxUrls?: number;
6385
+ /** Maximum response size in bytes to process (0 = unlimited, default: 0) */
6386
+ maxResponseSize?: number;
6387
+ /** Respect robots.txt rules (default: false) */
6388
+ respectRobotsTxt?: boolean;
6389
+ /** Follow rel="nofollow" links (default: false - ignores nofollow links) */
6390
+ followNofollow?: boolean;
6391
+ /** Enable automatic throttling based on server response times (default: true) */
6392
+ autoThrottle?: boolean;
6393
+ /** Target request delay in ms for AutoThrottle (default: 1000) */
6394
+ autoThrottleTargetDelay?: number;
6395
+ /** Minimum delay between requests in ms (default: 100) */
6396
+ autoThrottleMinDelay?: number;
6397
+ /** Maximum delay between requests in ms (default: 60000) */
6398
+ autoThrottleMaxDelay?: number;
6399
+ /** Maximum time to wait on 429 response in ms (default: 1800000 = 30 min) */
6400
+ maxWaitOn429?: number;
6401
+ /** Always wait on 429 regardless of time, shows warning (default: false) */
6402
+ alwaysWaitOn429?: boolean;
6384
6403
  }
6385
6404
  /**
6386
6405
  * Advanced web crawler configuration class with support for domain-specific settings
@@ -6415,6 +6434,12 @@ export interface ICrawlerOptions {
6415
6434
  export declare class CrawlerOptions {
6416
6435
  /** Base URL for the crawler - the starting point for crawling operations */
6417
6436
  baseUrl: string;
6437
+ /** HTTP adapter to use for requests */
6438
+ adapter: CrawlerAdapterType;
6439
+ /** Enable navigation history for resumable crawling */
6440
+ enableNavigationHistory: boolean;
6441
+ /** Session ID for navigation history - allows resuming specific crawl sessions */
6442
+ sessionId: string;
6418
6443
  /** Whether to reject unauthorized SSL certificates */
6419
6444
  rejectUnauthorized?: boolean;
6420
6445
  /** Custom user agent string for HTTP requests */
@@ -6451,6 +6476,28 @@ export declare class CrawlerOptions {
6451
6476
  throwFatalError?: boolean;
6452
6477
  /** Enable debug logging */
6453
6478
  debug?: boolean;
6479
+ /** Maximum crawl depth from start URL (0 = unlimited) */
6480
+ maxDepth: number;
6481
+ /** Maximum total URLs to crawl (0 = unlimited) */
6482
+ maxUrls: number;
6483
+ /** Maximum response size in bytes to process (0 = unlimited) */
6484
+ maxResponseSize: number;
6485
+ /** Respect robots.txt rules */
6486
+ respectRobotsTxt: boolean;
6487
+ /** Follow rel="nofollow" links */
6488
+ followNofollow: boolean;
6489
+ /** Enable automatic throttling based on server response times */
6490
+ autoThrottle: boolean;
6491
+ /** Target request delay in ms for AutoThrottle */
6492
+ autoThrottleTargetDelay: number;
6493
+ /** Minimum delay between requests in ms */
6494
+ autoThrottleMinDelay: number;
6495
+ /** Maximum delay between requests in ms */
6496
+ autoThrottleMaxDelay: number;
6497
+ /** Maximum time to wait on 429 response in ms */
6498
+ maxWaitOn429: number;
6499
+ /** Always wait on 429 regardless of time */
6500
+ alwaysWaitOn429: boolean;
6454
6501
  /** Internal storage for Oxylabs configurations with domain mapping */
6455
6502
  oxylabs: {
6456
6503
  domain?: Domain;
@@ -6832,13 +6879,44 @@ export interface EmailDiscoveryEvent {
6832
6879
  discoveredAt: string;
6833
6880
  timestamp: Date;
6834
6881
  }
6882
+ interface RedirectEvent$1 {
6883
+ originalUrl: string;
6884
+ finalUrl: string;
6885
+ redirectCount: number;
6886
+ statusCode: number;
6887
+ }
6888
+ /**
6889
+ * Export format options
6890
+ */
6891
+ export type ExportFormat = "json" | "jsonl" | "csv";
6835
6892
  /**
6836
- * Generic handler function type for crawler event callbacks.
6837
- * All crawler event handlers must return a Promise<void>.
6893
+ * Handler with element bound to `this` context.
6894
+ * Use `function` syntax (not arrow functions) to access `this`.
6838
6895
  *
6839
- * @template T - The type of element or data passed to the handler
6896
+ * @example
6897
+ * ```typescript
6898
+ * crawler.onText('h1', async function(text) {
6899
+ * console.log(text, this.tagName); // `this` is the element
6900
+ * });
6901
+ * ```
6902
+ */
6903
+ export type ElementBoundHandler<TValue, TElement = Element> = (this: TElement, value: TValue) => Promise<void>;
6904
+ /**
6905
+ * Handler for attribute extraction with element bound to `this`.
6906
+ * Receives both the attribute value and attribute name.
6907
+ */
6908
+ export type AttributeHandler = (this: Element, value: string, attributeName: string) => Promise<void>;
6909
+ /**
6910
+ * Crawl statistics
6840
6911
  */
6841
- export type CrawlerHandler<T = any> = (element: T) => Promise<void>;
6912
+ export interface CrawlStats {
6913
+ urlsVisited: number;
6914
+ urlsQueued: number;
6915
+ urlsFailed: number;
6916
+ startTime: number;
6917
+ endTime?: number;
6918
+ currentDepth: number;
6919
+ }
6842
6920
  /**
6843
6921
  * A powerful web crawler that provides event-driven HTML parsing and data extraction.
6844
6922
  * Supports caching, proxy rotation, retry mechanisms, and email lead discovery.
@@ -6886,29 +6964,126 @@ export declare class Crawler {
6886
6964
  private isStorageReady;
6887
6965
  private isCacheReady;
6888
6966
  private leadsFinder;
6967
+ /** Navigation history for resumable crawling */
6968
+ private navigationHistory;
6969
+ private isNavigationHistoryReady;
6970
+ private isSessionReady;
6971
+ private currentSession;
6972
+ private navigationHistoryInitPromise;
6973
+ /** Adapter-specific request executor */
6974
+ private adapterExecutor;
6975
+ private adapterType;
6976
+ /** Track pending execute() calls for proper done() behavior */
6977
+ private pendingExecutions;
6978
+ /** robots.txt parser and validator */
6979
+ private robotsTxt;
6980
+ /** AutoThrottle: track response times per domain for adaptive rate limiting */
6981
+ private domainResponseTimes;
6982
+ private domainCurrentDelay;
6983
+ /** Crawl statistics */
6984
+ private crawlStats;
6985
+ /** URL depth tracking for maxDepth limit */
6986
+ private urlDepthMap;
6987
+ /** Lifecycle event handlers */
6988
+ private startHandlers;
6989
+ private finishHandlers;
6990
+ private redirectHandlers;
6991
+ /** Data collection for export */
6992
+ private collectedData;
6993
+ /** Flag to track if crawl has started */
6994
+ private crawlStarted;
6889
6995
  /**
6890
6996
  * Creates a new Crawler instance with the specified configuration.
6891
6997
  *
6892
- * @param option - Primary crawler configuration options
6893
- * @param backup - Optional backup HTTP client configuration for failover scenarios
6998
+ * @param crawlerOptions - Crawler configuration options
6999
+ * @param http - Optional Rezo HTTP client instance (creates default if not provided)
6894
7000
  *
6895
7001
  * @example
6896
7002
  * ```typescript
7003
+ * // Basic usage (creates default Rezo instance)
6897
7004
  * const crawler = new Crawler({
6898
- * http: primaryHttpClient,
6899
- * baseUrl: 'https://api.example.com',
6900
- * timeout: 30000,
7005
+ * baseUrl: 'https://example.com',
6901
7006
  * enableCache: true,
6902
7007
  * cacheDir: './cache',
6903
- * socksProxies: [{ host: '127.0.0.1', port: 9050 }]
6904
- * }, {
6905
- * http: backupHttpClient,
6906
- * useProxy: false,
6907
- * concurrency: 5
6908
7008
  * });
7009
+ *
7010
+ * // With resumable crawling
7011
+ * const crawler = new Crawler({
7012
+ * baseUrl: 'https://example.com',
7013
+ * enableNavigationHistory: true,
7014
+ * sessionId: 'my-session',
7015
+ * cacheDir: './cache',
7016
+ * });
7017
+ *
7018
+ * // With custom Rezo instance
7019
+ * const crawler = new Crawler({
7020
+ * baseUrl: 'https://example.com',
7021
+ * adapter: 'curl',
7022
+ * }, myRezoInstance);
6909
7023
  * ```
6910
7024
  */
6911
- constructor(crawlerOptions: ICrawlerOptions, http: Rezo);
7025
+ constructor(crawlerOptions: ICrawlerOptions, http?: Rezo);
7026
+ /**
7027
+ * Initialize the HTTP adapter based on configuration
7028
+ */
7029
+ private initializeAdapter;
7030
+ /**
7031
+ * Initialize navigation history and session
7032
+ */
7033
+ private initializeNavigationHistory;
7034
+ /**
7035
+ * Wait for navigation history and session to be ready
7036
+ */
7037
+ private waitForNavigationHistory;
7038
+ /**
7039
+ * Ensure navigation history is ready and return it (or null if not enabled)
7040
+ * This is used by visit() and other methods that need to write to navigation history
7041
+ */
7042
+ private ensureNavigationHistoryReady;
7043
+ /**
7044
+ * Add URL to navigation history queue
7045
+ */
7046
+ private addToNavigationQueue;
7047
+ /**
7048
+ * Mark URL as visited in navigation history
7049
+ */
7050
+ private markUrlVisited;
7051
+ /**
7052
+ * Get the current crawl session
7053
+ */
7054
+ getSession(): CrawlSession | null;
7055
+ /**
7056
+ * Get the session ID
7057
+ */
7058
+ getSessionId(): string;
7059
+ /**
7060
+ * Resume a previous crawl session
7061
+ * @param sessionId - Optional session ID to resume (uses current session if not provided)
7062
+ * @returns Promise resolving to the Crawler instance for chaining
7063
+ */
7064
+ resume(sessionId?: string): Promise<Crawler>;
7065
+ /**
7066
+ * Get list of resumable sessions
7067
+ * @returns Promise resolving to array of sessions that can be resumed
7068
+ */
7069
+ getResumableSessions(): Promise<CrawlSession[]>;
7070
+ /**
7071
+ * Pause the current crawl session
7072
+ */
7073
+ pause(): Promise<void>;
7074
+ /**
7075
+ * Mark the current session as completed
7076
+ */
7077
+ complete(): Promise<void>;
7078
+ /**
7079
+ * Get the current adapter type being used
7080
+ */
7081
+ getAdapterType(): CrawlerAdapterType;
7082
+ /**
7083
+ * Switch to a different adapter at runtime
7084
+ * @param adapter - The adapter type to switch to
7085
+ */
7086
+ setAdapter(adapter: CrawlerAdapterType): Promise<void>;
6912
7087
  private rawResponseHandler;
6913
7088
  private waitForCache;
6914
7089
  private waitForStorage;
@@ -6985,6 +7160,54 @@ export declare class Crawler {
6985
7160
  * ```
6986
7161
  */
6987
7162
  onEmailLeads(handler: (emails: string[]) => Promise<void>): Crawler;
7163
+ /**
7164
+ * Registers a handler called before crawling starts.
7165
+ * Useful for initialization, logging, or setup tasks.
7166
+ *
7167
+ * @param handler - Function to call before crawling begins
7168
+ * @returns The crawler instance for method chaining
7169
+ *
7170
+ * @example
7171
+ * ```typescript
7172
+ * crawler.onStart(async () => {
7173
+ * console.log('Crawl session started');
7174
+ * await initializeDatabase();
7175
+ * });
7176
+ * ```
7177
+ */
7178
+ onStart(handler: () => Promise<void>): Crawler;
7179
+ /**
7180
+ * Registers a handler called when crawling finishes.
7181
+ * Receives crawl statistics including URLs visited, failed, and timing.
7182
+ *
7183
+ * @param handler - Function to call when crawling completes
7184
+ * @returns The crawler instance for method chaining
7185
+ *
7186
+ * @example
7187
+ * ```typescript
7188
+ * crawler.onFinish(async (stats) => {
7189
+ * console.log(`Crawl completed: ${stats.urlsVisited} URLs in ${stats.endTime - stats.startTime}ms`);
7190
+ * await generateReport(stats);
7191
+ * });
7192
+ * ```
7193
+ */
7194
+ onFinish(handler: (stats: CrawlStats) => Promise<void>): Crawler;
7195
+ /**
7196
+ * Registers a handler called when a redirect is followed.
7197
+ * Provides information about the original URL, final URL, and redirect count.
7198
+ *
7199
+ * @param handler - Function to handle redirect events
7200
+ * @returns The crawler instance for method chaining
7201
+ *
7202
+ * @example
7203
+ * ```typescript
7204
+ * crawler.onRedirect(async (event) => {
7205
+ * console.log(`Redirect: ${event.originalUrl} -> ${event.finalUrl}`);
7206
+ * trackRedirects(event);
7207
+ * });
7208
+ * ```
7209
+ */
7210
+ onRedirect(handler: (event: RedirectEvent$1) => Promise<void>): Crawler;
6988
7211
  /**
6989
7212
  * Registers a handler for raw response data.
6990
7213
  * Triggered for all responses, providing access to the raw Buffer data.
@@ -7080,21 +7303,23 @@ export declare class Crawler {
7080
7303
  /**
7081
7304
  * Registers a handler for href attributes from anchor and link elements.
7082
7305
  * Automatically resolves relative URLs to absolute URLs.
7306
+ * Use `function` syntax (not arrow) to access `this` as the element.
7083
7307
  *
7084
- * @param handler - Function to handle href URLs as strings
7308
+ * @param handler - Function receiving href string, with `this` bound to the element
7085
7309
  * @returns The crawler instance for method chaining
7086
7310
  *
7087
7311
  * @example
7088
7312
  * ```typescript
7089
- * crawler.onHref(async (href) => {
7313
+ * crawler.onHref(async function(href) {
7090
7314
  * console.log('Found URL:', href);
7315
+ * console.log('Link text:', this.textContent); // `this` is the anchor/link element
7091
7316
  * if (href.includes('/api/')) {
7092
7317
  * await crawler.visit(href);
7093
7318
  * }
7094
7319
  * });
7095
7320
  * ```
7096
7321
  */
7097
- onHref(handler: (href: string) => Promise<void>): Crawler;
7322
+ onHref(handler: ElementBoundHandler<string, HTMLAnchorElement | HTMLLinkElement>): Crawler;
7098
7323
  /**
7099
7324
  * Registers a handler for elements matching a CSS selector.
7100
7325
  * Provides fine-grained control over which elements to process.
@@ -7136,55 +7361,57 @@ export declare class Crawler {
7136
7361
  /**
7137
7362
  * Registers a handler for HTML element attributes.
7138
7363
  * Can extract specific attributes from all elements or from elements matching a selector.
7364
+ * Use `function` syntax (not arrow) to access `this` as the element.
7139
7365
  *
7140
7366
  * @param attribute - The attribute name to extract
7141
- * @param handler - Function to handle attribute values
7367
+ * @param handler - Function receiving (value, attrName), with `this` bound to element
7142
7368
  * @returns The crawler instance for method chaining
7143
7369
  *
7144
7370
  * @overload
7145
7371
  * @param selection - CSS selector to filter elements
7146
7372
  * @param attribute - The attribute name to extract
7147
- * @param handler - Function to handle attribute values
7373
+ * @param handler - Function receiving (value, attrName), with `this` bound to element
7148
7374
  * @returns The crawler instance for method chaining
7149
7375
  *
7150
7376
  * @example
7151
7377
  * ```typescript
7152
7378
  * // Extract all 'data-id' attributes
7153
- * crawler.onAttribute('data-id', async (value) => {
7154
- * console.log('Found data-id:', value);
7379
+ * crawler.onAttribute('data-id', async function(value, attrName) {
7380
+ * console.log('Found', attrName, ':', value, 'on:', this.tagName);
7155
7381
  * });
7156
7382
  *
7157
7383
  * // Extract 'src' attributes from images only
7158
- * crawler.onAttribute('img', 'src', async (src) => {
7159
- * console.log('Image source:', src);
7384
+ * crawler.onAttribute('img', 'src', async function(value) {
7385
+ * console.log('Image source:', value, 'alt:', this.getAttribute('alt'));
7160
7386
  * });
7161
7387
  * ```
7162
7388
  */
7163
- onAttribute(attribute: string, handler: CrawlerHandler<string>): Crawler;
7164
- onAttribute(selection: string, attribute: string, handler: CrawlerHandler<string>): Crawler;
7389
+ onAttribute(attribute: string, handler: AttributeHandler): Crawler;
7390
+ onAttribute(selection: string, attribute: string, handler: AttributeHandler): Crawler;
7165
7391
  /**
7166
7392
  * Registers a handler for text content of elements matching a CSS selector.
7167
7393
  * Extracts and processes the textContent of matching elements.
7394
+ * Use `function` syntax (not arrow) to access `this` as the element.
7168
7395
  *
7169
7396
  * @param selection - CSS selector to match elements
7170
- * @param handler - Function to handle extracted text content
7397
+ * @param handler - Function receiving text string, with `this` bound to element
7171
7398
  * @returns The crawler instance for method chaining
7172
7399
  *
7173
7400
  * @example
7174
7401
  * ```typescript
7175
- * // Extract all heading text
7176
- * crawler.onText('h1, h2, h3', async (text) => {
7177
- * console.log('Heading:', text.trim());
7402
+ * // Extract all heading text with element context
7403
+ * crawler.onText('h1, h2, h3', async function(text) {
7404
+ * console.log('Heading:', text.trim(), 'Tag:', this.tagName);
7178
7405
  * });
7179
7406
  *
7180
- * // Extract product prices
7181
- * crawler.onText('.price', async (price) => {
7182
- * const numericPrice = parseFloat(price.replace(/[^\d.]/g, ''));
7183
- * console.log('Price value:', numericPrice);
7407
+ * // Extract product prices with element context
7408
+ * crawler.onText('.price', async function(text) {
7409
+ * const numericPrice = parseFloat(text.replace(/[^\d.]/g, ''));
7410
+ * console.log('Price:', numericPrice, 'Product:', this.closest('.product')?.id);
7184
7411
  * });
7185
7412
  * ```
7186
7413
  */
7187
- onText(selection: string, handler: CrawlerHandler<string>): Crawler;
7414
+ onText(selection: string, handler: ElementBoundHandler<string>): Crawler;
7188
7415
  private _onBody;
7189
7416
  private _onAttribute;
7190
7417
  private _onText;
@@ -7199,6 +7426,86 @@ export declare class Crawler {
7199
7426
  private _onEmailLeads;
7200
7427
  private _onRawResponse;
7201
7428
  private _onResponse;
7429
+ /**
7430
+ * Calculate adaptive delay based on server response times (AutoThrottle)
7431
+ */
7432
+ private calculateAutoThrottleDelay;
7433
+ /**
7434
+ * Get current AutoThrottle delay for a domain
7435
+ */
7436
+ private getAutoThrottleDelay;
7437
+ /**
7438
+ * Handle 429 Too Many Requests response with Retry-After header parsing
7439
+ */
7440
+ private handle429Response;
7441
+ /**
7442
+ * Check if URL passes all crawl limit checks
7443
+ */
7444
+ private checkCrawlLimits;
7445
+ /**
7446
+ * Check if a link should be followed based on nofollow rules
7447
+ */
7448
+ private shouldFollowLink;
7449
+ /**
7450
+ * Check response size against maxResponseSize limit
7451
+ */
7452
+ private checkResponseSize;
7453
+ /**
7454
+ * Collect data for later export
7455
+ *
7456
+ * @param data - Data to collect (will be added to export buffer)
7457
+ * @returns The crawler instance for method chaining
7458
+ *
7459
+ * @example
7460
+ * ```typescript
7461
+ * crawler.onDocument(async (doc) => {
7462
+ * crawler.collect({
7463
+ * title: doc.title,
7464
+ * url: doc.URL,
7465
+ * h1: doc.querySelector('h1')?.textContent
7466
+ * });
7467
+ * });
7468
+ * ```
7469
+ */
7470
+ collect(data: any): Crawler;
7471
+ /**
7472
+ * Get all collected data
7473
+ */
7474
+ getCollectedData(): any[];
7475
+ /**
7476
+ * Clear collected data
7477
+ */
7478
+ clearCollectedData(): Crawler;
7479
+ /**
7480
+ * Export collected data to a file
7481
+ *
7482
+ * @param filePath - Output file path
7483
+ * @param format - Export format: 'json', 'jsonl', or 'csv'
7484
+ *
7485
+ * @example
7486
+ * ```typescript
7487
+ * await crawler.waitForAll();
7488
+ * await crawler.exportData('./output.json', 'json');
7489
+ * await crawler.exportData('./output.csv', 'csv');
7490
+ * ```
7491
+ */
7492
+ exportData(filePath: string, format?: ExportFormat): Promise<void>;
7493
+ /**
7494
+ * Get current crawl statistics
7495
+ */
7496
+ getStats(): CrawlStats;
7497
+ /**
7498
+ * Trigger onStart handlers (called once on first visit)
7499
+ */
7500
+ private triggerStartHandlers;
7501
+ /**
7502
+ * Trigger onFinish handlers
7503
+ */
7504
+ private triggerFinishHandlers;
7505
+ /**
7506
+ * Trigger onRedirect handlers
7507
+ */
7508
+ private triggerRedirectHandlers;
7202
7509
  private buildUrl;
7203
7510
  /**
7204
7511
  * Visits a URL and processes it according to registered event handlers.
@@ -7303,7 +7610,28 @@ export declare class Crawler {
7303
7610
  * ```
7304
7611
  */
7305
7612
  waitForAll(): Promise<void>;
7613
+ /**
7614
+ * Alias for waitForAll() - waits for all crawling operations to complete.
7615
+ * @returns Promise that resolves when done
7616
+ * @example
7617
+ * ```typescript
7618
+ * crawler.visit('https://example.com');
7619
+ * await crawler.done();
7620
+ * ```
7621
+ */
7622
+ done(): Promise<void>;
7306
7623
  close(): Promise<void>;
7624
+ /**
7625
+ * Destroys the crawler instance and releases all resources.
7626
+ * Clears all queued tasks, closes caches, and cleans up event handlers.
7627
+ * @returns Promise that resolves when destruction is complete
7628
+ * @example
7629
+ * ```typescript
7630
+ * await crawler.destroy();
7631
+ * // Crawler is now fully cleaned up
7632
+ * ```
7633
+ */
7634
+ destroy(): Promise<void>;
7307
7635
  }
7308
7636
 
7309
7637
  export {};