rezo 1.0.42 → 1.0.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/curl.cjs +131 -29
- package/dist/adapters/curl.js +131 -29
- package/dist/adapters/entries/curl.d.ts +65 -0
- package/dist/adapters/entries/fetch.d.ts +65 -0
- package/dist/adapters/entries/http.d.ts +65 -0
- package/dist/adapters/entries/http2.d.ts +65 -0
- package/dist/adapters/entries/react-native.d.ts +65 -0
- package/dist/adapters/entries/xhr.d.ts +65 -0
- package/dist/adapters/http2.cjs +209 -22
- package/dist/adapters/http2.js +209 -22
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -13
- package/dist/cache/index.js +0 -2
- package/dist/core/rezo.cjs +7 -0
- package/dist/core/rezo.js +7 -0
- package/dist/crawler/addon/decodo/index.cjs +1 -0
- package/dist/crawler/addon/decodo/index.js +1 -0
- package/dist/crawler/crawler-options.cjs +1 -0
- package/dist/crawler/crawler-options.js +1 -0
- package/dist/crawler/crawler.cjs +1070 -0
- package/dist/crawler/crawler.js +1068 -0
- package/dist/crawler/index.cjs +40 -0
- package/dist/{plugin → crawler}/index.js +4 -2
- package/dist/crawler/plugin/file-cacher.cjs +19 -0
- package/dist/crawler/plugin/file-cacher.js +19 -0
- package/dist/crawler/plugin/index.cjs +1 -0
- package/dist/crawler/plugin/index.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +43 -0
- package/dist/crawler/plugin/navigation-history.js +43 -0
- package/dist/crawler/plugin/robots-txt.cjs +2 -0
- package/dist/crawler/plugin/robots-txt.js +2 -0
- package/dist/crawler/plugin/url-store.cjs +18 -0
- package/dist/crawler/plugin/url-store.js +18 -0
- package/dist/crawler.d.ts +511 -183
- package/dist/entries/crawler.cjs +5 -5
- package/dist/entries/crawler.js +2 -2
- package/dist/index.cjs +27 -24
- package/dist/index.d.ts +73 -0
- package/dist/index.js +1 -0
- package/dist/internal/agents/base.cjs +113 -0
- package/dist/internal/agents/base.js +110 -0
- package/dist/internal/agents/http-proxy.cjs +89 -0
- package/dist/internal/agents/http-proxy.js +86 -0
- package/dist/internal/agents/https-proxy.cjs +176 -0
- package/dist/internal/agents/https-proxy.js +173 -0
- package/dist/internal/agents/index.cjs +10 -0
- package/dist/internal/agents/index.js +5 -0
- package/dist/internal/agents/socks-client.cjs +571 -0
- package/dist/internal/agents/socks-client.js +567 -0
- package/dist/internal/agents/socks-proxy.cjs +75 -0
- package/dist/internal/agents/socks-proxy.js +72 -0
- package/dist/platform/browser.d.ts +65 -0
- package/dist/platform/bun.d.ts +65 -0
- package/dist/platform/deno.d.ts +65 -0
- package/dist/platform/node.d.ts +65 -0
- package/dist/platform/react-native.d.ts +65 -0
- package/dist/platform/worker.d.ts +65 -0
- package/dist/proxy/index.cjs +18 -16
- package/dist/proxy/index.js +17 -12
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/buildError.cjs +11 -2
- package/dist/responses/buildError.js +11 -2
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/curl.cjs +317 -0
- package/dist/utils/curl.js +314 -0
- package/package.json +2 -6
- package/dist/cache/file-cacher.cjs +0 -264
- package/dist/cache/file-cacher.js +0 -261
- package/dist/cache/url-store.cjs +0 -288
- package/dist/cache/url-store.js +0 -285
- package/dist/plugin/addon/decodo/index.cjs +0 -1
- package/dist/plugin/addon/decodo/index.js +0 -1
- package/dist/plugin/crawler-options.cjs +0 -1
- package/dist/plugin/crawler-options.js +0 -1
- package/dist/plugin/crawler.cjs +0 -519
- package/dist/plugin/crawler.js +0 -517
- package/dist/plugin/index.cjs +0 -36
- /package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
- /package/dist/{plugin → crawler}/scraper.cjs +0 -0
- /package/dist/{plugin → crawler}/scraper.js +0 -0
package/dist/crawler.d.ts
CHANGED
@@ -6,43 +6,24 @@ import { SecureContext, TLSSocket } from 'node:tls';
 import { Cookie as TouchCookie, CookieJar as TouchCookieJar, CreateCookieOptions } from 'tough-cookie';
 
 /**
- *
+ * CrawlerCache - High-performance SQLite-based response caching for web crawlers
  *
- *
- *
+ * Optimized specifically for crawler workloads with:
+ * - WAL mode for high-throughput concurrent reads/writes
+ * - Batch operations for efficient bulk storage
+ * - Domain-based namespacing for organized cache management
+ * - Optional zstd compression for storage efficiency
  *
  * @module cache/file-cacher
  * @author Rezo HTTP Client Library
- *
- * @example
- * ```typescript
- * import { FileCacher } from 'rezo';
- *
- * // Create a file cacher instance
- * const cacher = await FileCacher.create({
- * cacheDir: './cache',
- * ttl: 3600000, // 1 hour
- * compression: true,
- * encryptNamespace: true
- * });
- *
- * // Store and retrieve data
- * await cacher.set('user:123', { name: 'John' }, 3600000, 'users');
- * const user = await cacher.get('user:123', 'users');
- *
- * // Check existence and cleanup
- * const exists = await cacher.has('user:123', 'users');
- * await cacher.delete('user:123', 'users');
- * await cacher.close();
- * ```
  */
 /**
- * Configuration options for
+ * Configuration options for CrawlerCache
  */
 export interface FileCacherOptions {
     /**
      * Directory path for storing cache databases
-     * @default '
+     * @default '/tmp/rezo-crawler/cache'
      */
     cacheDir?: string;
     /**
@@ -51,23 +32,18 @@ export interface FileCacherOptions {
      */
     ttl?: number;
     /**
-     * Enable zstd compression for stored values
+     * Enable zstd compression for stored values (Node.js 22.15+)
      * Reduces storage size but adds CPU overhead
      * @default false
      */
     compression?: boolean;
     /**
-     *
-     * @default false
-     */
-    softDelete?: boolean;
-    /**
-     * Hash namespace names for privacy/security
+     * Hash namespace names for privacy
      * @default false
      */
     encryptNamespace?: boolean;
     /**
-     * Maximum
+     * Maximum entries per namespace (0 = unlimited)
      * @default 0
      */
     maxEntries?: number;
@@ -77,157 +53,76 @@ declare class FileCacher {
     private readonly options;
     private readonly cacheDir;
     private closed;
-    /**
-     * Private constructor - use FileCacher.create() instead
-     */
     private constructor();
     /**
      * Create a new FileCacher instance
-     *
-     * @param options - Configuration options
-     * @returns Promise resolving to initialized FileCacher instance
-     *
-     * @example
-     * ```typescript
-     * const cacher = await FileCacher.create({
-     * cacheDir: './my-cache',
-     * ttl: 3600000,
-     * compression: true
-     * });
-     * ```
      */
     static create(options?: FileCacherOptions): Promise<FileCacher>;
     /**
-     * Get or create database for a namespace
+     * Get or create optimized database for a namespace (domain)
      */
     private getDatabase;
     /**
-     * Store a
-     *
-     * @param key - Unique key for the cached item
-     * @param value - Value to cache (will be JSON serialized)
-     * @param ttl - Time-to-live in milliseconds (uses default if not specified)
-     * @param namespace - Namespace for isolation (default: 'default')
-     * @returns Promise resolving when stored
-     *
-     * @example
-     * ```typescript
-     * // Store with default TTL
-     * await cacher.set('key1', { data: 'value' });
-     *
-     * // Store with custom TTL and namespace
-     * await cacher.set('key2', responseData, 3600000, 'api-responses');
-     * ```
+     * Store a response in the cache
      */
     set<T = any>(key: string, value: T, ttl?: number, namespace?: string): Promise<void>;
     /**
-     *
-
-
-
-
-
-
-
-     *
-     * if (data) {
-     * console.log('Cache hit:', data);
-     * }
-     * ```
+     * Store multiple responses in a single transaction (batch operation)
+     */
+    setMany<T = any>(entries: Array<{
+        key: string;
+        value: T;
+        ttl?: number;
+    }>, namespace?: string): Promise<void>;
+    /**
+     * Retrieve a cached response
      */
     get<T = any>(key: string, namespace?: string): Promise<T | null>;
     /**
-     * Check if a key exists
-     *
-     * @param key - Key to check
-     * @param namespace - Namespace to search in (default: 'default')
-     * @returns Promise resolving to true if key exists and is valid
-     *
-     * @example
-     * ```typescript
-     * if (await cacher.has('key1', 'my-namespace')) {
-     * const data = await cacher.get('key1', 'my-namespace');
-     * }
-     * ```
+     * Check if a key exists and is not expired
      */
     has(key: string, namespace?: string): Promise<boolean>;
+    /**
+     * Check multiple keys at once (batch operation)
+     */
+    hasMany(keys: string[], namespace?: string): Promise<Set<string>>;
     /**
      * Delete a key from the cache
-     *
-     * @param key - Key to delete
-     * @param namespace - Namespace to delete from (default: 'default')
-     * @returns Promise resolving to true if key was deleted
-     *
-     * @example
-     * ```typescript
-     * await cacher.delete('obsolete-key', 'my-namespace');
-     * ```
      */
     delete(key: string, namespace?: string): Promise<boolean>;
     /**
      * Clear all entries in a namespace
-     *
-     * @param namespace - Namespace to clear (default: 'default')
-     * @returns Promise resolving when cleared
-     *
-     * @example
-     * ```typescript
-     * // Clear all cached data for a domain
-     * await cacher.clear('example.com');
-     * ```
      */
     clear(namespace?: string): Promise<void>;
     /**
-     * Remove all expired entries
-     *
-     * @param namespace - Namespace to cleanup (default: 'default')
-     * @returns Promise resolving to number of entries removed
-     *
-     * @example
-     * ```typescript
-     * const removed = await cacher.cleanup('my-namespace');
-     * console.log(`Removed ${removed} expired entries`);
-     * ```
+     * Remove all expired entries
      */
     cleanup(namespace?: string): Promise<number>;
     /**
-     * Get statistics for a namespace
-     *
-     * @param namespace - Namespace to get stats for (default: 'default')
-     * @returns Promise resolving to cache statistics
-     *
-     * @example
-     * ```typescript
-     * const stats = await cacher.stats('my-namespace');
-     * console.log(`${stats.count} entries, ${stats.size} bytes`);
-     * ```
+     * Get cache statistics for a namespace
      */
     stats(namespace?: string): Promise<{
         count: number;
         expired: number;
-        deleted: number;
     }>;
     /**
-     * Close all database connections
-     *
-     * @returns Promise resolving when all connections are closed
-     *
-     * @example
-     * ```typescript
-     * // Always close when done
-     * await cacher.close();
-     * ```
+     * Close all database connections
      */
     close(): Promise<void>;
-    /**
-     * Check if the cacher has been closed
-     */
     get isClosed(): boolean;
-    /**
-     * Get the cache directory path
-     */
     get directory(): string;
 }
+export interface CrawlSession {
+    sessionId: string;
+    baseUrl: string;
+    startedAt: number;
+    lastActivityAt: number;
+    status: "running" | "paused" | "completed" | "failed";
+    urlsVisited: number;
+    urlsQueued: number;
+    urlsFailed: number;
+    metadata?: string;
+}
 export interface RezoHttpHeaders {
     accept?: string | undefined;
     "accept-encoding"?: string | undefined;
@@ -4464,6 +4359,71 @@ declare class Rezo {
      * @see {@link cookieJar} - Access the underlying RezoCookieJar for more control
      */
     clearCookies(): void;
+    /**
+     * Convert a Rezo request configuration to a cURL command string.
+     *
+     * Generates a valid cURL command that can be executed in a terminal to
+     * reproduce the same HTTP request. Useful for:
+     * - Debugging and sharing requests
+     * - Documentation and examples
+     * - Testing requests outside of Node.js
+     * - Exporting requests to other tools
+     *
+     * @param config - Request configuration object
+     * @returns A cURL command string
+     *
+     * @example
+     * ```typescript
+     * const curl = Rezo.toCurl({
+     * url: 'https://api.example.com/users',
+     * method: 'POST',
+     * headers: { 'Content-Type': 'application/json' },
+     * body: { name: 'John', email: 'john@example.com' }
+     * });
+     * // Output: curl -X POST -H 'content-type: application/json' --data-raw '{"name":"John","email":"john@example.com"}' -L --compressed 'https://api.example.com/users'
+     * ```
+     */
+    static toCurl(config: RezoRequestConfig | RezoRequestOptions): string;
+    /**
+     * Parse a cURL command string into a Rezo request configuration.
+     *
+     * Converts a cURL command into a configuration object that can be
+     * passed directly to Rezo request methods. Useful for:
+     * - Importing requests from browser DevTools
+     * - Converting curl examples from API documentation
+     * - Migrating scripts from curl to Rezo
+     *
+     * Supports common cURL options:
+     * - `-X, --request` - HTTP method
+     * - `-H, --header` - Request headers
+     * - `-d, --data, --data-raw, --data-binary` - Request body
+     * - `-u, --user` - Basic authentication
+     * - `-x, --proxy` - Proxy configuration
+     * - `--socks5, --socks4` - SOCKS proxy
+     * - `-L, --location` - Follow redirects
+     * - `--max-redirs` - Maximum redirects
+     * - `--max-time` - Request timeout
+     * - `-k, --insecure` - Skip TLS verification
+     * - `-A, --user-agent` - User agent header
+     *
+     * @param curlCommand - A cURL command string
+     * @returns A request configuration object
+     *
+     * @example
+     * ```typescript
+     * // From browser DevTools "Copy as cURL"
+     * const config = Rezo.fromCurl(`
+     * curl 'https://api.example.com/data' \\
+     * -H 'Authorization: Bearer token123' \\
+     * -H 'Content-Type: application/json'
+     * `);
+     *
+     * // Use with Rezo
+     * const rezo = new Rezo();
+     * const response = await rezo.request(config);
+     * ```
+     */
+    static fromCurl(curlCommand: string): RezoRequestOptions;
 }
 /**
  * Rezo HTTP Client - Core Types
@@ -6156,17 +6116,25 @@ declare class Decodo {
     /**
      * Create a new Decodo client instance
      *
-     * @param config - Decodo API configuration
-     * @throws Error if
+     * @param config - Decodo API configuration (supports username/password OR token auth)
+     * @throws Error if authentication credentials are missing
      *
      * @example
      * ```typescript
+     * // Username/password authentication
      * const decodo = new Decodo({
      * username: 'user',
      * password: 'password',
      * headless: 'html',
      * country: 'US'
      * });
+     *
+     * // Token authentication (alternative)
+     * const decodo = new Decodo({
+     * token: 'your_api_token',
+     * headless: 'html',
+     * country: 'US'
+     * });
      * ```
      */
     constructor(config: DecodoConfig);
@@ -6284,6 +6252,15 @@ declare class Decodo {
  * const regexDomain: Domain = '^(sub|api)\.example\.com$';
  */
 export type Domain = string[] | string | RegExp;
+/**
+ * Supported HTTP adapter types for crawler requests
+ * @description
+ * - 'http': Standard Node.js HTTP/HTTPS adapter (default)
+ * - 'http2': HTTP/2 adapter with session pooling
+ * - 'curl': cURL adapter for maximum compatibility
+ * - 'fetch': Browser-compatible Fetch API adapter
+ */
+export type CrawlerAdapterType = "http" | "http2" | "curl" | "fetch";
 /**
  * Configuration interface for the CrawlerOptions class
  * @description Defines all available options for configuring web crawler behavior,
@@ -6292,6 +6269,12 @@ export type Domain = string[] | string | RegExp;
 export interface ICrawlerOptions {
     /** Base URL for the crawler - the starting point for crawling operations */
     baseUrl: string;
+    /** HTTP adapter to use for requests (default: 'http') */
+    adapter?: CrawlerAdapterType;
+    /** Enable navigation history for resumable crawling (default: false) */
+    enableNavigationHistory?: boolean;
+    /** Session ID for navigation history - allows resuming specific crawl sessions */
+    sessionId?: string;
     /** Whether to reject unauthorized SSL certificates (default: true) */
     rejectUnauthorized?: boolean;
     /** Custom user agent string for HTTP requests */
@@ -6381,6 +6364,42 @@ export interface ICrawlerOptions {
     } | {
         enable: false;
     } | undefined | false;
+    /** Decodo proxy service configuration for specific domains or global use */
+    decodo?: {
+        enable: true;
+        labs: [
+            {
+                domain: Domain;
+                isGlobal?: boolean;
+                options: DecodoOptions;
+                queueOptions: queueOptions$1;
+            }
+        ];
+    } | {
+        enable: false;
+    } | undefined | false;
+    /** Maximum crawl depth from start URL (0 = unlimited, default: 0) */
+    maxDepth?: number;
+    /** Maximum total URLs to crawl (0 = unlimited, default: 0) */
+    maxUrls?: number;
+    /** Maximum response size in bytes to process (0 = unlimited, default: 0) */
+    maxResponseSize?: number;
+    /** Respect robots.txt rules (default: false) */
+    respectRobotsTxt?: boolean;
+    /** Follow rel="nofollow" links (default: false - ignores nofollow links) */
+    followNofollow?: boolean;
+    /** Enable automatic throttling based on server response times (default: true) */
+    autoThrottle?: boolean;
+    /** Target request delay in ms for AutoThrottle (default: 1000) */
+    autoThrottleTargetDelay?: number;
+    /** Minimum delay between requests in ms (default: 100) */
+    autoThrottleMinDelay?: number;
+    /** Maximum delay between requests in ms (default: 60000) */
+    autoThrottleMaxDelay?: number;
+    /** Maximum time to wait on 429 response in ms (default: 1800000 = 30 min) */
+    maxWaitOn429?: number;
+    /** Always wait on 429 regardless of time, shows warning (default: false) */
+    alwaysWaitOn429?: boolean;
 }
 /**
  * Advanced web crawler configuration class with support for domain-specific settings
@@ -6415,6 +6434,12 @@ export interface ICrawlerOptions {
 export declare class CrawlerOptions {
     /** Base URL for the crawler - the starting point for crawling operations */
     baseUrl: string;
+    /** HTTP adapter to use for requests */
+    adapter: CrawlerAdapterType;
+    /** Enable navigation history for resumable crawling */
+    enableNavigationHistory: boolean;
+    /** Session ID for navigation history - allows resuming specific crawl sessions */
+    sessionId: string;
     /** Whether to reject unauthorized SSL certificates */
     rejectUnauthorized?: boolean;
     /** Custom user agent string for HTTP requests */
@@ -6451,6 +6476,28 @@ export declare class CrawlerOptions {
     throwFatalError?: boolean;
     /** Enable debug logging */
     debug?: boolean;
+    /** Maximum crawl depth from start URL (0 = unlimited) */
+    maxDepth: number;
+    /** Maximum total URLs to crawl (0 = unlimited) */
+    maxUrls: number;
+    /** Maximum response size in bytes to process (0 = unlimited) */
+    maxResponseSize: number;
+    /** Respect robots.txt rules */
+    respectRobotsTxt: boolean;
+    /** Follow rel="nofollow" links */
+    followNofollow: boolean;
+    /** Enable automatic throttling based on server response times */
+    autoThrottle: boolean;
+    /** Target request delay in ms for AutoThrottle */
+    autoThrottleTargetDelay: number;
+    /** Minimum delay between requests in ms */
+    autoThrottleMinDelay: number;
+    /** Maximum delay between requests in ms */
+    autoThrottleMaxDelay: number;
+    /** Maximum time to wait on 429 response in ms */
+    maxWaitOn429: number;
+    /** Always wait on 429 regardless of time */
+    alwaysWaitOn429: boolean;
     /** Internal storage for Oxylabs configurations with domain mapping */
     oxylabs: {
         domain?: Domain;
@@ -6832,13 +6879,44 @@ export interface EmailDiscoveryEvent {
     discoveredAt: string;
     timestamp: Date;
 }
+interface RedirectEvent$1 {
+    originalUrl: string;
+    finalUrl: string;
+    redirectCount: number;
+    statusCode: number;
+}
+/**
+ * Export format options
+ */
+export type ExportFormat = "json" | "jsonl" | "csv";
 /**
- *
- *
+ * Handler with element bound to `this` context.
+ * Use `function` syntax (not arrow functions) to access `this`.
  *
- * @
+ * @example
+ * ```typescript
+ * crawler.onText('h1', async function(text) {
+ * console.log(text, this.tagName); // `this` is the element
+ * });
+ * ```
+ */
+export type ElementBoundHandler<TValue, TElement = Element> = (this: TElement, value: TValue) => Promise<void>;
+/**
+ * Handler for attribute extraction with element bound to `this`.
+ * Receives both the attribute value and attribute name.
+ */
+export type AttributeHandler = (this: Element, value: string, attributeName: string) => Promise<void>;
+/**
+ * Crawl statistics
  */
-export
+export interface CrawlStats {
+    urlsVisited: number;
+    urlsQueued: number;
+    urlsFailed: number;
+    startTime: number;
+    endTime?: number;
+    currentDepth: number;
+}
 /**
  * A powerful web crawler that provides event-driven HTML parsing and data extraction.
  * Supports caching, proxy rotation, retry mechanisms, and email lead discovery.
@@ -6886,29 +6964,126 @@ export declare class Crawler {
     private isStorageReady;
     private isCacheReady;
     private leadsFinder;
+    /** Navigation history for resumable crawling */
+    private navigationHistory;
+    private isNavigationHistoryReady;
+    private isSessionReady;
+    private currentSession;
+    private navigationHistoryInitPromise;
+    /** Adapter-specific request executor */
+    private adapterExecutor;
+    private adapterType;
+    /** Track pending execute() calls for proper done() behavior */
+    private pendingExecutions;
+    /** robots.txt parser and validator */
+    private robotsTxt;
+    /** AutoThrottle: track response times per domain for adaptive rate limiting */
+    private domainResponseTimes;
+    private domainCurrentDelay;
+    /** Crawl statistics */
+    private crawlStats;
+    /** URL depth tracking for maxDepth limit */
+    private urlDepthMap;
+    /** Lifecycle event handlers */
+    private startHandlers;
+    private finishHandlers;
+    private redirectHandlers;
+    /** Data collection for export */
+    private collectedData;
+    /** Flag to track if crawl has started */
+    private crawlStarted;
     /**
      * Creates a new Crawler instance with the specified configuration.
      *
-     * @param
-     * @param
+     * @param crawlerOptions - Crawler configuration options
+     * @param http - Optional Rezo HTTP client instance (creates default if not provided)
      *
      * @example
      * ```typescript
+     * // Basic usage (creates default Rezo instance)
      * const crawler = new Crawler({
-     *
-     * baseUrl: 'https://api.example.com',
-     * timeout: 30000,
+     * baseUrl: 'https://example.com',
      * enableCache: true,
      * cacheDir: './cache',
-     * socksProxies: [{ host: '127.0.0.1', port: 9050 }]
-     * }, {
-     * http: backupHttpClient,
-     * useProxy: false,
-     * concurrency: 5
      * });
+     *
+     * // With resumable crawling
+     * const crawler = new Crawler({
+     * baseUrl: 'https://example.com',
+     * enableNavigationHistory: true,
+     * sessionId: 'my-session',
+     * cacheDir: './cache',
+     * });
+     *
+     * // With custom Rezo instance
+     * const crawler = new Crawler({
+     * baseUrl: 'https://example.com',
+     * adapter: 'curl',
+     * }, myRezoInstance);
      * ```
      */
-    constructor(crawlerOptions: ICrawlerOptions, http
+    constructor(crawlerOptions: ICrawlerOptions, http?: Rezo);
+    /**
+     * Initialize the HTTP adapter based on configuration
+     */
+    private initializeAdapter;
+    /**
+     * Initialize navigation history and session
+     */
+    private initializeNavigationHistory;
+    /**
+     * Wait for navigation history and session to be ready
+     */
+    private waitForNavigationHistory;
+    /**
+     * Ensure navigation history is ready and return it (or null if not enabled)
+     * This is used by visit() and other methods that need to write to navigation history
+     */
+    private ensureNavigationHistoryReady;
+    /**
+     * Add URL to navigation history queue
+     */
+    private addToNavigationQueue;
+    /**
+     * Mark URL as visited in navigation history
+     */
+    private markUrlVisited;
+    /**
+     * Get the current crawl session
+     */
+    getSession(): CrawlSession | null;
+    /**
+     * Get the session ID
+     */
+    getSessionId(): string;
+    /**
+     * Resume a previous crawl session
+     * @param sessionId - Optional session ID to resume (uses current session if not provided)
+     * @returns Promise resolving to the Crawler instance for chaining
+     */
+    resume(sessionId?: string): Promise<Crawler>;
+    /**
+     * Get list of resumable sessions
+     * @returns Promise resolving to array of sessions that can be resumed
+     */
+    getResumableSessions(): Promise<CrawlSession[]>;
+    /**
+     * Pause the current crawl session
+     */
+    pause(): Promise<void>;
+    /**
+     * Mark the current session as completed
+     */
+    complete(): Promise<void>;
+    /**
+     * Get the current adapter type being used
+     */
+    getAdapterType(): CrawlerAdapterType;
+    /**
+     * Switch to a different adapter at runtime
+     * @param adapter - The adapter type to switch to
+     */
+    setAdapter(adapter: CrawlerAdapterType): Promise<void>;
     private rawResponseHandler;
     private waitForCache;
     private waitForStorage;
@@ -6985,6 +7160,54 @@ export declare class Crawler {
      * ```
      */
     onEmailLeads(handler: (emails: string[]) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called before crawling starts.
+     * Useful for initialization, logging, or setup tasks.
+     *
+     * @param handler - Function to call before crawling begins
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onStart(async () => {
+     * console.log('Crawl session started');
+     * await initializeDatabase();
+     * });
+     * ```
+     */
+    onStart(handler: () => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when crawling finishes.
+     * Receives crawl statistics including URLs visited, failed, and timing.
+     *
+     * @param handler - Function to call when crawling completes
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onFinish(async (stats) => {
+     * console.log(`Crawl completed: ${stats.urlsVisited} URLs in ${stats.endTime - stats.startTime}ms`);
+     * await generateReport(stats);
+     * });
+     * ```
+     */
+    onFinish(handler: (stats: CrawlStats) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when a redirect is followed.
+     * Provides information about the original URL, final URL, and redirect count.
+     *
+     * @param handler - Function to handle redirect events
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onRedirect(async (event) => {
+     * console.log(`Redirect: ${event.originalUrl} -> ${event.finalUrl}`);
+     * trackRedirects(event);
+     * });
+     * ```
+     */
+    onRedirect(handler: (event: RedirectEvent$1) => Promise<void>): Crawler;
     /**
      * Registers a handler for raw response data.
      * Triggered for all responses, providing access to the raw Buffer data.
@@ -7080,21 +7303,23 @@ export declare class Crawler {
     /**
      * Registers a handler for href attributes from anchor and link elements.
      * Automatically resolves relative URLs to absolute URLs.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
-     * @param handler - Function
+     * @param handler - Function receiving href string, with `this` bound to the element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
-     * crawler.onHref(async (href)
+     * crawler.onHref(async function(href) {
      * console.log('Found URL:', href);
+     * console.log('Link text:', this.textContent); // `this` is the anchor/link element
      * if (href.includes('/api/')) {
      * await crawler.visit(href);
      * }
      * });
      * ```
      */
-    onHref(handler:
+    onHref(handler: ElementBoundHandler<string, HTMLAnchorElement | HTMLLinkElement>): Crawler;
     /**
      * Registers a handler for elements matching a CSS selector.
      * Provides fine-grained control over which elements to process.
@@ -7136,55 +7361,57 @@ export declare class Crawler {
     /**
      * Registers a handler for HTML element attributes.
      * Can extract specific attributes from all elements or from elements matching a selector.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
      * @param attribute - The attribute name to extract
-     * @param handler - Function
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @overload
      * @param selection - CSS selector to filter elements
      * @param attribute - The attribute name to extract
-     * @param handler - Function
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
      * // Extract all 'data-id' attributes
-     * crawler.onAttribute('data-id', async (value)
-     * console.log('Found
+     * crawler.onAttribute('data-id', async function(value, attrName) {
+     * console.log('Found', attrName, ':', value, 'on:', this.tagName);
      * });
      *
      * // Extract 'src' attributes from images only
-     * crawler.onAttribute('img', 'src', async (
-     * console.log('Image source:',
+     * crawler.onAttribute('img', 'src', async function(value) {
+     * console.log('Image source:', value, 'alt:', this.getAttribute('alt'));
      * });
      * ```
      */
-    onAttribute(attribute: string, handler:
-    onAttribute(selection: string, attribute: string, handler:
+    onAttribute(attribute: string, handler: AttributeHandler): Crawler;
+    onAttribute(selection: string, attribute: string, handler: AttributeHandler): Crawler;
     /**
      * Registers a handler for text content of elements matching a CSS selector.
      * Extracts and processes the textContent of matching elements.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
      * @param selection - CSS selector to match elements
-     * @param handler - Function
+     * @param handler - Function receiving text string, with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
-     * // Extract all heading text
-     * crawler.onText('h1, h2, h3', async (text)
-     * console.log('Heading:', text.trim());
+     * // Extract all heading text with element context
+     * crawler.onText('h1, h2, h3', async function(text) {
+     * console.log('Heading:', text.trim(), 'Tag:', this.tagName);
      * });
      *
-     * // Extract product prices
-     * crawler.onText('.price', async (
-     * const numericPrice = parseFloat(
-     * console.log('Price
+     * // Extract product prices with element context
+     * crawler.onText('.price', async function(text) {
+     * const numericPrice = parseFloat(text.replace(/[^\d.]/g, ''));
+     * console.log('Price:', numericPrice, 'Product:', this.closest('.product')?.id);
      * });
      * ```
      */
-    onText(selection: string, handler:
+    onText(selection: string, handler: ElementBoundHandler<string>): Crawler;
     private _onBody;
     private _onAttribute;
     private _onText;
@@ -7199,6 +7426,86 @@ export declare class Crawler {
     private _onEmailLeads;
     private _onRawResponse;
     private _onResponse;
+    /**
+     * Calculate adaptive delay based on server response times (AutoThrottle)
+     */
+    private calculateAutoThrottleDelay;
+    /**
+     * Get current AutoThrottle delay for a domain
+     */
+    private getAutoThrottleDelay;
+    /**
+     * Handle 429 Too Many Requests response with Retry-After header parsing
+     */
+    private handle429Response;
+    /**
+     * Check if URL passes all crawl limit checks
+     */
+    private checkCrawlLimits;
+    /**
+     * Check if a link should be followed based on nofollow rules
+     */
+    private shouldFollowLink;
+    /**
+     * Check response size against maxResponseSize limit
+     */
+    private checkResponseSize;
+    /**
+     * Collect data for later export
+     *
+     * @param data - Data to collect (will be added to export buffer)
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onDocument(async (doc) => {
+     * crawler.collect({
+     * title: doc.title,
+     * url: doc.URL,
+     * h1: doc.querySelector('h1')?.textContent
+     * });
+     * });
+     * ```
+     */
+    collect(data: any): Crawler;
+    /**
+     * Get all collected data
+     */
+    getCollectedData(): any[];
+    /**
+     * Clear collected data
+     */
+    clearCollectedData(): Crawler;
+    /**
+     * Export collected data to a file
+     *
+     * @param filePath - Output file path
+     * @param format - Export format: 'json', 'jsonl', or 'csv'
+     *
+     * @example
+     * ```typescript
+     * await crawler.waitForAll();
+     * await crawler.exportData('./output.json', 'json');
+     * await crawler.exportData('./output.csv', 'csv');
+     * ```
+     */
+    exportData(filePath: string, format?: ExportFormat): Promise<void>;
+    /**
+     * Get current crawl statistics
+     */
+    getStats(): CrawlStats;
+    /**
+     * Trigger onStart handlers (called once on first visit)
+     */
+    private triggerStartHandlers;
+    /**
+     * Trigger onFinish handlers
+     */
+    private triggerFinishHandlers;
+    /**
+     * Trigger onRedirect handlers
+     */
+    private triggerRedirectHandlers;
     private buildUrl;
     /**
      * Visits a URL and processes it according to registered event handlers.
@@ -7303,7 +7610,28 @@ export declare class Crawler {
      * ```
      */
     waitForAll(): Promise<void>;
+    /**
+     * Alias for waitForAll() - waits for all crawling operations to complete.
+     * @returns Promise that resolves when done
+     * @example
+     * ```typescript
+     * crawler.visit('https://example.com');
+     * await crawler.done();
+     * ```
+     */
+    done(): Promise<void>;
     close(): Promise<void>;
+    /**
+     * Destroys the crawler instance and releases all resources.
+     * Clears all queued tasks, closes caches, and cleans up event handlers.
+     * @returns Promise that resolves when destruction is complete
+     * @example
+     * ```typescript
+     * await crawler.destroy();
+     * // Crawler is now fully cleaned up
+     * ```
+     */
+    destroy(): Promise<void>;
 }
 
 export {};