rezo 1.0.43 → 1.0.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -15
- package/dist/cache/index.js +0 -3
- package/dist/crawler/addon/decodo/index.cjs +1 -0
- package/dist/crawler/addon/decodo/index.js +1 -0
- package/dist/crawler/crawler-options.cjs +1 -0
- package/dist/crawler/crawler-options.js +1 -0
- package/dist/{plugin → crawler}/crawler.cjs +392 -32
- package/dist/{plugin → crawler}/crawler.js +392 -32
- package/dist/crawler/index.cjs +40 -0
- package/dist/{plugin → crawler}/index.js +4 -2
- package/dist/crawler/plugin/file-cacher.cjs +19 -0
- package/dist/crawler/plugin/file-cacher.js +19 -0
- package/dist/crawler/plugin/index.cjs +1 -0
- package/dist/crawler/plugin/index.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +43 -0
- package/dist/crawler/plugin/navigation-history.js +43 -0
- package/dist/crawler/plugin/robots-txt.cjs +2 -0
- package/dist/crawler/plugin/robots-txt.js +2 -0
- package/dist/crawler/plugin/url-store.cjs +18 -0
- package/dist/crawler/plugin/url-store.js +18 -0
- package/dist/crawler.d.ts +315 -172
- package/dist/entries/crawler.cjs +5 -5
- package/dist/entries/crawler.js +2 -2
- package/dist/index.cjs +27 -27
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/universal/index.cjs +11 -11
- package/package.json +2 -6
- package/dist/cache/file-cacher.cjs +0 -270
- package/dist/cache/file-cacher.js +0 -267
- package/dist/cache/navigation-history.cjs +0 -298
- package/dist/cache/navigation-history.js +0 -296
- package/dist/cache/url-store.cjs +0 -294
- package/dist/cache/url-store.js +0 -291
- package/dist/plugin/addon/decodo/index.cjs +0 -1
- package/dist/plugin/addon/decodo/index.js +0 -1
- package/dist/plugin/crawler-options.cjs +0 -1
- package/dist/plugin/crawler-options.js +0 -1
- package/dist/plugin/index.cjs +0 -36
- package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
- package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
- package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
- package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
- package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
- package/dist/{plugin → crawler}/scraper.cjs +0 -0
- package/dist/{plugin → crawler}/scraper.js +0 -0
package/dist/crawler.d.ts CHANGED
@@ -6,43 +6,24 @@ import { SecureContext, TLSSocket } from 'node:tls';
 import { Cookie as TouchCookie, CookieJar as TouchCookieJar, CreateCookieOptions } from 'tough-cookie';
 
 /**
- *
+ * CrawlerCache - High-performance SQLite-based response caching for web crawlers
  *
- *
- *
+ * Optimized specifically for crawler workloads with:
+ * - WAL mode for high-throughput concurrent reads/writes
+ * - Batch operations for efficient bulk storage
+ * - Domain-based namespacing for organized cache management
+ * - Optional zstd compression for storage efficiency
  *
  * @module cache/file-cacher
 * @author Rezo HTTP Client Library
- *
- * @example
- * ```typescript
- * import { FileCacher } from 'rezo';
- *
- * // Create a file cacher instance
- * const cacher = await FileCacher.create({
- *   cacheDir: './cache',
- *   ttl: 3600000, // 1 hour
- *   compression: true,
- *   encryptNamespace: true
- * });
- *
- * // Store and retrieve data
- * await cacher.set('user:123', { name: 'John' }, 3600000, 'users');
- * const user = await cacher.get('user:123', 'users');
- *
- * // Check existence and cleanup
- * const exists = await cacher.has('user:123', 'users');
- * await cacher.delete('user:123', 'users');
- * await cacher.close();
- * ```
 */
 /**
- * Configuration options for
+ * Configuration options for CrawlerCache
 */
 export interface FileCacherOptions {
     /**
      * Directory path for storing cache databases
-     * @default '
+     * @default '/tmp/rezo-crawler/cache'
      */
     cacheDir?: string;
     /**
@@ -51,23 +32,18 @@ export interface FileCacherOptions {
      */
     ttl?: number;
     /**
-     * Enable zstd compression for stored values
+     * Enable zstd compression for stored values (Node.js 22.15+)
      * Reduces storage size but adds CPU overhead
      * @default false
      */
     compression?: boolean;
     /**
-     *
-     * @default false
-     */
-    softDelete?: boolean;
-    /**
-     * Hash namespace names for privacy/security
+     * Hash namespace names for privacy
      * @default false
      */
     encryptNamespace?: boolean;
     /**
-     * Maximum
+     * Maximum entries per namespace (0 = unlimited)
      * @default 0
      */
     maxEntries?: number;
@@ -77,155 +53,63 @@ declare class FileCacher {
     private readonly options;
     private readonly cacheDir;
     private closed;
-    /**
-     * Private constructor - use FileCacher.create() instead
-     */
     private constructor();
     /**
      * Create a new FileCacher instance
-     *
-     * @param options - Configuration options
-     * @returns Promise resolving to initialized FileCacher instance
-     *
-     * @example
-     * ```typescript
-     * const cacher = await FileCacher.create({
-     *   cacheDir: './my-cache',
-     *   ttl: 3600000,
-     *   compression: true
-     * });
-     * ```
      */
     static create(options?: FileCacherOptions): Promise<FileCacher>;
     /**
-     * Get or create database for a namespace
+     * Get or create optimized database for a namespace (domain)
      */
     private getDatabase;
     /**
-     * Store a
-     *
-     * @param key - Unique key for the cached item
-     * @param value - Value to cache (will be JSON serialized)
-     * @param ttl - Time-to-live in milliseconds (uses default if not specified)
-     * @param namespace - Namespace for isolation (default: 'default')
-     * @returns Promise resolving when stored
-     *
-     * @example
-     * ```typescript
-     * // Store with default TTL
-     * await cacher.set('key1', { data: 'value' });
-     *
-     * // Store with custom TTL and namespace
-     * await cacher.set('key2', responseData, 3600000, 'api-responses');
-     * ```
+     * Store a response in the cache
      */
     set<T = any>(key: string, value: T, ttl?: number, namespace?: string): Promise<void>;
     /**
-     *
-
-
-
-
-
-
-
-     *
-     * if (data) {
-     *   console.log('Cache hit:', data);
-     * }
-     * ```
+     * Store multiple responses in a single transaction (batch operation)
+     */
+    setMany<T = any>(entries: Array<{
+        key: string;
+        value: T;
+        ttl?: number;
+    }>, namespace?: string): Promise<void>;
+    /**
+     * Retrieve a cached response
      */
     get<T = any>(key: string, namespace?: string): Promise<T | null>;
     /**
-     * Check if a key exists
-     *
-     * @param key - Key to check
-     * @param namespace - Namespace to search in (default: 'default')
-     * @returns Promise resolving to true if key exists and is valid
-     *
-     * @example
-     * ```typescript
-     * if (await cacher.has('key1', 'my-namespace')) {
-     *   const data = await cacher.get('key1', 'my-namespace');
-     * }
-     * ```
+     * Check if a key exists and is not expired
      */
     has(key: string, namespace?: string): Promise<boolean>;
+    /**
+     * Check multiple keys at once (batch operation)
+     */
+    hasMany(keys: string[], namespace?: string): Promise<Set<string>>;
     /**
      * Delete a key from the cache
-     *
-     * @param key - Key to delete
-     * @param namespace - Namespace to delete from (default: 'default')
-     * @returns Promise resolving to true if key was deleted
-     *
-     * @example
-     * ```typescript
-     * await cacher.delete('obsolete-key', 'my-namespace');
-     * ```
      */
     delete(key: string, namespace?: string): Promise<boolean>;
     /**
      * Clear all entries in a namespace
-     *
-     * @param namespace - Namespace to clear (default: 'default')
-     * @returns Promise resolving when cleared
-     *
-     * @example
-     * ```typescript
-     * // Clear all cached data for a domain
-     * await cacher.clear('example.com');
-     * ```
      */
     clear(namespace?: string): Promise<void>;
     /**
-     * Remove all expired entries
-     *
-     * @param namespace - Namespace to cleanup (default: 'default')
-     * @returns Promise resolving to number of entries removed
-     *
-     * @example
-     * ```typescript
-     * const removed = await cacher.cleanup('my-namespace');
-     * console.log(`Removed ${removed} expired entries`);
-     * ```
+     * Remove all expired entries
      */
     cleanup(namespace?: string): Promise<number>;
     /**
-     * Get statistics for a namespace
-     *
-     * @param namespace - Namespace to get stats for (default: 'default')
-     * @returns Promise resolving to cache statistics
-     *
-     * @example
-     * ```typescript
-     * const stats = await cacher.stats('my-namespace');
-     * console.log(`${stats.count} entries, ${stats.size} bytes`);
-     * ```
+     * Get cache statistics for a namespace
      */
     stats(namespace?: string): Promise<{
         count: number;
         expired: number;
-        deleted: number;
     }>;
     /**
-     * Close all database connections
-     *
-     * @returns Promise resolving when all connections are closed
-     *
-     * @example
-     * ```typescript
-     * // Always close when done
-     * await cacher.close();
-     * ```
+     * Close all database connections
      */
     close(): Promise<void>;
-    /**
-     * Check if the cacher has been closed
-     */
     get isClosed(): boolean;
-    /**
-     * Get the cache directory path
-     */
     get directory(): string;
 }
 export interface CrawlSession {
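The cache surface is now batch-oriented: `setMany` writes a whole array in one transaction, `hasMany` checks keys in bulk, and `stats()` drops the `deleted` counter. A minimal usage sketch against the declarations above; the `import { FileCacher } from 'rezo'` path is taken from the doc example removed in this version, so treat it as an assumption:

```typescript
import { FileCacher } from 'rezo'; // import path from the removed 1.0.43 example; assumed still valid

const cacher = await FileCacher.create({
  cacheDir: '/tmp/rezo-crawler/cache', // the newly documented default
  ttl: 3_600_000,                      // 1 hour
  compression: true,                   // zstd; the new docs note Node.js 22.15+
});

// Batch write: one transaction for the whole array, per the setMany() declaration
await cacher.setMany(
  [
    { key: 'https://example.com/a', value: { status: 200 } },
    { key: 'https://example.com/b', value: { status: 404 }, ttl: 60_000 },
  ],
  'example.com', // namespaces are framed as per-domain databases
);

// Batch existence check: resolves to the subset of keys present and unexpired
const present: Set<string> = await cacher.hasMany(
  ['https://example.com/a', 'https://example.com/b'],
  'example.com',
);
console.log(present.size);

const { count, expired } = await cacher.stats('example.com'); // `deleted` is no longer reported
console.log(count, expired);
await cacher.close();
```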
@@ -6232,17 +6116,25 @@ declare class Decodo {
     /**
      * Create a new Decodo client instance
      *
-     * @param config - Decodo API configuration
-     * @throws Error if
+     * @param config - Decodo API configuration (supports username/password OR token auth)
+     * @throws Error if authentication credentials are missing
      *
      * @example
      * ```typescript
+     * // Username/password authentication
      * const decodo = new Decodo({
      *   username: 'user',
      *   password: 'password',
      *   headless: 'html',
      *   country: 'US'
      * });
+     *
+     * // Token authentication (alternative)
+     * const decodo = new Decodo({
+     *   token: 'your_api_token',
+     *   headless: 'html',
+     *   country: 'US'
+     * });
      * ```
      */
     constructor(config: DecodoConfig);
@@ -6472,6 +6364,42 @@ export interface ICrawlerOptions {
     } | {
         enable: false;
     } | undefined | false;
+    /** Decodo proxy service configuration for specific domains or global use */
+    decodo?: {
+        enable: true;
+        labs: [
+            {
+                domain: Domain;
+                isGlobal?: boolean;
+                options: DecodoOptions;
+                queueOptions: queueOptions$1;
+            }
+        ];
+    } | {
+        enable: false;
+    } | undefined | false;
+    /** Maximum crawl depth from start URL (0 = unlimited, default: 0) */
+    maxDepth?: number;
+    /** Maximum total URLs to crawl (0 = unlimited, default: 0) */
+    maxUrls?: number;
+    /** Maximum response size in bytes to process (0 = unlimited, default: 0) */
+    maxResponseSize?: number;
+    /** Respect robots.txt rules (default: false) */
+    respectRobotsTxt?: boolean;
+    /** Follow rel="nofollow" links (default: false - ignores nofollow links) */
+    followNofollow?: boolean;
+    /** Enable automatic throttling based on server response times (default: true) */
+    autoThrottle?: boolean;
+    /** Target request delay in ms for AutoThrottle (default: 1000) */
+    autoThrottleTargetDelay?: number;
+    /** Minimum delay between requests in ms (default: 100) */
+    autoThrottleMinDelay?: number;
+    /** Maximum delay between requests in ms (default: 60000) */
+    autoThrottleMaxDelay?: number;
+    /** Maximum time to wait on 429 response in ms (default: 1800000 = 30 min) */
+    maxWaitOn429?: number;
+    /** Always wait on 429 regardless of time, shows warning (default: false) */
+    alwaysWaitOn429?: boolean;
 }
 /**
  * Advanced web crawler configuration class with support for domain-specific settings
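Beyond the Decodo addon block, 1.0.44 adds crawl-limit and politeness knobs to `ICrawlerOptions`. A hedged configuration sketch using only fields and defaults visible in this hunk; the `rezo/crawler` import subpath is an assumption inferred from the `dist/entries/crawler.cjs` entry shown at the end of this diff:

```typescript
import { Crawler } from 'rezo/crawler'; // subpath assumed from dist/entries/crawler.cjs

const crawler = new Crawler({
  maxDepth: 3,                      // stop 3 link-hops from the start URL (0 = unlimited)
  maxUrls: 10_000,                  // hard cap on total URLs (0 = unlimited)
  maxResponseSize: 5 * 1024 * 1024, // skip bodies over 5 MB (0 = unlimited)
  respectRobotsTxt: true,           // default is false
  followNofollow: false,            // default: rel="nofollow" links are ignored
  autoThrottle: true,               // adaptive delays from server response times
  autoThrottleTargetDelay: 1_000,   // defaults as documented above
  autoThrottleMinDelay: 100,
  autoThrottleMaxDelay: 60_000,
  maxWaitOn429: 30 * 60 * 1000,     // give up waiting on a 429 after 30 minutes
  alwaysWaitOn429: false,
});
```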
@@ -6548,6 +6476,28 @@ export declare class CrawlerOptions {
     throwFatalError?: boolean;
     /** Enable debug logging */
     debug?: boolean;
+    /** Maximum crawl depth from start URL (0 = unlimited) */
+    maxDepth: number;
+    /** Maximum total URLs to crawl (0 = unlimited) */
+    maxUrls: number;
+    /** Maximum response size in bytes to process (0 = unlimited) */
+    maxResponseSize: number;
+    /** Respect robots.txt rules */
+    respectRobotsTxt: boolean;
+    /** Follow rel="nofollow" links */
+    followNofollow: boolean;
+    /** Enable automatic throttling based on server response times */
+    autoThrottle: boolean;
+    /** Target request delay in ms for AutoThrottle */
+    autoThrottleTargetDelay: number;
+    /** Minimum delay between requests in ms */
+    autoThrottleMinDelay: number;
+    /** Maximum delay between requests in ms */
+    autoThrottleMaxDelay: number;
+    /** Maximum time to wait on 429 response in ms */
+    maxWaitOn429: number;
+    /** Always wait on 429 regardless of time */
+    alwaysWaitOn429: boolean;
     /** Internal storage for Oxylabs configurations with domain mapping */
     oxylabs: {
         domain?: Domain;
@@ -6929,13 +6879,44 @@ export interface EmailDiscoveryEvent {
     discoveredAt: string;
     timestamp: Date;
 }
+interface RedirectEvent$1 {
+    originalUrl: string;
+    finalUrl: string;
+    redirectCount: number;
+    statusCode: number;
+}
+/**
+ * Export format options
+ */
+export type ExportFormat = "json" | "jsonl" | "csv";
 /**
- *
- *
+ * Handler with element bound to `this` context.
+ * Use `function` syntax (not arrow functions) to access `this`.
  *
- * @
+ * @example
+ * ```typescript
+ * crawler.onText('h1', async function(text) {
+ *   console.log(text, this.tagName); // `this` is the element
+ * });
+ * ```
+ */
+export type ElementBoundHandler<TValue, TElement = Element> = (this: TElement, value: TValue) => Promise<void>;
+/**
+ * Handler for attribute extraction with element bound to `this`.
+ * Receives both the attribute value and attribute name.
  */
-export type
+export type AttributeHandler = (this: Element, value: string, attributeName: string) => Promise<void>;
+/**
+ * Crawl statistics
+ */
+export interface CrawlStats {
+    urlsVisited: number;
+    urlsQueued: number;
+    urlsFailed: number;
+    startTime: number;
+    endTime?: number;
+    currentDepth: number;
+}
 /**
  * A powerful web crawler that provides event-driven HTML parsing and data extraction.
  * Supports caching, proxy rotation, retry mechanisms, and email lead discovery.
@@ -6992,6 +6973,25 @@ export declare class Crawler {
     /** Adapter-specific request executor */
     private adapterExecutor;
     private adapterType;
+    /** Track pending execute() calls for proper done() behavior */
+    private pendingExecutions;
+    /** robots.txt parser and validator */
+    private robotsTxt;
+    /** AutoThrottle: track response times per domain for adaptive rate limiting */
+    private domainResponseTimes;
+    private domainCurrentDelay;
+    /** Crawl statistics */
+    private crawlStats;
+    /** URL depth tracking for maxDepth limit */
+    private urlDepthMap;
+    /** Lifecycle event handlers */
+    private startHandlers;
+    private finishHandlers;
+    private redirectHandlers;
+    /** Data collection for export */
+    private collectedData;
+    /** Flag to track if crawl has started */
+    private crawlStarted;
     /**
      * Creates a new Crawler instance with the specified configuration.
      *
@@ -7160,6 +7160,54 @@
      * ```
      */
     onEmailLeads(handler: (emails: string[]) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called before crawling starts.
+     * Useful for initialization, logging, or setup tasks.
+     *
+     * @param handler - Function to call before crawling begins
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onStart(async () => {
+     *   console.log('Crawl session started');
+     *   await initializeDatabase();
+     * });
+     * ```
+     */
+    onStart(handler: () => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when crawling finishes.
+     * Receives crawl statistics including URLs visited, failed, and timing.
+     *
+     * @param handler - Function to call when crawling completes
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onFinish(async (stats) => {
+     *   console.log(`Crawl completed: ${stats.urlsVisited} URLs in ${stats.endTime - stats.startTime}ms`);
+     *   await generateReport(stats);
+     * });
+     * ```
+     */
+    onFinish(handler: (stats: CrawlStats) => Promise<void>): Crawler;
+    /**
+     * Registers a handler called when a redirect is followed.
+     * Provides information about the original URL, final URL, and redirect count.
+     *
+     * @param handler - Function to handle redirect events
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onRedirect(async (event) => {
+     *   console.log(`Redirect: ${event.originalUrl} -> ${event.finalUrl}`);
+     *   trackRedirects(event);
+     * });
+     * ```
+     */
+    onRedirect(handler: (event: RedirectEvent$1) => Promise<void>): Crawler;
     /**
      * Registers a handler for raw response data.
      * Triggered for all responses, providing access to the raw Buffer data.
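The three new lifecycle hooks chain like the existing `on*` registrations. A short sketch based only on the signatures above:

```typescript
crawler
  .onStart(async () => {
    console.log('crawl starting'); // fired once, before crawling begins
  })
  .onRedirect(async (event) => {
    // event: { originalUrl, finalUrl, redirectCount, statusCode }
    console.log(`${event.originalUrl} -> ${event.finalUrl} (${event.redirectCount} hops)`);
  })
  .onFinish(async (stats) => {
    // stats: the CrawlStats interface added earlier in this diff
    const elapsed = (stats.endTime ?? Date.now()) - stats.startTime;
    console.log(`${stats.urlsVisited} visited, ${stats.urlsFailed} failed in ${elapsed}ms`);
  });
```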
@@ -7255,21 +7303,23 @@
     /**
      * Registers a handler for href attributes from anchor and link elements.
      * Automatically resolves relative URLs to absolute URLs.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
-     * @param handler - Function
+     * @param handler - Function receiving href string, with `this` bound to the element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
-     * crawler.onHref(async (href)
+     * crawler.onHref(async function(href) {
      *   console.log('Found URL:', href);
+     *   console.log('Link text:', this.textContent); // `this` is the anchor/link element
      *   if (href.includes('/api/')) {
      *     await crawler.visit(href);
      *   }
      * });
      * ```
      */
-    onHref(handler:
+    onHref(handler: ElementBoundHandler<string, HTMLAnchorElement | HTMLLinkElement>): Crawler;
     /**
      * Registers a handler for elements matching a CSS selector.
      * Provides fine-grained control over which elements to process.
@@ -7311,55 +7361,57 @@
     /**
      * Registers a handler for HTML element attributes.
      * Can extract specific attributes from all elements or from elements matching a selector.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
      * @param attribute - The attribute name to extract
-     * @param handler - Function
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @overload
      * @param selection - CSS selector to filter elements
      * @param attribute - The attribute name to extract
-     * @param handler - Function
+     * @param handler - Function receiving (value, attrName), with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
      * // Extract all 'data-id' attributes
-     * crawler.onAttribute('data-id', async (value)
-     * console.log('Found
+     * crawler.onAttribute('data-id', async function(value, attrName) {
+     *   console.log('Found', attrName, ':', value, 'on:', this.tagName);
      * });
      *
      * // Extract 'src' attributes from images only
-     * crawler.onAttribute('img', 'src', async (
-     * console.log('Image source:',
+     * crawler.onAttribute('img', 'src', async function(value) {
+     *   console.log('Image source:', value, 'alt:', this.getAttribute('alt'));
      * });
      * ```
      */
-    onAttribute(attribute: string, handler:
-    onAttribute(selection: string, attribute: string, handler:
+    onAttribute(attribute: string, handler: AttributeHandler): Crawler;
+    onAttribute(selection: string, attribute: string, handler: AttributeHandler): Crawler;
     /**
      * Registers a handler for text content of elements matching a CSS selector.
      * Extracts and processes the textContent of matching elements.
+     * Use `function` syntax (not arrow) to access `this` as the element.
      *
      * @param selection - CSS selector to match elements
-     * @param handler - Function
+     * @param handler - Function receiving text string, with `this` bound to element
      * @returns The crawler instance for method chaining
      *
      * @example
      * ```typescript
-     * // Extract all heading text
-     * crawler.onText('h1, h2, h3', async (text)
-     * console.log('Heading:', text.trim());
+     * // Extract all heading text with element context
+     * crawler.onText('h1, h2, h3', async function(text) {
+     *   console.log('Heading:', text.trim(), 'Tag:', this.tagName);
      * });
      *
-     * // Extract product prices
-     * crawler.onText('.price', async (
-     * const numericPrice = parseFloat(
-     * console.log('Price
+     * // Extract product prices with element context
+     * crawler.onText('.price', async function(text) {
+     *   const numericPrice = parseFloat(text.replace(/[^\d.]/g, ''));
+     *   console.log('Price:', numericPrice, 'Product:', this.closest('.product')?.id);
      * });
      * ```
      */
-    onText(selection: string, handler:
+    onText(selection: string, handler: ElementBoundHandler<string>): Crawler;
     private _onBody;
     private _onAttribute;
     private _onText;
@@ -7374,6 +7426,86 @@
     private _onEmailLeads;
     private _onRawResponse;
     private _onResponse;
+    /**
+     * Calculate adaptive delay based on server response times (AutoThrottle)
+     */
+    private calculateAutoThrottleDelay;
+    /**
+     * Get current AutoThrottle delay for a domain
+     */
+    private getAutoThrottleDelay;
+    /**
+     * Handle 429 Too Many Requests response with Retry-After header parsing
+     */
+    private handle429Response;
+    /**
+     * Check if URL passes all crawl limit checks
+     */
+    private checkCrawlLimits;
+    /**
+     * Check if a link should be followed based on nofollow rules
+     */
+    private shouldFollowLink;
+    /**
+     * Check response size against maxResponseSize limit
+     */
+    private checkResponseSize;
+    /**
+     * Collect data for later export
+     *
+     * @param data - Data to collect (will be added to export buffer)
+     * @returns The crawler instance for method chaining
+     *
+     * @example
+     * ```typescript
+     * crawler.onDocument(async (doc) => {
+     *   crawler.collect({
+     *     title: doc.title,
+     *     url: doc.URL,
+     *     h1: doc.querySelector('h1')?.textContent
+     *   });
+     * });
+     * ```
+     */
+    collect(data: any): Crawler;
+    /**
+     * Get all collected data
+     */
+    getCollectedData(): any[];
+    /**
+     * Clear collected data
+     */
+    clearCollectedData(): Crawler;
+    /**
+     * Export collected data to a file
+     *
+     * @param filePath - Output file path
+     * @param format - Export format: 'json', 'jsonl', or 'csv'
+     *
+     * @example
+     * ```typescript
+     * await crawler.waitForAll();
+     * await crawler.exportData('./output.json', 'json');
+     * await crawler.exportData('./output.csv', 'csv');
+     * ```
+     */
+    exportData(filePath: string, format?: ExportFormat): Promise<void>;
+    /**
+     * Get current crawl statistics
+     */
+    getStats(): CrawlStats;
+    /**
+     * Trigger onStart handlers (called once on first visit)
+     */
+    private triggerStartHandlers;
+    /**
+     * Trigger onFinish handlers
+     */
+    private triggerFinishHandlers;
+    /**
+     * Trigger onRedirect handlers
+     */
+    private triggerRedirectHandlers;
     private buildUrl;
     /**
      * Visits a URL and processes it according to registered event handlers.
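Combined, the new collection and export methods support a gather-then-dump workflow. The sketch below mirrors the `collect()` and `exportData()` doc examples above; `onDocument` and `waitForAll` are taken from those examples, and `destroy()` is declared in the next hunk:

```typescript
crawler.onDocument(async (doc) => {
  // Buffer one record per page for later export
  crawler.collect({
    url: doc.URL,
    title: doc.title,
    h1: doc.querySelector('h1')?.textContent ?? null,
  });
});

await crawler.visit('https://example.com');
await crawler.waitForAll(); // as used in the exportData() doc example

await crawler.exportData('./pages.jsonl', 'jsonl'); // ExportFormat: 'json' | 'jsonl' | 'csv'
console.log(crawler.getStats()); // CrawlStats snapshot
await crawler.destroy();         // new in 1.0.44: full resource teardown
```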
@@ -7489,6 +7621,17 @@
      */
     done(): Promise<void>;
     close(): Promise<void>;
+    /**
+     * Destroys the crawler instance and releases all resources.
+     * Clears all queued tasks, closes caches, and cleans up event handlers.
+     * @returns Promise that resolves when destruction is complete
+     * @example
+     * ```typescript
+     * await crawler.destroy();
+     * // Crawler is now fully cleaned up
+     * ```
+     */
+    destroy(): Promise<void>;
 }
 
 export {};
package/dist/entries/crawler.cjs CHANGED
@@ -1,5 +1,5 @@
-const
-exports.Crawler =
-const
-exports.CrawlerOptions =
-exports.Domain =
+const _mod_398eir = require('../crawler/crawler.cjs');
+exports.Crawler = _mod_398eir.Crawler;;
+const _mod_m4xc57 = require('../crawler/crawler-options.cjs');
+exports.CrawlerOptions = _mod_m4xc57.CrawlerOptions;
+exports.Domain = _mod_m4xc57.Domain;;
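The rewritten entry only re-points its internal `require` targets at the new `crawler/` directory (the deleted lines are truncated in this rendering, but the `{plugin → crawler}` renames above imply they referenced the old `plugin/` paths). Consumer-facing imports through this entry should therefore be unaffected, assuming the package.json `exports` map still routes a `rezo/crawler` subpath to `dist/entries/crawler.cjs`:

```typescript
// Unchanged consumer code across 1.0.43 -> 1.0.44 (subpath assumed from the entry file name)
import { Crawler, CrawlerOptions, Domain } from 'rezo/crawler';
```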