rezo 1.0.73 → 1.0.75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +4 -1
- package/dist/adapters/entries/fetch.d.ts +4 -1
- package/dist/adapters/entries/http.d.ts +4 -1
- package/dist/adapters/entries/http2.d.ts +4 -1
- package/dist/adapters/entries/react-native.d.ts +4 -1
- package/dist/adapters/entries/xhr.d.ts +4 -1
- package/dist/adapters/http.cjs +2 -1
- package/dist/adapters/http.js +2 -1
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +92 -11
- package/dist/crawler/crawler.js +92 -11
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler.d.ts +105 -0
- package/dist/entries/crawler.cjs +4 -4
- package/dist/errors/rezo-error.cjs +3 -72
- package/dist/errors/rezo-error.js +3 -72
- package/dist/index.cjs +30 -30
- package/dist/index.d.ts +4 -1
- package/dist/internal/agents/bun-socks-http.cjs +573 -0
- package/dist/internal/agents/bun-socks-http.js +570 -0
- package/dist/internal/agents/index.cjs +14 -10
- package/dist/internal/agents/index.js +1 -0
- package/dist/platform/browser.d.ts +4 -1
- package/dist/platform/bun.d.ts +4 -1
- package/dist/platform/deno.d.ts +4 -1
- package/dist/platform/node.d.ts +4 -1
- package/dist/platform/react-native.d.ts +4 -1
- package/dist/platform/worker.d.ts +4 -1
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +4 -1
- package/dist/queue/queue.js +4 -1
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +35 -0
- package/dist/utils/agent-pool.js +35 -0
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/index.cjs +49 -49
- package/dist/wget/index.d.ts +3 -0
- package/package.json +1 -1
package/dist/crawler/crawler.js
CHANGED
|
@@ -9,7 +9,7 @@ import { CappedMap } from './plugin/capped-map.js';
|
|
|
9
9
|
import { CappedArray } from './plugin/capped-array.js';
|
|
10
10
|
import { parseHTML } from "linkedom";
|
|
11
11
|
import path from "node:path";
|
|
12
|
-
import
|
|
12
|
+
import rezo from '../adapters/entries/http.js';
|
|
13
13
|
import { RezoQueue } from '../queue/queue.js';
|
|
14
14
|
import { Scraper } from './scraper.js';
|
|
15
15
|
import { CrawlerOptions } from './crawler-options.js';
|
|
@@ -85,6 +85,7 @@ export class Crawler {
|
|
|
85
85
|
startHandlers = [];
|
|
86
86
|
finishHandlers = [];
|
|
87
87
|
redirectHandlers = [];
|
|
88
|
+
queueChangeHandlers = [];
|
|
88
89
|
collectedData = new CappedArray({
|
|
89
90
|
maxSize: 1e5,
|
|
90
91
|
evictionRatio: 0.1,
|
|
@@ -94,20 +95,28 @@ export class Crawler {
|
|
|
94
95
|
});
|
|
95
96
|
crawlStarted = false;
|
|
96
97
|
startHandlersPromise = null;
|
|
97
|
-
constructor(crawlerOptions, http =
|
|
98
|
+
constructor(crawlerOptions, http = rezo.create()) {
|
|
98
99
|
this.http = http;
|
|
99
100
|
this.config = new CrawlerOptions(crawlerOptions);
|
|
100
101
|
this.adapterType = this.config.adapter;
|
|
101
102
|
const concurrency = this.config.concurrency;
|
|
102
103
|
this.queue = new RezoQueue({
|
|
103
|
-
|
|
104
|
-
|
|
104
|
+
name: "crawler",
|
|
105
|
+
concurrency
|
|
105
106
|
});
|
|
106
107
|
this.originalConcurrency = concurrency;
|
|
107
108
|
this.scraperQueue = new RezoQueue({
|
|
108
|
-
|
|
109
|
-
|
|
109
|
+
name: "scraper",
|
|
110
|
+
concurrency: this.config.scraperConcurrency
|
|
110
111
|
});
|
|
112
|
+
this._subscribeToQueueEvents(this.queue, "crawler");
|
|
113
|
+
this._subscribeToQueueEvents(this.scraperQueue, "scraper");
|
|
114
|
+
this.config.onLimiterAdded = (queue) => {
|
|
115
|
+
if (!this.subscribedLimiterQueues.has(queue)) {
|
|
116
|
+
this._subscribeToQueueEvents(queue, "limiter");
|
|
117
|
+
this.subscribedLimiterQueues.add(queue);
|
|
118
|
+
}
|
|
119
|
+
};
|
|
111
120
|
this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
|
|
112
121
|
this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
|
|
113
122
|
const enableCache = this.config.enableCache;
|
|
@@ -178,7 +187,9 @@ export class Crawler {
|
|
|
178
187
|
if (this.config.baseUrl) {
|
|
179
188
|
this.urlDepthMap.set(this.config.baseUrl, 0);
|
|
180
189
|
}
|
|
181
|
-
this.
|
|
190
|
+
if (this.config.enableSignalHandlers) {
|
|
191
|
+
this.registerShutdownHandlers();
|
|
192
|
+
}
|
|
182
193
|
}
|
|
183
194
|
registerShutdownHandlers() {
|
|
184
195
|
if (this.shutdownHandler)
|
|
@@ -227,6 +238,7 @@ export class Crawler {
|
|
|
227
238
|
}
|
|
228
239
|
await this.destroy();
|
|
229
240
|
console.log("[Crawler] Graceful shutdown complete");
|
|
241
|
+
process.exit(0);
|
|
230
242
|
}
|
|
231
243
|
async initializeAdapter() {
|
|
232
244
|
try {
|
|
@@ -595,6 +607,11 @@ export class Crawler {
|
|
|
595
607
|
this.redirectHandlers.push(handler);
|
|
596
608
|
return this;
|
|
597
609
|
}
|
|
610
|
+
onQueueChange(handler) {
|
|
611
|
+
this.queueChangeHandlers.push(handler);
|
|
612
|
+
this._subscribeToLimiterQueues();
|
|
613
|
+
return this;
|
|
614
|
+
}
|
|
598
615
|
onRawData(handler) {
|
|
599
616
|
this.rawResponseEvents.push({
|
|
600
617
|
handler: "_onRawResponse",
|
|
@@ -665,6 +682,52 @@ export class Crawler {
|
|
|
665
682
|
});
|
|
666
683
|
return this;
|
|
667
684
|
}
|
|
685
|
+
subscribedLimiterQueues = new Set;
|
|
686
|
+
_subscribeToQueueEvents(queue, queueType) {
|
|
687
|
+
const emitEvent = (event, taskId) => {
|
|
688
|
+
if (this.queueChangeHandlers.length === 0)
|
|
689
|
+
return;
|
|
690
|
+
const state = queue.state;
|
|
691
|
+
const queueChangeEvent = {
|
|
692
|
+
queueName: queue.name,
|
|
693
|
+
queueType,
|
|
694
|
+
event,
|
|
695
|
+
pending: state.pending,
|
|
696
|
+
size: state.size,
|
|
697
|
+
total: state.total,
|
|
698
|
+
isPaused: state.isPaused,
|
|
699
|
+
isIdle: state.isIdle,
|
|
700
|
+
taskId
|
|
701
|
+
};
|
|
702
|
+
for (const handler of this.queueChangeHandlers) {
|
|
703
|
+
try {
|
|
704
|
+
handler(queueChangeEvent);
|
|
705
|
+
} catch (err) {
|
|
706
|
+
if (this.config.debug)
|
|
707
|
+
console.error("[Crawler] onQueueChange handler error:", err);
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
};
|
|
711
|
+
queue.on("add", (data) => emitEvent("add", data.id));
|
|
712
|
+
queue.on("start", (data) => emitEvent("start", data.id));
|
|
713
|
+
queue.on("completed", (data) => emitEvent("completed", data.id));
|
|
714
|
+
queue.on("error", (data) => emitEvent("error", data.id));
|
|
715
|
+
queue.on("timeout", (data) => emitEvent("timeout", data.id));
|
|
716
|
+
queue.on("cancelled", (data) => emitEvent("cancelled", data.id));
|
|
717
|
+
queue.on("idle", () => emitEvent("idle"));
|
|
718
|
+
queue.on("active", () => emitEvent("active"));
|
|
719
|
+
queue.on("paused", () => emitEvent("paused"));
|
|
720
|
+
queue.on("resumed", () => emitEvent("resumed"));
|
|
721
|
+
}
|
|
722
|
+
_subscribeToLimiterQueues() {
|
|
723
|
+
const limiters = this.config.getLimiters();
|
|
724
|
+
for (const limiter of limiters) {
|
|
725
|
+
if (!this.subscribedLimiterQueues.has(limiter.pqueue)) {
|
|
726
|
+
this._subscribeToQueueEvents(limiter.pqueue, "limiter");
|
|
727
|
+
this.subscribedLimiterQueues.add(limiter.pqueue);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
}
|
|
668
731
|
_onBody(handler, document) {
|
|
669
732
|
this.queue.add(() => handler(document.body));
|
|
670
733
|
}
|
|
@@ -1056,8 +1119,6 @@ export class Crawler {
|
|
|
1056
1119
|
const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
|
|
1057
1120
|
this.addToNavigationQueue(url, method, body, headersObj);
|
|
1058
1121
|
}
|
|
1059
|
-
if (url.includes(`/www.yellowpages.com/search?`))
|
|
1060
|
-
console.log("Visiting: ", url);
|
|
1061
1122
|
this.crawlStarted = true;
|
|
1062
1123
|
if (deepEmailFinder) {
|
|
1063
1124
|
this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
|
|
@@ -1120,7 +1181,16 @@ export class Crawler {
|
|
|
1120
1181
|
}
|
|
1121
1182
|
this.crawlStats.urlsQueued++;
|
|
1122
1183
|
const domain = new URL(url).hostname;
|
|
1123
|
-
const
|
|
1184
|
+
const limiterRandomDelay = this.config.getRandomDelay(url, true);
|
|
1185
|
+
let delay = 0;
|
|
1186
|
+
if (limiterRandomDelay !== undefined && limiterRandomDelay > 0) {
|
|
1187
|
+
delay = Math.floor(Math.random() * limiterRandomDelay);
|
|
1188
|
+
if (this.config.debug) {
|
|
1189
|
+
console.log(`[RandomDelay] ${domain}: ${delay}ms (max: ${limiterRandomDelay}ms)`);
|
|
1190
|
+
}
|
|
1191
|
+
} else {
|
|
1192
|
+
delay = this.getAutoThrottleDelay(domain);
|
|
1193
|
+
}
|
|
1124
1194
|
if (delay > 0) {
|
|
1125
1195
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
1126
1196
|
}
|
|
@@ -1133,7 +1203,7 @@ export class Crawler {
|
|
|
1133
1203
|
return;
|
|
1134
1204
|
}
|
|
1135
1205
|
const requestStartTime = Date.now();
|
|
1136
|
-
const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) :
|
|
1206
|
+
const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : method === "GET" ? await this.http.get(url, options) : method === "PATCH" ? await this.http.patch(url, body, options) : method === "POST" ? await this.http.post(url, body, options) : await this.http.put(url, body, options);
|
|
1137
1207
|
if (!response) {
|
|
1138
1208
|
this.crawlStats.urlsFailed++;
|
|
1139
1209
|
this.healthMetrics.recordRequest(Date.now() - requestStartTime, false);
|
|
@@ -1268,6 +1338,17 @@ export class Crawler {
|
|
|
1268
1338
|
}
|
|
1269
1339
|
}
|
|
1270
1340
|
async waitForAll() {
|
|
1341
|
+
if (!this.crawlStarted) {
|
|
1342
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
1343
|
+
const maxWaitForStart = 1000;
|
|
1344
|
+
const startWait = Date.now();
|
|
1345
|
+
while (!this.crawlStarted && Date.now() - startWait < maxWaitForStart) {
|
|
1346
|
+
await new Promise((resolve) => setTimeout(resolve, 10));
|
|
1347
|
+
}
|
|
1348
|
+
if (!this.crawlStarted) {
|
|
1349
|
+
return;
|
|
1350
|
+
}
|
|
1351
|
+
}
|
|
1271
1352
|
const MIN_DELAY = 50;
|
|
1272
1353
|
const MAX_DELAY = 500;
|
|
1273
1354
|
let currentDelay = MIN_DELAY;
|
package/dist/crawler/index.cjs
CHANGED
|
@@ -1,40 +1,40 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Crawler =
|
|
3
|
-
const
|
|
4
|
-
exports.CrawlerOptions =
|
|
5
|
-
const
|
|
6
|
-
exports.RobotsTxt =
|
|
7
|
-
const
|
|
8
|
-
exports.FileCacher =
|
|
9
|
-
const
|
|
10
|
-
exports.UrlStore =
|
|
11
|
-
const
|
|
12
|
-
exports.NavigationHistory =
|
|
13
|
-
const
|
|
14
|
-
exports.Oxylabs =
|
|
15
|
-
const
|
|
16
|
-
exports.OXYLABS_BROWSER_TYPES =
|
|
17
|
-
exports.OXYLABS_COMMON_LOCALES =
|
|
18
|
-
exports.OXYLABS_COMMON_GEO_LOCATIONS =
|
|
19
|
-
exports.OXYLABS_US_STATES =
|
|
20
|
-
exports.OXYLABS_EUROPEAN_COUNTRIES =
|
|
21
|
-
exports.OXYLABS_ASIAN_COUNTRIES =
|
|
22
|
-
exports.getRandomOxylabsBrowserType =
|
|
23
|
-
exports.getRandomOxylabsLocale =
|
|
24
|
-
exports.getRandomOxylabsGeoLocation =
|
|
25
|
-
const
|
|
26
|
-
exports.Decodo =
|
|
27
|
-
const
|
|
28
|
-
exports.DECODO_DEVICE_TYPES =
|
|
29
|
-
exports.DECODO_HEADLESS_MODES =
|
|
30
|
-
exports.DECODO_COMMON_LOCALES =
|
|
31
|
-
exports.DECODO_COMMON_COUNTRIES =
|
|
32
|
-
exports.DECODO_EUROPEAN_COUNTRIES =
|
|
33
|
-
exports.DECODO_ASIAN_COUNTRIES =
|
|
34
|
-
exports.DECODO_US_STATES =
|
|
35
|
-
exports.DECODO_COMMON_CITIES =
|
|
36
|
-
exports.getRandomDecodoDeviceType =
|
|
37
|
-
exports.getRandomDecodoLocale =
|
|
38
|
-
exports.getRandomDecodoCountry =
|
|
39
|
-
exports.getRandomDecodoCity =
|
|
40
|
-
exports.generateDecodoSessionId =
|
|
1
|
+
const _mod_5bvyq1 = require('./crawler.cjs');
|
|
2
|
+
exports.Crawler = _mod_5bvyq1.Crawler;;
|
|
3
|
+
const _mod_4xvaze = require('./crawler-options.cjs');
|
|
4
|
+
exports.CrawlerOptions = _mod_4xvaze.CrawlerOptions;;
|
|
5
|
+
const _mod_iv735a = require('./plugin/robots-txt.cjs');
|
|
6
|
+
exports.RobotsTxt = _mod_iv735a.RobotsTxt;;
|
|
7
|
+
const _mod_gfu8y8 = require('./plugin/file-cacher.cjs');
|
|
8
|
+
exports.FileCacher = _mod_gfu8y8.FileCacher;;
|
|
9
|
+
const _mod_e921br = require('./plugin/url-store.cjs');
|
|
10
|
+
exports.UrlStore = _mod_e921br.UrlStore;;
|
|
11
|
+
const _mod_zhqmsp = require('./plugin/navigation-history.cjs');
|
|
12
|
+
exports.NavigationHistory = _mod_zhqmsp.NavigationHistory;;
|
|
13
|
+
const _mod_pv5ztj = require('./addon/oxylabs/index.cjs');
|
|
14
|
+
exports.Oxylabs = _mod_pv5ztj.Oxylabs;;
|
|
15
|
+
const _mod_9qqy6e = require('./addon/oxylabs/options.cjs');
|
|
16
|
+
exports.OXYLABS_BROWSER_TYPES = _mod_9qqy6e.OXYLABS_BROWSER_TYPES;
|
|
17
|
+
exports.OXYLABS_COMMON_LOCALES = _mod_9qqy6e.OXYLABS_COMMON_LOCALES;
|
|
18
|
+
exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_9qqy6e.OXYLABS_COMMON_GEO_LOCATIONS;
|
|
19
|
+
exports.OXYLABS_US_STATES = _mod_9qqy6e.OXYLABS_US_STATES;
|
|
20
|
+
exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_9qqy6e.OXYLABS_EUROPEAN_COUNTRIES;
|
|
21
|
+
exports.OXYLABS_ASIAN_COUNTRIES = _mod_9qqy6e.OXYLABS_ASIAN_COUNTRIES;
|
|
22
|
+
exports.getRandomOxylabsBrowserType = _mod_9qqy6e.getRandomBrowserType;
|
|
23
|
+
exports.getRandomOxylabsLocale = _mod_9qqy6e.getRandomLocale;
|
|
24
|
+
exports.getRandomOxylabsGeoLocation = _mod_9qqy6e.getRandomGeoLocation;;
|
|
25
|
+
const _mod_tth5gh = require('./addon/decodo/index.cjs');
|
|
26
|
+
exports.Decodo = _mod_tth5gh.Decodo;;
|
|
27
|
+
const _mod_fr8mjs = require('./addon/decodo/options.cjs');
|
|
28
|
+
exports.DECODO_DEVICE_TYPES = _mod_fr8mjs.DECODO_DEVICE_TYPES;
|
|
29
|
+
exports.DECODO_HEADLESS_MODES = _mod_fr8mjs.DECODO_HEADLESS_MODES;
|
|
30
|
+
exports.DECODO_COMMON_LOCALES = _mod_fr8mjs.DECODO_COMMON_LOCALES;
|
|
31
|
+
exports.DECODO_COMMON_COUNTRIES = _mod_fr8mjs.DECODO_COMMON_COUNTRIES;
|
|
32
|
+
exports.DECODO_EUROPEAN_COUNTRIES = _mod_fr8mjs.DECODO_EUROPEAN_COUNTRIES;
|
|
33
|
+
exports.DECODO_ASIAN_COUNTRIES = _mod_fr8mjs.DECODO_ASIAN_COUNTRIES;
|
|
34
|
+
exports.DECODO_US_STATES = _mod_fr8mjs.DECODO_US_STATES;
|
|
35
|
+
exports.DECODO_COMMON_CITIES = _mod_fr8mjs.DECODO_COMMON_CITIES;
|
|
36
|
+
exports.getRandomDecodoDeviceType = _mod_fr8mjs.getRandomDeviceType;
|
|
37
|
+
exports.getRandomDecodoLocale = _mod_fr8mjs.getRandomLocale;
|
|
38
|
+
exports.getRandomDecodoCountry = _mod_fr8mjs.getRandomCountry;
|
|
39
|
+
exports.getRandomDecodoCity = _mod_fr8mjs.getRandomCity;
|
|
40
|
+
exports.generateDecodoSessionId = _mod_fr8mjs.generateSessionId;;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var e=require("./file-cacher.cjs");exports.FileCacher=e.FileCacher;var r=require("./url-store.cjs");exports.UrlStore=r.UrlStore;var o=require("./result-stream.cjs");exports.ResultStream=o.ResultStream;var t=require("./memory-monitor.cjs");exports.MemoryMonitor=t.MemoryMonitor;var
|
|
1
|
+
var e=require("./file-cacher.cjs");exports.FileCacher=e.FileCacher;var r=require("./url-store.cjs");exports.UrlStore=r.UrlStore;var o=require("./result-stream.cjs");exports.ResultStream=o.ResultStream;var t=require("./memory-monitor.cjs");exports.MemoryMonitor=t.MemoryMonitor;var c=require("./health-metrics.cjs");exports.HealthMetrics=c.HealthMetrics;var m=require("./capped-map.cjs");exports.CappedMap=m.CappedMap;var a=require("./capped-array.cjs");exports.CappedArray=a.CappedArray;
|
package/dist/crawler.d.ts
CHANGED
|
@@ -2065,6 +2065,8 @@ declare class RezoError<T = any> extends Error {
|
|
|
2065
2065
|
* Queue configuration options
|
|
2066
2066
|
*/
|
|
2067
2067
|
export interface QueueConfig {
|
|
2068
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2069
|
+
name?: string;
|
|
2068
2070
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2069
2071
|
concurrency?: number;
|
|
2070
2072
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2195,6 +2197,7 @@ declare class RezoQueue<T = any> {
|
|
|
2195
2197
|
private isPausedFlag;
|
|
2196
2198
|
private intervalId?;
|
|
2197
2199
|
private intervalCount;
|
|
2200
|
+
readonly name: string;
|
|
2198
2201
|
private intervalStart;
|
|
2199
2202
|
private eventHandlers;
|
|
2200
2203
|
private statsData;
|
|
@@ -4579,6 +4582,8 @@ declare class Rezo {
|
|
|
4579
4582
|
*/
|
|
4580
4583
|
export type IProxy = RezoRequestConfig["proxy"];
|
|
4581
4584
|
interface queueOptions$1 {
|
|
4585
|
+
/** Queue name (optional) - if not provided, a random name will be generated */
|
|
4586
|
+
name?: string;
|
|
4582
4587
|
/** Maximum concurrent requests */
|
|
4583
4588
|
concurrency?: number;
|
|
4584
4589
|
/** Interval in milliseconds between batches */
|
|
@@ -4593,6 +4598,20 @@ interface queueOptions$1 {
|
|
|
4593
4598
|
autoStart?: boolean;
|
|
4594
4599
|
/** Carry over concurrency count between intervals */
|
|
4595
4600
|
carryoverConcurrencyCount?: boolean;
|
|
4601
|
+
/**
|
|
4602
|
+
* Random delay in milliseconds added to each request (0 to randomDelay).
|
|
4603
|
+
* When configured on a limiter, this takes
|
|
4604
|
+
* priority over autoThrottle for the matching domain.
|
|
4605
|
+
* @example
|
|
4606
|
+
* ```typescript
|
|
4607
|
+
* // Add 0-2000ms random jitter between requests
|
|
4608
|
+
* crawler.config.addLimiter({
|
|
4609
|
+
* domain: 'api.example.com',
|
|
4610
|
+
* options: { concurrency: 2, randomDelay: 2000 }
|
|
4611
|
+
* });
|
|
4612
|
+
* ```
|
|
4613
|
+
*/
|
|
4614
|
+
randomDelay?: number;
|
|
4596
4615
|
}
|
|
4597
4616
|
/**
|
|
4598
4617
|
* Crawler response structure
|
|
@@ -6504,6 +6523,8 @@ export interface ICrawlerOptions {
|
|
|
6504
6523
|
} | {
|
|
6505
6524
|
enable: false;
|
|
6506
6525
|
} | undefined | false;
|
|
6526
|
+
/** Enable graceful shutdown handlers for SIGINT/SIGTERM - saves session state (default: false) */
|
|
6527
|
+
enableSignalHandlers?: boolean;
|
|
6507
6528
|
/** Maximum concurrent requests for crawler (default: 100) */
|
|
6508
6529
|
concurrency?: number;
|
|
6509
6530
|
/** Maximum concurrent requests for scraper - separate queue (default: same as concurrency) */
|
|
@@ -6606,6 +6627,8 @@ export declare class CrawlerOptions {
|
|
|
6606
6627
|
throwFatalError?: boolean;
|
|
6607
6628
|
/** Enable debug logging */
|
|
6608
6629
|
debug?: boolean;
|
|
6630
|
+
/** Enable graceful shutdown handlers for SIGINT/SIGTERM - saves session state */
|
|
6631
|
+
enableSignalHandlers: boolean;
|
|
6609
6632
|
/** Maximum concurrent requests for crawler (default: 100) */
|
|
6610
6633
|
concurrency: number;
|
|
6611
6634
|
/** Maximum concurrent requests for scraper (default: same as concurrency) */
|
|
@@ -6648,6 +6671,8 @@ export declare class CrawlerOptions {
|
|
|
6648
6671
|
private proxies;
|
|
6649
6672
|
/** Internal storage for rate limiter configurations with domain mapping */
|
|
6650
6673
|
private limiters;
|
|
6674
|
+
/** Callback invoked when a new limiter is added (set by Crawler for onQueueChange) */
|
|
6675
|
+
onLimiterAdded?: (queue: RezoQueue) => void;
|
|
6651
6676
|
/** Internal storage for custom header configurations with domain mapping */
|
|
6652
6677
|
private requestHeaders;
|
|
6653
6678
|
/**
|
|
@@ -6902,6 +6927,32 @@ export declare class CrawlerOptions {
|
|
|
6902
6927
|
* ```
|
|
6903
6928
|
*/
|
|
6904
6929
|
destroyLimiters(): void;
|
|
6930
|
+
/**
|
|
6931
|
+
* Get all configured limiter queues
|
|
6932
|
+
* @description Returns all RezoQueue instances created by addLimiter().
|
|
6933
|
+
* Useful for subscribing to queue events across all limiters.
|
|
6934
|
+
* @returns Array of limiter configurations with their RezoQueue instances
|
|
6935
|
+
* @example
|
|
6936
|
+
* ```typescript
|
|
6937
|
+
* const limiters = options.getLimiters();
|
|
6938
|
+
* for (const limiter of limiters) {
|
|
6939
|
+
* limiter.pqueue.on('completed', () => console.log('Task done'));
|
|
6940
|
+
* }
|
|
6941
|
+
* ```
|
|
6942
|
+
*/
|
|
6943
|
+
getLimiters(): ReadonlyArray<{
|
|
6944
|
+
domain?: Domain;
|
|
6945
|
+
isGlobal?: boolean;
|
|
6946
|
+
pqueue: RezoQueue;
|
|
6947
|
+
randomDelay?: number;
|
|
6948
|
+
}>;
|
|
6949
|
+
/**
|
|
6950
|
+
* Get the randomDelay configured for a specific URL's domain
|
|
6951
|
+
* @param url - The URL to check
|
|
6952
|
+
* @param useGlobal - Whether to fall back to global limiters
|
|
6953
|
+
* @returns The randomDelay in milliseconds, or undefined if not configured
|
|
6954
|
+
*/
|
|
6955
|
+
getRandomDelay(url: string, useGlobal?: boolean): number | undefined;
|
|
6905
6956
|
/**
|
|
6906
6957
|
* Clear all global configurations from headers, proxies, limiters, Decodo, and Oxylabs
|
|
6907
6958
|
* @returns The CrawlerOptions instance for method chaining
|
|
@@ -7090,6 +7141,30 @@ export interface CrawlStats {
|
|
|
7090
7141
|
endTime?: number;
|
|
7091
7142
|
currentDepth: number;
|
|
7092
7143
|
}
|
|
7144
|
+
/**
|
|
7145
|
+
* Queue change event data passed to onQueueChange handler.
|
|
7146
|
+
* Fires when any queue's state changes (main crawler queue, scraper queue, or limiter queues).
|
|
7147
|
+
*/
|
|
7148
|
+
export interface QueueChangeEvent {
|
|
7149
|
+
/** Name of the queue that changed */
|
|
7150
|
+
queueName: string;
|
|
7151
|
+
/** Type of queue: 'crawler', 'scraper', or 'limiter' */
|
|
7152
|
+
queueType: "crawler" | "scraper" | "limiter";
|
|
7153
|
+
/** The event that triggered this change */
|
|
7154
|
+
event: "add" | "start" | "completed" | "error" | "timeout" | "cancelled" | "idle" | "active" | "paused" | "resumed";
|
|
7155
|
+
/** Number of tasks currently running */
|
|
7156
|
+
pending: number;
|
|
7157
|
+
/** Number of tasks waiting in queue */
|
|
7158
|
+
size: number;
|
|
7159
|
+
/** Total tasks (pending + size) */
|
|
7160
|
+
total: number;
|
|
7161
|
+
/** Is queue paused */
|
|
7162
|
+
isPaused: boolean;
|
|
7163
|
+
/** Is queue idle (no tasks) */
|
|
7164
|
+
isIdle: boolean;
|
|
7165
|
+
/** Task ID if event is task-specific (add, start, completed, error, timeout, cancelled) */
|
|
7166
|
+
taskId?: string;
|
|
7167
|
+
}
|
|
7093
7168
|
/**
|
|
7094
7169
|
* A powerful web crawler that provides event-driven HTML parsing and data extraction.
|
|
7095
7170
|
* Supports caching, proxy rotation, retry mechanisms, and email lead discovery.
|
|
@@ -7180,6 +7255,7 @@ export declare class Crawler {
|
|
|
7180
7255
|
private startHandlers;
|
|
7181
7256
|
private finishHandlers;
|
|
7182
7257
|
private redirectHandlers;
|
|
7258
|
+
private queueChangeHandlers;
|
|
7183
7259
|
/** Data collection for export - bounded to prevent memory issues */
|
|
7184
7260
|
private collectedData;
|
|
7185
7261
|
/** Flag to track if crawl has started */
|
|
@@ -7442,6 +7518,22 @@ export declare class Crawler {
|
|
|
7442
7518
|
* ```
|
|
7443
7519
|
*/
|
|
7444
7520
|
onRedirect(handler: (event: RedirectEvent$1) => Promise<void>): Crawler;
|
|
7521
|
+
/**
|
|
7522
|
+
* Registers a handler called when any queue's state changes.
|
|
7523
|
+
* Fires for the main crawler queue, scraper queue, and all limiter queues.
|
|
7524
|
+
* Each event reports only the specific queue that changed (not aggregated).
|
|
7525
|
+
*
|
|
7526
|
+
* @param handler - Function to handle queue change events (sync callback)
|
|
7527
|
+
* @returns The crawler instance for method chaining
|
|
7528
|
+
*
|
|
7529
|
+
* @example
|
|
7530
|
+
* ```typescript
|
|
7531
|
+
* crawler.onQueueChange((event) => {
|
|
7532
|
+
* console.log(`[${event.queueType}:${event.queueName}] ${event.event} - pending: ${event.pending}, size: ${event.size}`);
|
|
7533
|
+
* });
|
|
7534
|
+
* ```
|
|
7535
|
+
*/
|
|
7536
|
+
onQueueChange(handler: (event: QueueChangeEvent) => void): Crawler;
|
|
7445
7537
|
/**
|
|
7446
7538
|
* Registers a handler for raw response data.
|
|
7447
7539
|
* Triggered for all responses, providing access to the raw Buffer data.
|
|
@@ -7646,6 +7738,19 @@ export declare class Crawler {
|
|
|
7646
7738
|
* ```
|
|
7647
7739
|
*/
|
|
7648
7740
|
onText(selection: string, handler: ElementBoundHandler<string>): Crawler;
|
|
7741
|
+
/** Track which limiter queues we've already subscribed to */
|
|
7742
|
+
private subscribedLimiterQueues;
|
|
7743
|
+
/**
|
|
7744
|
+
* Subscribe to a queue's events to emit QueueChangeEvent
|
|
7745
|
+
* @param queue - The RezoQueue to subscribe to
|
|
7746
|
+
* @param queueType - Type of queue: 'crawler', 'scraper', or 'limiter'
|
|
7747
|
+
*/
|
|
7748
|
+
private _subscribeToQueueEvents;
|
|
7749
|
+
/**
|
|
7750
|
+
* Subscribe to all limiter queues that haven't been subscribed to yet.
|
|
7751
|
+
* Called when onQueueChange handlers are registered to ensure limiter events are captured.
|
|
7752
|
+
*/
|
|
7753
|
+
private _subscribeToLimiterQueues;
|
|
7649
7754
|
private _onBody;
|
|
7650
7755
|
private _onAttribute;
|
|
7651
7756
|
private _onText;
|
package/dist/entries/crawler.cjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Crawler =
|
|
3
|
-
const
|
|
4
|
-
exports.CrawlerOptions =
|
|
1
|
+
const _mod_ezbo5o = require('../crawler/crawler.cjs');
|
|
2
|
+
exports.Crawler = _mod_ezbo5o.Crawler;;
|
|
3
|
+
const _mod_c2caeq = require('../crawler/crawler-options.cjs');
|
|
4
|
+
exports.CrawlerOptions = _mod_c2caeq.CrawlerOptions;;
|
|
@@ -553,9 +553,9 @@ function cleanStackTrace(stack) {
|
|
|
553
553
|
class RezoError extends Error {
|
|
554
554
|
constructor(message, config, code, request, response) {
|
|
555
555
|
super();
|
|
556
|
-
Object.defineProperty(this, "config", { value: config, enumerable:
|
|
557
|
-
Object.defineProperty(this, "request", { value: request, enumerable:
|
|
558
|
-
Object.defineProperty(this, "response", { value: response, enumerable:
|
|
556
|
+
Object.defineProperty(this, "config", { value: config, enumerable: !!config });
|
|
557
|
+
Object.defineProperty(this, "request", { value: request, enumerable: !!request });
|
|
558
|
+
Object.defineProperty(this, "response", { value: response, enumerable: !!response });
|
|
559
559
|
Object.defineProperty(this, "isRezoError", { value: true, enumerable: false });
|
|
560
560
|
if (code) {
|
|
561
561
|
Object.defineProperty(this, "code", { value: code, enumerable: true });
|
|
@@ -600,75 +600,6 @@ class RezoError extends Error {
|
|
|
600
600
|
}
|
|
601
601
|
}
|
|
602
602
|
}
|
|
603
|
-
[Symbol.for("nodejs.util.inspect.custom")](_depth, options) {
|
|
604
|
-
const parts = [];
|
|
605
|
-
const isDebug = this.config?.debug === true;
|
|
606
|
-
const inspect = options?.stylize ? (v) => require("util").inspect(v, { depth: 3, colors: true }) : JSON.stringify;
|
|
607
|
-
parts.push(`${this.name}: ${this.message}`);
|
|
608
|
-
if (this.code)
|
|
609
|
-
parts.push(` code: '${this.code}'`);
|
|
610
|
-
if (this.method)
|
|
611
|
-
parts.push(` method: '${this.method}'`);
|
|
612
|
-
if (this.url)
|
|
613
|
-
parts.push(` url: '${this.url}'`);
|
|
614
|
-
if (this.finalUrl && this.finalUrl !== this.url) {
|
|
615
|
-
parts.push(` finalUrl: '${this.finalUrl}'`);
|
|
616
|
-
}
|
|
617
|
-
if (this.status)
|
|
618
|
-
parts.push(` status: ${this.status}`);
|
|
619
|
-
if (this.statusText)
|
|
620
|
-
parts.push(` statusText: '${this.statusText}'`);
|
|
621
|
-
if (this.urls && this.urls.length > 1) {
|
|
622
|
-
parts.push(` urls: [${this.urls.map((u) => `'${u}'`).join(", ")}]`);
|
|
623
|
-
}
|
|
624
|
-
if (this.suggestion)
|
|
625
|
-
parts.push(` suggestion: ${this.suggestion}`);
|
|
626
|
-
if (isDebug) {
|
|
627
|
-
parts.push("");
|
|
628
|
-
parts.push(" --- Debug Info ---");
|
|
629
|
-
if (this.cause) {
|
|
630
|
-
const causeMsg = typeof this.cause === "string" ? this.cause : this.cause?.message || String(this.cause);
|
|
631
|
-
parts.push(` cause: ${causeMsg}`);
|
|
632
|
-
}
|
|
633
|
-
if (this.errno)
|
|
634
|
-
parts.push(` errno: ${this.errno}`);
|
|
635
|
-
if (this.hostname)
|
|
636
|
-
parts.push(` hostname: '${this.hostname}'`);
|
|
637
|
-
if (this.port)
|
|
638
|
-
parts.push(` port: ${this.port}`);
|
|
639
|
-
if (this.address)
|
|
640
|
-
parts.push(` address: '${this.address}'`);
|
|
641
|
-
if (this.syscall)
|
|
642
|
-
parts.push(` syscall: '${this.syscall}'`);
|
|
643
|
-
if (this.response) {
|
|
644
|
-
parts.push("");
|
|
645
|
-
parts.push(" --- Response ---");
|
|
646
|
-
parts.push(` response.status: ${this.response.status}`);
|
|
647
|
-
parts.push(` response.statusText: '${this.response.statusText || ""}'`);
|
|
648
|
-
parts.push(` response.finalUrl: '${this.response.finalUrl || ""}'`);
|
|
649
|
-
if (this.response.headers) {
|
|
650
|
-
parts.push(` response.headers: ${inspect(this.response.headers)}`);
|
|
651
|
-
}
|
|
652
|
-
if (this.response.data !== undefined) {
|
|
653
|
-
const dataStr = typeof this.response.data === "string" ? this.response.data.substring(0, 500) + (this.response.data.length > 500 ? "..." : "") : inspect(this.response.data);
|
|
654
|
-
parts.push(` response.data: ${dataStr}`);
|
|
655
|
-
}
|
|
656
|
-
}
|
|
657
|
-
if (this.response?.config) {
|
|
658
|
-
parts.push("");
|
|
659
|
-
parts.push(" --- Request Config ---");
|
|
660
|
-
const { cookieJar, ...configWithoutJar } = this.response.config;
|
|
661
|
-
parts.push(` config: ${inspect(configWithoutJar)}`);
|
|
662
|
-
}
|
|
663
|
-
if (this.stack) {
|
|
664
|
-
parts.push("");
|
|
665
|
-
parts.push(" --- Stack Trace ---");
|
|
666
|
-
parts.push(this.stack);
|
|
667
|
-
}
|
|
668
|
-
}
|
|
669
|
-
return parts.join(`
|
|
670
|
-
`);
|
|
671
|
-
}
|
|
672
603
|
static isRezoError(error) {
|
|
673
604
|
return error instanceof RezoError || error !== null && typeof error === "object" && error.isRezoError === true;
|
|
674
605
|
}
|
|
@@ -553,9 +553,9 @@ function cleanStackTrace(stack) {
|
|
|
553
553
|
export class RezoError extends Error {
|
|
554
554
|
constructor(message, config, code, request, response) {
|
|
555
555
|
super();
|
|
556
|
-
Object.defineProperty(this, "config", { value: config, enumerable:
|
|
557
|
-
Object.defineProperty(this, "request", { value: request, enumerable:
|
|
558
|
-
Object.defineProperty(this, "response", { value: response, enumerable:
|
|
556
|
+
Object.defineProperty(this, "config", { value: config, enumerable: !!config });
|
|
557
|
+
Object.defineProperty(this, "request", { value: request, enumerable: !!request });
|
|
558
|
+
Object.defineProperty(this, "response", { value: response, enumerable: !!response });
|
|
559
559
|
Object.defineProperty(this, "isRezoError", { value: true, enumerable: false });
|
|
560
560
|
if (code) {
|
|
561
561
|
Object.defineProperty(this, "code", { value: code, enumerable: true });
|
|
@@ -600,75 +600,6 @@ export class RezoError extends Error {
|
|
|
600
600
|
}
|
|
601
601
|
}
|
|
602
602
|
}
|
|
603
|
-
[Symbol.for("nodejs.util.inspect.custom")](_depth, options) {
|
|
604
|
-
const parts = [];
|
|
605
|
-
const isDebug = this.config?.debug === true;
|
|
606
|
-
const inspect = options?.stylize ? (v) => require("util").inspect(v, { depth: 3, colors: true }) : JSON.stringify;
|
|
607
|
-
parts.push(`${this.name}: ${this.message}`);
|
|
608
|
-
if (this.code)
|
|
609
|
-
parts.push(` code: '${this.code}'`);
|
|
610
|
-
if (this.method)
|
|
611
|
-
parts.push(` method: '${this.method}'`);
|
|
612
|
-
if (this.url)
|
|
613
|
-
parts.push(` url: '${this.url}'`);
|
|
614
|
-
if (this.finalUrl && this.finalUrl !== this.url) {
|
|
615
|
-
parts.push(` finalUrl: '${this.finalUrl}'`);
|
|
616
|
-
}
|
|
617
|
-
if (this.status)
|
|
618
|
-
parts.push(` status: ${this.status}`);
|
|
619
|
-
if (this.statusText)
|
|
620
|
-
parts.push(` statusText: '${this.statusText}'`);
|
|
621
|
-
if (this.urls && this.urls.length > 1) {
|
|
622
|
-
parts.push(` urls: [${this.urls.map((u) => `'${u}'`).join(", ")}]`);
|
|
623
|
-
}
|
|
624
|
-
if (this.suggestion)
|
|
625
|
-
parts.push(` suggestion: ${this.suggestion}`);
|
|
626
|
-
if (isDebug) {
|
|
627
|
-
parts.push("");
|
|
628
|
-
parts.push(" --- Debug Info ---");
|
|
629
|
-
if (this.cause) {
|
|
630
|
-
const causeMsg = typeof this.cause === "string" ? this.cause : this.cause?.message || String(this.cause);
|
|
631
|
-
parts.push(` cause: ${causeMsg}`);
|
|
632
|
-
}
|
|
633
|
-
if (this.errno)
|
|
634
|
-
parts.push(` errno: ${this.errno}`);
|
|
635
|
-
if (this.hostname)
|
|
636
|
-
parts.push(` hostname: '${this.hostname}'`);
|
|
637
|
-
if (this.port)
|
|
638
|
-
parts.push(` port: ${this.port}`);
|
|
639
|
-
if (this.address)
|
|
640
|
-
parts.push(` address: '${this.address}'`);
|
|
641
|
-
if (this.syscall)
|
|
642
|
-
parts.push(` syscall: '${this.syscall}'`);
|
|
643
|
-
if (this.response) {
|
|
644
|
-
parts.push("");
|
|
645
|
-
parts.push(" --- Response ---");
|
|
646
|
-
parts.push(` response.status: ${this.response.status}`);
|
|
647
|
-
parts.push(` response.statusText: '${this.response.statusText || ""}'`);
|
|
648
|
-
parts.push(` response.finalUrl: '${this.response.finalUrl || ""}'`);
|
|
649
|
-
if (this.response.headers) {
|
|
650
|
-
parts.push(` response.headers: ${inspect(this.response.headers)}`);
|
|
651
|
-
}
|
|
652
|
-
if (this.response.data !== undefined) {
|
|
653
|
-
const dataStr = typeof this.response.data === "string" ? this.response.data.substring(0, 500) + (this.response.data.length > 500 ? "..." : "") : inspect(this.response.data);
|
|
654
|
-
parts.push(` response.data: ${dataStr}`);
|
|
655
|
-
}
|
|
656
|
-
}
|
|
657
|
-
if (this.response?.config) {
|
|
658
|
-
parts.push("");
|
|
659
|
-
parts.push(" --- Request Config ---");
|
|
660
|
-
const { cookieJar, ...configWithoutJar } = this.response.config;
|
|
661
|
-
parts.push(` config: ${inspect(configWithoutJar)}`);
|
|
662
|
-
}
|
|
663
|
-
if (this.stack) {
|
|
664
|
-
parts.push("");
|
|
665
|
-
parts.push(" --- Stack Trace ---");
|
|
666
|
-
parts.push(this.stack);
|
|
667
|
-
}
|
|
668
|
-
}
|
|
669
|
-
return parts.join(`
|
|
670
|
-
`);
|
|
671
|
-
}
|
|
672
603
|
static isRezoError(error) {
|
|
673
604
|
return error instanceof RezoError || error !== null && typeof error === "object" && error.isRezoError === true;
|
|
674
605
|
}
|