rezo 1.0.72 → 1.0.73
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +13 -2
- package/dist/adapters/entries/fetch.d.ts +13 -2
- package/dist/adapters/entries/http.d.ts +13 -2
- package/dist/adapters/entries/http2.d.ts +13 -2
- package/dist/adapters/entries/react-native.d.ts +13 -2
- package/dist/adapters/entries/xhr.d.ts +13 -2
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +307 -86
- package/dist/crawler/crawler.js +307 -86
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler/plugin/capped-array.cjs +1 -0
- package/dist/crawler/plugin/capped-array.js +1 -0
- package/dist/crawler/plugin/capped-map.cjs +1 -0
- package/dist/crawler/plugin/capped-map.js +1 -0
- package/dist/crawler/plugin/file-cacher.cjs +20 -18
- package/dist/crawler/plugin/file-cacher.js +20 -18
- package/dist/crawler/plugin/health-metrics.cjs +2 -0
- package/dist/crawler/plugin/health-metrics.js +2 -0
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler/plugin/index.js +1 -1
- package/dist/crawler/plugin/memory-monitor.cjs +1 -0
- package/dist/crawler/plugin/memory-monitor.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +5 -5
- package/dist/crawler/plugin/navigation-history.js +3 -3
- package/dist/crawler/plugin/result-stream.cjs +5 -0
- package/dist/crawler/plugin/result-stream.js +5 -0
- package/dist/crawler/plugin/sqlite-utils.cjs +1 -0
- package/dist/crawler/plugin/sqlite-utils.js +1 -0
- package/dist/crawler/plugin/url-store.cjs +5 -5
- package/dist/crawler/plugin/url-store.js +5 -5
- package/dist/crawler/scraper.cjs +1 -1
- package/dist/crawler/scraper.js +1 -1
- package/dist/crawler.d.ts +148 -25
- package/dist/entries/crawler.cjs +4 -4
- package/dist/index.cjs +30 -30
- package/dist/index.d.ts +13 -2
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +13 -2
- package/dist/platform/bun.d.ts +13 -2
- package/dist/platform/deno.d.ts +13 -2
- package/dist/platform/node.d.ts +13 -2
- package/dist/platform/react-native.d.ts +13 -2
- package/dist/platform/worker.d.ts +13 -2
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +58 -13
- package/dist/queue/queue.js +58 -13
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +2 -0
- package/dist/utils/agent-pool.js +2 -0
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/index.cjs +49 -49
- package/dist/wget/index.d.ts +12 -1
- package/package.json +1 -1
package/dist/crawler/crawler.cjs
CHANGED
|
@@ -3,6 +3,10 @@ const { FileCacher } = require('./plugin/file-cacher.cjs');
|
|
|
3
3
|
const { UrlStore } = require('./plugin/url-store.cjs');
|
|
4
4
|
const { NavigationHistory } = require('./plugin/navigation-history.cjs');
|
|
5
5
|
const { RobotsTxt } = require('./plugin/robots-txt.cjs');
|
|
6
|
+
const { MemoryMonitor } = require('./plugin/memory-monitor.cjs');
|
|
7
|
+
const { HealthMetrics } = require('./plugin/health-metrics.cjs');
|
|
8
|
+
const { CappedMap } = require('./plugin/capped-map.cjs');
|
|
9
|
+
const { CappedArray } = require('./plugin/capped-array.cjs');
|
|
6
10
|
const { parseHTML } = require("linkedom");
|
|
7
11
|
const path = require("node:path");
|
|
8
12
|
const { Rezo } = require('../adapters/entries/http.cjs');
|
|
@@ -43,6 +47,7 @@ class Crawler {
|
|
|
43
47
|
emailLeadsEvents = [];
|
|
44
48
|
cacher = null;
|
|
45
49
|
queue;
|
|
50
|
+
scraperQueue;
|
|
46
51
|
isCacheEnabled;
|
|
47
52
|
config;
|
|
48
53
|
urlStorage;
|
|
@@ -56,13 +61,12 @@ class Crawler {
|
|
|
56
61
|
navigationHistoryInitPromise = null;
|
|
57
62
|
adapterExecutor = null;
|
|
58
63
|
adapterType;
|
|
59
|
-
pendingExecutions = new Set;
|
|
60
|
-
pendingVisitCount = 0;
|
|
61
64
|
isDestroyed = false;
|
|
62
|
-
|
|
65
|
+
shutdownRequested = false;
|
|
66
|
+
queueOptions = { concurrency: 100 };
|
|
63
67
|
robotsTxt;
|
|
64
|
-
domainResponseTimes = new
|
|
65
|
-
domainCurrentDelay = new
|
|
68
|
+
domainResponseTimes = new CappedMap({ maxSize: 500 });
|
|
69
|
+
domainCurrentDelay = new CappedMap({ maxSize: 500 });
|
|
66
70
|
crawlStats = {
|
|
67
71
|
urlsVisited: 0,
|
|
68
72
|
urlsQueued: 0,
|
|
@@ -70,19 +74,42 @@ class Crawler {
|
|
|
70
74
|
startTime: 0,
|
|
71
75
|
currentDepth: 0
|
|
72
76
|
};
|
|
73
|
-
urlDepthMap = new
|
|
77
|
+
urlDepthMap = new CappedMap({ maxSize: 50000 });
|
|
78
|
+
cleanupInterval;
|
|
79
|
+
checkpointInterval;
|
|
80
|
+
lastCheckpointTime = 0;
|
|
81
|
+
memoryMonitor;
|
|
82
|
+
healthMetrics;
|
|
83
|
+
originalConcurrency = 100;
|
|
84
|
+
shutdownHandler = null;
|
|
74
85
|
startHandlers = [];
|
|
75
86
|
finishHandlers = [];
|
|
76
87
|
redirectHandlers = [];
|
|
77
|
-
collectedData =
|
|
88
|
+
collectedData = new CappedArray({
|
|
89
|
+
maxSize: 1e5,
|
|
90
|
+
evictionRatio: 0.1,
|
|
91
|
+
onEviction: (evicted, remaining) => {
|
|
92
|
+
console.warn(`[Crawler] collectedData auto-evicted ${evicted.length} oldest entries. ${remaining} entries remaining. Consider using exportData() more frequently.`);
|
|
93
|
+
}
|
|
94
|
+
});
|
|
78
95
|
crawlStarted = false;
|
|
96
|
+
startHandlersPromise = null;
|
|
79
97
|
constructor(crawlerOptions, http = new Rezo) {
|
|
80
98
|
this.http = http;
|
|
81
|
-
this.queue = new RezoQueue({
|
|
82
|
-
concurrency: 1000
|
|
83
|
-
});
|
|
84
99
|
this.config = new CrawlerOptions(crawlerOptions);
|
|
85
100
|
this.adapterType = this.config.adapter;
|
|
101
|
+
const concurrency = this.config.concurrency;
|
|
102
|
+
this.queue = new RezoQueue({
|
|
103
|
+
concurrency,
|
|
104
|
+
timeout: 60000
|
|
105
|
+
});
|
|
106
|
+
this.originalConcurrency = concurrency;
|
|
107
|
+
this.scraperQueue = new RezoQueue({
|
|
108
|
+
concurrency: this.config.scraperConcurrency,
|
|
109
|
+
timeout: 60000
|
|
110
|
+
});
|
|
111
|
+
this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
|
|
112
|
+
this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
|
|
86
113
|
const enableCache = this.config.enableCache;
|
|
87
114
|
this.isCacheEnabled = enableCache;
|
|
88
115
|
if (enableCache) {
|
|
@@ -94,7 +121,7 @@ class Crawler {
|
|
|
94
121
|
FileCacher.create({
|
|
95
122
|
cacheDir: dbUrl,
|
|
96
123
|
ttl: cacheTTL,
|
|
97
|
-
|
|
124
|
+
maxEntries: 1e5
|
|
98
125
|
}).then((storage) => {
|
|
99
126
|
this.cacher = storage;
|
|
100
127
|
this.isCacheReady = true;
|
|
@@ -151,6 +178,55 @@ class Crawler {
|
|
|
151
178
|
if (this.config.baseUrl) {
|
|
152
179
|
this.urlDepthMap.set(this.config.baseUrl, 0);
|
|
153
180
|
}
|
|
181
|
+
this.registerShutdownHandlers();
|
|
182
|
+
}
|
|
183
|
+
registerShutdownHandlers() {
|
|
184
|
+
if (this.shutdownHandler)
|
|
185
|
+
return;
|
|
186
|
+
this.shutdownHandler = () => this.gracefulShutdown();
|
|
187
|
+
process.on("SIGINT", this.shutdownHandler);
|
|
188
|
+
process.on("SIGTERM", this.shutdownHandler);
|
|
189
|
+
}
|
|
190
|
+
removeShutdownHandlers() {
|
|
191
|
+
if (this.shutdownHandler) {
|
|
192
|
+
process.off("SIGINT", this.shutdownHandler);
|
|
193
|
+
process.off("SIGTERM", this.shutdownHandler);
|
|
194
|
+
this.shutdownHandler = null;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
async gracefulShutdown() {
|
|
198
|
+
if (this.shutdownRequested || this.isDestroyed)
|
|
199
|
+
return;
|
|
200
|
+
this.shutdownRequested = true;
|
|
201
|
+
console.log(`
|
|
202
|
+
[Crawler] Shutdown requested, finishing current tasks...`);
|
|
203
|
+
this.queue.pause();
|
|
204
|
+
this.scraperQueue.pause();
|
|
205
|
+
const timeoutPromise = new Promise((resolve) => {
|
|
206
|
+
setTimeout(() => {
|
|
207
|
+
console.log("[Crawler] Shutdown timeout (5s), forcing exit...");
|
|
208
|
+
resolve();
|
|
209
|
+
}, 5000);
|
|
210
|
+
});
|
|
211
|
+
await Promise.race([
|
|
212
|
+
Promise.all([this.queue.onIdle(), this.scraperQueue.onIdle()]),
|
|
213
|
+
timeoutPromise
|
|
214
|
+
]);
|
|
215
|
+
if (this.navigationHistory && this.currentSession) {
|
|
216
|
+
try {
|
|
217
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
|
|
218
|
+
await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
|
|
219
|
+
urlsVisited: this.crawlStats.urlsVisited,
|
|
220
|
+
urlsQueued: this.queue.size,
|
|
221
|
+
urlsFailed: this.crawlStats.urlsFailed
|
|
222
|
+
});
|
|
223
|
+
console.log(`[Crawler] Session saved: ${this.currentSession.sessionId}`);
|
|
224
|
+
} catch (err) {
|
|
225
|
+
console.warn("[Crawler] Failed to save session state:", err);
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
await this.destroy();
|
|
229
|
+
console.log("[Crawler] Graceful shutdown complete");
|
|
154
230
|
}
|
|
155
231
|
async initializeAdapter() {
|
|
156
232
|
try {
|
|
@@ -166,7 +242,6 @@ class Crawler {
|
|
|
166
242
|
if (!this.isDestroyed)
|
|
167
243
|
return;
|
|
168
244
|
this.queue = new RezoQueue(this.queueOptions);
|
|
169
|
-
this.pendingExecutions.clear();
|
|
170
245
|
this.isDestroyed = false;
|
|
171
246
|
if (this.config.debug) {
|
|
172
247
|
console.log("[Crawler] Restored from destroyed state");
|
|
@@ -342,17 +417,110 @@ class Crawler {
|
|
|
342
417
|
handler(data);
|
|
343
418
|
});
|
|
344
419
|
}
|
|
345
|
-
async waitForCache() {
|
|
420
|
+
async waitForCache(timeoutMs = 30000) {
|
|
346
421
|
if (this.isCacheReady)
|
|
347
422
|
return;
|
|
348
|
-
|
|
349
|
-
|
|
423
|
+
const start = Date.now();
|
|
424
|
+
while (!this.isCacheReady) {
|
|
425
|
+
if (Date.now() - start > timeoutMs) {
|
|
426
|
+
console.warn("[Crawler] Cache initialization timeout, continuing without cache");
|
|
427
|
+
this.isCacheReady = true;
|
|
428
|
+
return;
|
|
429
|
+
}
|
|
430
|
+
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
431
|
+
}
|
|
350
432
|
}
|
|
351
|
-
async waitForStorage() {
|
|
433
|
+
async waitForStorage(timeoutMs = 30000) {
|
|
352
434
|
if (this.isStorageReady)
|
|
353
435
|
return;
|
|
354
|
-
|
|
355
|
-
|
|
436
|
+
const start = Date.now();
|
|
437
|
+
while (!this.isStorageReady) {
|
|
438
|
+
if (Date.now() - start > timeoutMs) {
|
|
439
|
+
console.warn("[Crawler] Storage initialization timeout, continuing without URL tracking");
|
|
440
|
+
this.isStorageReady = true;
|
|
441
|
+
return;
|
|
442
|
+
}
|
|
443
|
+
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
startPeriodicCleanup() {
|
|
447
|
+
if (this.cleanupInterval)
|
|
448
|
+
return;
|
|
449
|
+
this.cleanupInterval = setInterval(() => {
|
|
450
|
+
const memStatus = this.memoryMonitor.check();
|
|
451
|
+
if (memStatus === "critical") {
|
|
452
|
+
this.queue.pause();
|
|
453
|
+
this.memoryMonitor.forceGC();
|
|
454
|
+
if (this.config.debug) {
|
|
455
|
+
const report = this.memoryMonitor.getReport();
|
|
456
|
+
console.warn(`[Crawler] CRITICAL memory (${report.usagePercent}%), pausing...`);
|
|
457
|
+
}
|
|
458
|
+
setTimeout(() => {
|
|
459
|
+
this.queue.concurrency = Math.max(5, Math.floor(this.originalConcurrency * 0.25));
|
|
460
|
+
this.queue.start();
|
|
461
|
+
}, 3000);
|
|
462
|
+
} else if (memStatus === "warning") {
|
|
463
|
+
const newConcurrency = Math.max(10, Math.floor(this.originalConcurrency * 0.5));
|
|
464
|
+
if (this.queue.concurrency > newConcurrency) {
|
|
465
|
+
this.queue.concurrency = newConcurrency;
|
|
466
|
+
if (this.config.debug) {
|
|
467
|
+
const report = this.memoryMonitor.getReport();
|
|
468
|
+
console.warn(`[Crawler] High memory (${report.usagePercent}%), reducing concurrency to ${newConcurrency}`);
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
} else {
|
|
472
|
+
if (this.queue.concurrency < this.originalConcurrency) {
|
|
473
|
+
this.queue.concurrency = Math.min(this.originalConcurrency, this.queue.concurrency + 10);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
}, 30000);
|
|
477
|
+
if (this.cleanupInterval.unref) {
|
|
478
|
+
this.cleanupInterval.unref();
|
|
479
|
+
}
|
|
480
|
+
if (this.config.enableNavigationHistory) {
|
|
481
|
+
this.startAutoCheckpoint();
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
startAutoCheckpoint() {
|
|
485
|
+
if (this.checkpointInterval)
|
|
486
|
+
return;
|
|
487
|
+
const CHECKPOINT_INTERVAL = 5 * 60 * 1000;
|
|
488
|
+
this.checkpointInterval = setInterval(async () => {
|
|
489
|
+
if (this.shutdownRequested || this.isDestroyed)
|
|
490
|
+
return;
|
|
491
|
+
try {
|
|
492
|
+
await this.saveCheckpoint();
|
|
493
|
+
} catch (error) {
|
|
494
|
+
if (this.config.debug) {
|
|
495
|
+
console.error("[Crawler] Checkpoint save failed:", error);
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
}, CHECKPOINT_INTERVAL);
|
|
499
|
+
if (this.checkpointInterval.unref) {
|
|
500
|
+
this.checkpointInterval.unref();
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
async saveCheckpoint() {
|
|
504
|
+
if (!this.navigationHistory || !this.currentSession)
|
|
505
|
+
return;
|
|
506
|
+
const now = Date.now();
|
|
507
|
+
if (now - this.lastCheckpointTime < 60000)
|
|
508
|
+
return;
|
|
509
|
+
try {
|
|
510
|
+
await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
|
|
511
|
+
urlsVisited: this.crawlStats.urlsVisited,
|
|
512
|
+
urlsQueued: this.crawlStats.urlsQueued,
|
|
513
|
+
urlsFailed: this.crawlStats.urlsFailed
|
|
514
|
+
});
|
|
515
|
+
this.lastCheckpointTime = now;
|
|
516
|
+
if (this.config.debug) {
|
|
517
|
+
console.log(`[Crawler] Checkpoint saved: ${this.crawlStats.urlsVisited} visited, ${this.crawlStats.urlsFailed} failed`);
|
|
518
|
+
}
|
|
519
|
+
} catch (error) {
|
|
520
|
+
if (this.config.debug) {
|
|
521
|
+
console.error("[Crawler] Failed to save checkpoint:", error);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
356
524
|
}
|
|
357
525
|
async saveUrl(url) {
|
|
358
526
|
await this.waitForStorage();
|
|
@@ -696,14 +864,14 @@ class Crawler {
|
|
|
696
864
|
return this;
|
|
697
865
|
}
|
|
698
866
|
getCollectedData() {
|
|
699
|
-
return
|
|
867
|
+
return this.collectedData.toArray();
|
|
700
868
|
}
|
|
701
869
|
clearCollectedData() {
|
|
702
|
-
this.collectedData
|
|
870
|
+
this.collectedData.clear();
|
|
703
871
|
return this;
|
|
704
872
|
}
|
|
705
873
|
async exportData(filePath, format = "json") {
|
|
706
|
-
const data = this.collectedData;
|
|
874
|
+
const data = this.collectedData.toArray();
|
|
707
875
|
if (data.length === 0) {
|
|
708
876
|
if (this.config.debug) {
|
|
709
877
|
console.warn("[Crawler] No data to export");
|
|
@@ -764,20 +932,36 @@ class Crawler {
|
|
|
764
932
|
getStats() {
|
|
765
933
|
return { ...this.crawlStats };
|
|
766
934
|
}
|
|
935
|
+
getHealthSnapshot() {
|
|
936
|
+
return this.healthMetrics.getSnapshot(this.queue.size, this.queue.pending);
|
|
937
|
+
}
|
|
938
|
+
isHealthy(options) {
|
|
939
|
+
return this.healthMetrics.isHealthy(options);
|
|
940
|
+
}
|
|
941
|
+
getPrometheusMetrics(prefix = "crawler") {
|
|
942
|
+
return this.healthMetrics.toPrometheusFormat(prefix);
|
|
943
|
+
}
|
|
767
944
|
async triggerStartHandlers() {
|
|
768
945
|
if (this.crawlStarted)
|
|
769
946
|
return;
|
|
947
|
+
if (this.startHandlersPromise) {
|
|
948
|
+
return this.startHandlersPromise;
|
|
949
|
+
}
|
|
770
950
|
this.crawlStarted = true;
|
|
771
|
-
this.
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
951
|
+
this.startHandlersPromise = (async () => {
|
|
952
|
+
this.crawlStats.startTime = Date.now();
|
|
953
|
+
this.startPeriodicCleanup();
|
|
954
|
+
for (const handler of this.startHandlers) {
|
|
955
|
+
try {
|
|
956
|
+
this.queue.add(() => handler());
|
|
957
|
+
} catch (error) {
|
|
958
|
+
if (this.config.debug) {
|
|
959
|
+
console.error("[Crawler] onStart handler error:", error);
|
|
960
|
+
}
|
|
778
961
|
}
|
|
779
962
|
}
|
|
780
|
-
}
|
|
963
|
+
})();
|
|
964
|
+
return this.startHandlersPromise;
|
|
781
965
|
}
|
|
782
966
|
async triggerFinishHandlers() {
|
|
783
967
|
this.crawlStats.endTime = Date.now();
|
|
@@ -874,53 +1058,55 @@ class Crawler {
|
|
|
874
1058
|
}
|
|
875
1059
|
if (url.includes(`/www.yellowpages.com/search?`))
|
|
876
1060
|
console.log("Visiting: ", url);
|
|
1061
|
+
this.crawlStarted = true;
|
|
877
1062
|
if (deepEmailFinder) {
|
|
878
|
-
this.
|
|
879
|
-
const p = this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
|
|
880
|
-
this.pendingExecutions.add(p);
|
|
881
|
-
p.finally(() => this.pendingExecutions.delete(p));
|
|
1063
|
+
this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
|
|
882
1064
|
return this;
|
|
883
1065
|
}
|
|
884
|
-
this.
|
|
885
|
-
const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
|
|
886
|
-
this.pendingExecutions.add(p);
|
|
887
|
-
p.finally(() => this.pendingExecutions.delete(p));
|
|
1066
|
+
this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
|
|
888
1067
|
return this;
|
|
889
1068
|
}
|
|
890
1069
|
async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata) {
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
1070
|
+
this.queue.add(async () => {
|
|
1071
|
+
await this.waitForStorage();
|
|
1072
|
+
if (this.isCacheEnabled) {
|
|
1073
|
+
await this.waitForCache();
|
|
1074
|
+
}
|
|
1075
|
+
if (this.config.enableNavigationHistory) {
|
|
1076
|
+
await this.waitForNavigationHistory();
|
|
1077
|
+
}
|
|
1078
|
+
await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata);
|
|
1079
|
+
}).catch((err) => {
|
|
1080
|
+
if (this.config.debug)
|
|
1081
|
+
console.warn("[Crawler] execute() task error:", err?.message);
|
|
1082
|
+
});
|
|
901
1083
|
}
|
|
902
1084
|
async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
1085
|
+
this.scraperQueue.add(async () => {
|
|
1086
|
+
await this.waitForStorage();
|
|
1087
|
+
if (this.isCacheEnabled) {
|
|
1088
|
+
await this.waitForCache();
|
|
1089
|
+
}
|
|
1090
|
+
if (this.config.enableNavigationHistory) {
|
|
1091
|
+
await this.waitForNavigationHistory();
|
|
1092
|
+
}
|
|
1093
|
+
await this.leadsFinder.parseExternalWebsite(url, method, body, {
|
|
1094
|
+
httpConfig: options,
|
|
1095
|
+
saveCache: this.saveCache.bind(this),
|
|
1096
|
+
saveUrl: this.saveUrl.bind(this),
|
|
1097
|
+
getCache: this.getCache.bind(this),
|
|
1098
|
+
hasUrlInCache: this.hasUrlInCache.bind(this),
|
|
1099
|
+
onEmailDiscovered: this.emailDiscoveredEvents,
|
|
1100
|
+
onEmails: this.emailLeadsEvents,
|
|
1101
|
+
queue: this.scraperQueue,
|
|
1102
|
+
depth: 1,
|
|
1103
|
+
allowCrossDomainTravel: true,
|
|
1104
|
+
emailMetadata
|
|
1105
|
+
}, forceRevisit, true);
|
|
1106
|
+
}).catch((err) => {
|
|
1107
|
+
if (this.config.debug)
|
|
1108
|
+
console.warn("[Crawler] execute2() task error:", err?.message);
|
|
1109
|
+
});
|
|
924
1110
|
}
|
|
925
1111
|
async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl, skipCache, emailMetadata) {
|
|
926
1112
|
try {
|
|
@@ -948,6 +1134,14 @@ class Crawler {
|
|
|
948
1134
|
}
|
|
949
1135
|
const requestStartTime = Date.now();
|
|
950
1136
|
const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
|
|
1137
|
+
if (!response) {
|
|
1138
|
+
this.crawlStats.urlsFailed++;
|
|
1139
|
+
this.healthMetrics.recordRequest(Date.now() - requestStartTime, false);
|
|
1140
|
+
if (this.config.debug) {
|
|
1141
|
+
console.log(`[Crawler] Request failed for ${url}`);
|
|
1142
|
+
}
|
|
1143
|
+
return;
|
|
1144
|
+
}
|
|
951
1145
|
if (!cache) {
|
|
952
1146
|
const responseTime = Date.now() - requestStartTime;
|
|
953
1147
|
this.calculateAutoThrottleDelay(domain, responseTime);
|
|
@@ -973,6 +1167,8 @@ class Crawler {
|
|
|
973
1167
|
}
|
|
974
1168
|
}
|
|
975
1169
|
this.crawlStats.urlsVisited++;
|
|
1170
|
+
const finalResponseTime = cache ? 0 : Date.now() - requestStartTime;
|
|
1171
|
+
this.healthMetrics.recordRequest(finalResponseTime, true);
|
|
976
1172
|
if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
|
|
977
1173
|
await this.triggerRedirectHandlers({
|
|
978
1174
|
originalUrl: url,
|
|
@@ -992,7 +1188,7 @@ class Crawler {
|
|
|
992
1188
|
});
|
|
993
1189
|
if (res.contentType && res.contentType.includes("/json")) {
|
|
994
1190
|
if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
|
|
995
|
-
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.
|
|
1191
|
+
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
|
|
996
1192
|
}
|
|
997
1193
|
for (let i = 0;i < this.jsonEvents.length; i++) {
|
|
998
1194
|
const event = this.jsonEvents[i];
|
|
@@ -1007,7 +1203,7 @@ class Crawler {
|
|
|
1007
1203
|
if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
|
|
1008
1204
|
return;
|
|
1009
1205
|
if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
|
|
1010
|
-
this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.
|
|
1206
|
+
this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
|
|
1011
1207
|
}
|
|
1012
1208
|
const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
|
|
1013
1209
|
document.URL = res.finalUrl;
|
|
@@ -1072,23 +1268,33 @@ class Crawler {
|
|
|
1072
1268
|
}
|
|
1073
1269
|
}
|
|
1074
1270
|
async waitForAll() {
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1271
|
+
const MIN_DELAY = 50;
|
|
1272
|
+
const MAX_DELAY = 500;
|
|
1273
|
+
let currentDelay = MIN_DELAY;
|
|
1274
|
+
let consecutiveIdleChecks = 0;
|
|
1275
|
+
const REQUIRED_IDLE_CHECKS = 3;
|
|
1080
1276
|
while (true) {
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1277
|
+
await this.queue.onIdle();
|
|
1278
|
+
await new Promise((resolve) => setTimeout(resolve, currentDelay));
|
|
1279
|
+
const scraperSize = this.scraperQueue.size;
|
|
1280
|
+
const scraperPending = this.scraperQueue.pending;
|
|
1281
|
+
if (scraperSize === 0 && scraperPending === 0) {
|
|
1282
|
+
const queueSize = this.queue.size;
|
|
1283
|
+
const queuePending = this.queue.pending;
|
|
1284
|
+
if (queueSize === 0 && queuePending === 0) {
|
|
1285
|
+
consecutiveIdleChecks++;
|
|
1286
|
+
if (consecutiveIdleChecks >= REQUIRED_IDLE_CHECKS) {
|
|
1287
|
+
break;
|
|
1288
|
+
}
|
|
1289
|
+
currentDelay = Math.max(MIN_DELAY, currentDelay / 2);
|
|
1290
|
+
} else {
|
|
1291
|
+
consecutiveIdleChecks = 0;
|
|
1292
|
+
currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
|
|
1293
|
+
}
|
|
1294
|
+
} else {
|
|
1295
|
+
consecutiveIdleChecks = 0;
|
|
1296
|
+
currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
|
|
1297
|
+
await this.scraperQueue.onIdle();
|
|
1092
1298
|
}
|
|
1093
1299
|
}
|
|
1094
1300
|
await this.triggerFinishHandlers();
|
|
@@ -1112,7 +1318,18 @@ class Crawler {
|
|
|
1112
1318
|
if (this.isDestroyed)
|
|
1113
1319
|
return;
|
|
1114
1320
|
this.isDestroyed = true;
|
|
1321
|
+
this.removeShutdownHandlers();
|
|
1322
|
+
if (this.cleanupInterval) {
|
|
1323
|
+
clearInterval(this.cleanupInterval);
|
|
1324
|
+
this.cleanupInterval = undefined;
|
|
1325
|
+
}
|
|
1326
|
+
if (this.checkpointInterval) {
|
|
1327
|
+
clearInterval(this.checkpointInterval);
|
|
1328
|
+
this.checkpointInterval = undefined;
|
|
1329
|
+
}
|
|
1330
|
+
this.memoryMonitor.destroy();
|
|
1115
1331
|
this.queue.destroy();
|
|
1332
|
+
this.scraperQueue.destroy();
|
|
1116
1333
|
this.config.destroyLimiters();
|
|
1117
1334
|
this.events.length = 0;
|
|
1118
1335
|
this.jsonEvents.length = 0;
|
|
@@ -1121,6 +1338,10 @@ class Crawler {
|
|
|
1121
1338
|
this.rawResponseEvents.length = 0;
|
|
1122
1339
|
this.emailDiscoveredEvents.length = 0;
|
|
1123
1340
|
this.emailLeadsEvents.length = 0;
|
|
1341
|
+
this.domainResponseTimes.clear();
|
|
1342
|
+
this.domainCurrentDelay.clear();
|
|
1343
|
+
this.urlDepthMap.clear();
|
|
1344
|
+
this.collectedData.clear();
|
|
1124
1345
|
await this.close();
|
|
1125
1346
|
resetGlobalAgentPool();
|
|
1126
1347
|
}
|