rezo 1.0.72 → 1.0.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +13 -2
- package/dist/adapters/entries/fetch.d.ts +13 -2
- package/dist/adapters/entries/http.d.ts +13 -2
- package/dist/adapters/entries/http2.d.ts +13 -2
- package/dist/adapters/entries/react-native.d.ts +13 -2
- package/dist/adapters/entries/xhr.d.ts +13 -2
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +320 -89
- package/dist/crawler/crawler.js +320 -89
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler/plugin/capped-array.cjs +1 -0
- package/dist/crawler/plugin/capped-array.js +1 -0
- package/dist/crawler/plugin/capped-map.cjs +1 -0
- package/dist/crawler/plugin/capped-map.js +1 -0
- package/dist/crawler/plugin/file-cacher.cjs +20 -18
- package/dist/crawler/plugin/file-cacher.js +20 -18
- package/dist/crawler/plugin/health-metrics.cjs +2 -0
- package/dist/crawler/plugin/health-metrics.js +2 -0
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler/plugin/index.js +1 -1
- package/dist/crawler/plugin/memory-monitor.cjs +1 -0
- package/dist/crawler/plugin/memory-monitor.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +5 -5
- package/dist/crawler/plugin/navigation-history.js +3 -3
- package/dist/crawler/plugin/result-stream.cjs +5 -0
- package/dist/crawler/plugin/result-stream.js +5 -0
- package/dist/crawler/plugin/sqlite-utils.cjs +1 -0
- package/dist/crawler/plugin/sqlite-utils.js +1 -0
- package/dist/crawler/plugin/url-store.cjs +5 -5
- package/dist/crawler/plugin/url-store.js +5 -5
- package/dist/crawler/scraper.cjs +1 -1
- package/dist/crawler/scraper.js +1 -1
- package/dist/crawler.d.ts +152 -25
- package/dist/entries/crawler.cjs +4 -4
- package/dist/errors/rezo-error.cjs +3 -72
- package/dist/errors/rezo-error.js +3 -72
- package/dist/index.cjs +30 -30
- package/dist/index.d.ts +13 -2
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +13 -2
- package/dist/platform/bun.d.ts +13 -2
- package/dist/platform/deno.d.ts +13 -2
- package/dist/platform/node.d.ts +13 -2
- package/dist/platform/react-native.d.ts +13 -2
- package/dist/platform/worker.d.ts +13 -2
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +58 -13
- package/dist/queue/queue.js +58 -13
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +37 -0
- package/dist/utils/agent-pool.js +37 -0
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/index.cjs +49 -49
- package/dist/wget/index.d.ts +12 -1
- package/package.json +1 -1
package/dist/crawler/crawler.cjs
CHANGED
|
@@ -3,9 +3,13 @@ const { FileCacher } = require('./plugin/file-cacher.cjs');
|
|
|
3
3
|
const { UrlStore } = require('./plugin/url-store.cjs');
|
|
4
4
|
const { NavigationHistory } = require('./plugin/navigation-history.cjs');
|
|
5
5
|
const { RobotsTxt } = require('./plugin/robots-txt.cjs');
|
|
6
|
+
const { MemoryMonitor } = require('./plugin/memory-monitor.cjs');
|
|
7
|
+
const { HealthMetrics } = require('./plugin/health-metrics.cjs');
|
|
8
|
+
const { CappedMap } = require('./plugin/capped-map.cjs');
|
|
9
|
+
const { CappedArray } = require('./plugin/capped-array.cjs');
|
|
6
10
|
const { parseHTML } = require("linkedom");
|
|
7
11
|
const path = require("node:path");
|
|
8
|
-
const
|
|
12
|
+
const rezo = require('../adapters/entries/http.cjs');
|
|
9
13
|
const { RezoQueue } = require('../queue/queue.cjs');
|
|
10
14
|
const { Scraper } = require('./scraper.cjs');
|
|
11
15
|
const { CrawlerOptions } = require('./crawler-options.cjs');
|
|
@@ -43,6 +47,7 @@ class Crawler {
|
|
|
43
47
|
emailLeadsEvents = [];
|
|
44
48
|
cacher = null;
|
|
45
49
|
queue;
|
|
50
|
+
scraperQueue;
|
|
46
51
|
isCacheEnabled;
|
|
47
52
|
config;
|
|
48
53
|
urlStorage;
|
|
@@ -56,13 +61,12 @@ class Crawler {
|
|
|
56
61
|
navigationHistoryInitPromise = null;
|
|
57
62
|
adapterExecutor = null;
|
|
58
63
|
adapterType;
|
|
59
|
-
pendingExecutions = new Set;
|
|
60
|
-
pendingVisitCount = 0;
|
|
61
64
|
isDestroyed = false;
|
|
62
|
-
|
|
65
|
+
shutdownRequested = false;
|
|
66
|
+
queueOptions = { concurrency: 100 };
|
|
63
67
|
robotsTxt;
|
|
64
|
-
domainResponseTimes = new
|
|
65
|
-
domainCurrentDelay = new
|
|
68
|
+
domainResponseTimes = new CappedMap({ maxSize: 500 });
|
|
69
|
+
domainCurrentDelay = new CappedMap({ maxSize: 500 });
|
|
66
70
|
crawlStats = {
|
|
67
71
|
urlsVisited: 0,
|
|
68
72
|
urlsQueued: 0,
|
|
@@ -70,19 +74,40 @@ class Crawler {
|
|
|
70
74
|
startTime: 0,
|
|
71
75
|
currentDepth: 0
|
|
72
76
|
};
|
|
73
|
-
urlDepthMap = new
|
|
77
|
+
urlDepthMap = new CappedMap({ maxSize: 50000 });
|
|
78
|
+
cleanupInterval;
|
|
79
|
+
checkpointInterval;
|
|
80
|
+
lastCheckpointTime = 0;
|
|
81
|
+
memoryMonitor;
|
|
82
|
+
healthMetrics;
|
|
83
|
+
originalConcurrency = 100;
|
|
84
|
+
shutdownHandler = null;
|
|
74
85
|
startHandlers = [];
|
|
75
86
|
finishHandlers = [];
|
|
76
87
|
redirectHandlers = [];
|
|
77
|
-
collectedData =
|
|
88
|
+
collectedData = new CappedArray({
|
|
89
|
+
maxSize: 1e5,
|
|
90
|
+
evictionRatio: 0.1,
|
|
91
|
+
onEviction: (evicted, remaining) => {
|
|
92
|
+
console.warn(`[Crawler] collectedData auto-evicted ${evicted.length} oldest entries. ${remaining} entries remaining. Consider using exportData() more frequently.`);
|
|
93
|
+
}
|
|
94
|
+
});
|
|
78
95
|
crawlStarted = false;
|
|
79
|
-
|
|
96
|
+
startHandlersPromise = null;
|
|
97
|
+
constructor(crawlerOptions, http = rezo.create()) {
|
|
80
98
|
this.http = http;
|
|
81
|
-
this.queue = new RezoQueue({
|
|
82
|
-
concurrency: 1000
|
|
83
|
-
});
|
|
84
99
|
this.config = new CrawlerOptions(crawlerOptions);
|
|
85
100
|
this.adapterType = this.config.adapter;
|
|
101
|
+
const concurrency = this.config.concurrency;
|
|
102
|
+
this.queue = new RezoQueue({
|
|
103
|
+
concurrency
|
|
104
|
+
});
|
|
105
|
+
this.originalConcurrency = concurrency;
|
|
106
|
+
this.scraperQueue = new RezoQueue({
|
|
107
|
+
concurrency: this.config.scraperConcurrency
|
|
108
|
+
});
|
|
109
|
+
this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
|
|
110
|
+
this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
|
|
86
111
|
const enableCache = this.config.enableCache;
|
|
87
112
|
this.isCacheEnabled = enableCache;
|
|
88
113
|
if (enableCache) {
|
|
@@ -94,7 +119,7 @@ class Crawler {
|
|
|
94
119
|
FileCacher.create({
|
|
95
120
|
cacheDir: dbUrl,
|
|
96
121
|
ttl: cacheTTL,
|
|
97
|
-
|
|
122
|
+
maxEntries: 1e5
|
|
98
123
|
}).then((storage) => {
|
|
99
124
|
this.cacher = storage;
|
|
100
125
|
this.isCacheReady = true;
|
|
@@ -151,6 +176,58 @@ class Crawler {
|
|
|
151
176
|
if (this.config.baseUrl) {
|
|
152
177
|
this.urlDepthMap.set(this.config.baseUrl, 0);
|
|
153
178
|
}
|
|
179
|
+
if (this.config.enableSignalHandlers) {
|
|
180
|
+
this.registerShutdownHandlers();
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
registerShutdownHandlers() {
|
|
184
|
+
if (this.shutdownHandler)
|
|
185
|
+
return;
|
|
186
|
+
this.shutdownHandler = () => this.gracefulShutdown();
|
|
187
|
+
process.on("SIGINT", this.shutdownHandler);
|
|
188
|
+
process.on("SIGTERM", this.shutdownHandler);
|
|
189
|
+
}
|
|
190
|
+
removeShutdownHandlers() {
|
|
191
|
+
if (this.shutdownHandler) {
|
|
192
|
+
process.off("SIGINT", this.shutdownHandler);
|
|
193
|
+
process.off("SIGTERM", this.shutdownHandler);
|
|
194
|
+
this.shutdownHandler = null;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
async gracefulShutdown() {
|
|
198
|
+
if (this.shutdownRequested || this.isDestroyed)
|
|
199
|
+
return;
|
|
200
|
+
this.shutdownRequested = true;
|
|
201
|
+
console.log(`
|
|
202
|
+
[Crawler] Shutdown requested, finishing current tasks...`);
|
|
203
|
+
this.queue.pause();
|
|
204
|
+
this.scraperQueue.pause();
|
|
205
|
+
const timeoutPromise = new Promise((resolve) => {
|
|
206
|
+
setTimeout(() => {
|
|
207
|
+
console.log("[Crawler] Shutdown timeout (5s), forcing exit...");
|
|
208
|
+
resolve();
|
|
209
|
+
}, 5000);
|
|
210
|
+
});
|
|
211
|
+
await Promise.race([
|
|
212
|
+
Promise.all([this.queue.onIdle(), this.scraperQueue.onIdle()]),
|
|
213
|
+
timeoutPromise
|
|
214
|
+
]);
|
|
215
|
+
if (this.navigationHistory && this.currentSession) {
|
|
216
|
+
try {
|
|
217
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
|
|
218
|
+
await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
|
|
219
|
+
urlsVisited: this.crawlStats.urlsVisited,
|
|
220
|
+
urlsQueued: this.queue.size,
|
|
221
|
+
urlsFailed: this.crawlStats.urlsFailed
|
|
222
|
+
});
|
|
223
|
+
console.log(`[Crawler] Session saved: ${this.currentSession.sessionId}`);
|
|
224
|
+
} catch (err) {
|
|
225
|
+
console.warn("[Crawler] Failed to save session state:", err);
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
await this.destroy();
|
|
229
|
+
console.log("[Crawler] Graceful shutdown complete");
|
|
230
|
+
process.exit(0);
|
|
154
231
|
}
|
|
155
232
|
async initializeAdapter() {
|
|
156
233
|
try {
|
|
@@ -166,7 +243,6 @@ class Crawler {
|
|
|
166
243
|
if (!this.isDestroyed)
|
|
167
244
|
return;
|
|
168
245
|
this.queue = new RezoQueue(this.queueOptions);
|
|
169
|
-
this.pendingExecutions.clear();
|
|
170
246
|
this.isDestroyed = false;
|
|
171
247
|
if (this.config.debug) {
|
|
172
248
|
console.log("[Crawler] Restored from destroyed state");
|
|
@@ -342,17 +418,110 @@ class Crawler {
|
|
|
342
418
|
handler(data);
|
|
343
419
|
});
|
|
344
420
|
}
|
|
345
|
-
async waitForCache() {
|
|
421
|
+
async waitForCache(timeoutMs = 30000) {
|
|
346
422
|
if (this.isCacheReady)
|
|
347
423
|
return;
|
|
348
|
-
|
|
349
|
-
|
|
424
|
+
const start = Date.now();
|
|
425
|
+
while (!this.isCacheReady) {
|
|
426
|
+
if (Date.now() - start > timeoutMs) {
|
|
427
|
+
console.warn("[Crawler] Cache initialization timeout, continuing without cache");
|
|
428
|
+
this.isCacheReady = true;
|
|
429
|
+
return;
|
|
430
|
+
}
|
|
431
|
+
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
432
|
+
}
|
|
350
433
|
}
|
|
351
|
-
async waitForStorage() {
|
|
434
|
+
async waitForStorage(timeoutMs = 30000) {
|
|
352
435
|
if (this.isStorageReady)
|
|
353
436
|
return;
|
|
354
|
-
|
|
355
|
-
|
|
437
|
+
const start = Date.now();
|
|
438
|
+
while (!this.isStorageReady) {
|
|
439
|
+
if (Date.now() - start > timeoutMs) {
|
|
440
|
+
console.warn("[Crawler] Storage initialization timeout, continuing without URL tracking");
|
|
441
|
+
this.isStorageReady = true;
|
|
442
|
+
return;
|
|
443
|
+
}
|
|
444
|
+
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
startPeriodicCleanup() {
|
|
448
|
+
if (this.cleanupInterval)
|
|
449
|
+
return;
|
|
450
|
+
this.cleanupInterval = setInterval(() => {
|
|
451
|
+
const memStatus = this.memoryMonitor.check();
|
|
452
|
+
if (memStatus === "critical") {
|
|
453
|
+
this.queue.pause();
|
|
454
|
+
this.memoryMonitor.forceGC();
|
|
455
|
+
if (this.config.debug) {
|
|
456
|
+
const report = this.memoryMonitor.getReport();
|
|
457
|
+
console.warn(`[Crawler] CRITICAL memory (${report.usagePercent}%), pausing...`);
|
|
458
|
+
}
|
|
459
|
+
setTimeout(() => {
|
|
460
|
+
this.queue.concurrency = Math.max(5, Math.floor(this.originalConcurrency * 0.25));
|
|
461
|
+
this.queue.start();
|
|
462
|
+
}, 3000);
|
|
463
|
+
} else if (memStatus === "warning") {
|
|
464
|
+
const newConcurrency = Math.max(10, Math.floor(this.originalConcurrency * 0.5));
|
|
465
|
+
if (this.queue.concurrency > newConcurrency) {
|
|
466
|
+
this.queue.concurrency = newConcurrency;
|
|
467
|
+
if (this.config.debug) {
|
|
468
|
+
const report = this.memoryMonitor.getReport();
|
|
469
|
+
console.warn(`[Crawler] High memory (${report.usagePercent}%), reducing concurrency to ${newConcurrency}`);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
} else {
|
|
473
|
+
if (this.queue.concurrency < this.originalConcurrency) {
|
|
474
|
+
this.queue.concurrency = Math.min(this.originalConcurrency, this.queue.concurrency + 10);
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
}, 30000);
|
|
478
|
+
if (this.cleanupInterval.unref) {
|
|
479
|
+
this.cleanupInterval.unref();
|
|
480
|
+
}
|
|
481
|
+
if (this.config.enableNavigationHistory) {
|
|
482
|
+
this.startAutoCheckpoint();
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
startAutoCheckpoint() {
|
|
486
|
+
if (this.checkpointInterval)
|
|
487
|
+
return;
|
|
488
|
+
const CHECKPOINT_INTERVAL = 5 * 60 * 1000;
|
|
489
|
+
this.checkpointInterval = setInterval(async () => {
|
|
490
|
+
if (this.shutdownRequested || this.isDestroyed)
|
|
491
|
+
return;
|
|
492
|
+
try {
|
|
493
|
+
await this.saveCheckpoint();
|
|
494
|
+
} catch (error) {
|
|
495
|
+
if (this.config.debug) {
|
|
496
|
+
console.error("[Crawler] Checkpoint save failed:", error);
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
}, CHECKPOINT_INTERVAL);
|
|
500
|
+
if (this.checkpointInterval.unref) {
|
|
501
|
+
this.checkpointInterval.unref();
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
async saveCheckpoint() {
|
|
505
|
+
if (!this.navigationHistory || !this.currentSession)
|
|
506
|
+
return;
|
|
507
|
+
const now = Date.now();
|
|
508
|
+
if (now - this.lastCheckpointTime < 60000)
|
|
509
|
+
return;
|
|
510
|
+
try {
|
|
511
|
+
await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
|
|
512
|
+
urlsVisited: this.crawlStats.urlsVisited,
|
|
513
|
+
urlsQueued: this.crawlStats.urlsQueued,
|
|
514
|
+
urlsFailed: this.crawlStats.urlsFailed
|
|
515
|
+
});
|
|
516
|
+
this.lastCheckpointTime = now;
|
|
517
|
+
if (this.config.debug) {
|
|
518
|
+
console.log(`[Crawler] Checkpoint saved: ${this.crawlStats.urlsVisited} visited, ${this.crawlStats.urlsFailed} failed`);
|
|
519
|
+
}
|
|
520
|
+
} catch (error) {
|
|
521
|
+
if (this.config.debug) {
|
|
522
|
+
console.error("[Crawler] Failed to save checkpoint:", error);
|
|
523
|
+
}
|
|
524
|
+
}
|
|
356
525
|
}
|
|
357
526
|
async saveUrl(url) {
|
|
358
527
|
await this.waitForStorage();
|
|
@@ -696,14 +865,14 @@ class Crawler {
|
|
|
696
865
|
return this;
|
|
697
866
|
}
|
|
698
867
|
getCollectedData() {
|
|
699
|
-
return
|
|
868
|
+
return this.collectedData.toArray();
|
|
700
869
|
}
|
|
701
870
|
clearCollectedData() {
|
|
702
|
-
this.collectedData
|
|
871
|
+
this.collectedData.clear();
|
|
703
872
|
return this;
|
|
704
873
|
}
|
|
705
874
|
async exportData(filePath, format = "json") {
|
|
706
|
-
const data = this.collectedData;
|
|
875
|
+
const data = this.collectedData.toArray();
|
|
707
876
|
if (data.length === 0) {
|
|
708
877
|
if (this.config.debug) {
|
|
709
878
|
console.warn("[Crawler] No data to export");
|
|
@@ -764,20 +933,36 @@ class Crawler {
|
|
|
764
933
|
getStats() {
|
|
765
934
|
return { ...this.crawlStats };
|
|
766
935
|
}
|
|
936
|
+
getHealthSnapshot() {
|
|
937
|
+
return this.healthMetrics.getSnapshot(this.queue.size, this.queue.pending);
|
|
938
|
+
}
|
|
939
|
+
isHealthy(options) {
|
|
940
|
+
return this.healthMetrics.isHealthy(options);
|
|
941
|
+
}
|
|
942
|
+
getPrometheusMetrics(prefix = "crawler") {
|
|
943
|
+
return this.healthMetrics.toPrometheusFormat(prefix);
|
|
944
|
+
}
|
|
767
945
|
async triggerStartHandlers() {
|
|
768
946
|
if (this.crawlStarted)
|
|
769
947
|
return;
|
|
948
|
+
if (this.startHandlersPromise) {
|
|
949
|
+
return this.startHandlersPromise;
|
|
950
|
+
}
|
|
770
951
|
this.crawlStarted = true;
|
|
771
|
-
this.
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
952
|
+
this.startHandlersPromise = (async () => {
|
|
953
|
+
this.crawlStats.startTime = Date.now();
|
|
954
|
+
this.startPeriodicCleanup();
|
|
955
|
+
for (const handler of this.startHandlers) {
|
|
956
|
+
try {
|
|
957
|
+
this.queue.add(() => handler());
|
|
958
|
+
} catch (error) {
|
|
959
|
+
if (this.config.debug) {
|
|
960
|
+
console.error("[Crawler] onStart handler error:", error);
|
|
961
|
+
}
|
|
778
962
|
}
|
|
779
963
|
}
|
|
780
|
-
}
|
|
964
|
+
})();
|
|
965
|
+
return this.startHandlersPromise;
|
|
781
966
|
}
|
|
782
967
|
async triggerFinishHandlers() {
|
|
783
968
|
this.crawlStats.endTime = Date.now();
|
|
@@ -872,55 +1057,55 @@ class Crawler {
|
|
|
872
1057
|
const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
|
|
873
1058
|
this.addToNavigationQueue(url, method, body, headersObj);
|
|
874
1059
|
}
|
|
875
|
-
|
|
876
|
-
console.log("Visiting: ", url);
|
|
1060
|
+
this.crawlStarted = true;
|
|
877
1061
|
if (deepEmailFinder) {
|
|
878
|
-
this.
|
|
879
|
-
const p = this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
|
|
880
|
-
this.pendingExecutions.add(p);
|
|
881
|
-
p.finally(() => this.pendingExecutions.delete(p));
|
|
1062
|
+
this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
|
|
882
1063
|
return this;
|
|
883
1064
|
}
|
|
884
|
-
this.
|
|
885
|
-
const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
|
|
886
|
-
this.pendingExecutions.add(p);
|
|
887
|
-
p.finally(() => this.pendingExecutions.delete(p));
|
|
1065
|
+
this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
|
|
888
1066
|
return this;
|
|
889
1067
|
}
|
|
890
1068
|
async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata) {
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
1069
|
+
this.queue.add(async () => {
|
|
1070
|
+
await this.waitForStorage();
|
|
1071
|
+
if (this.isCacheEnabled) {
|
|
1072
|
+
await this.waitForCache();
|
|
1073
|
+
}
|
|
1074
|
+
if (this.config.enableNavigationHistory) {
|
|
1075
|
+
await this.waitForNavigationHistory();
|
|
1076
|
+
}
|
|
1077
|
+
await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata);
|
|
1078
|
+
}).catch((err) => {
|
|
1079
|
+
if (this.config.debug)
|
|
1080
|
+
console.warn("[Crawler] execute() task error:", err?.message);
|
|
1081
|
+
});
|
|
901
1082
|
}
|
|
902
1083
|
async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
1084
|
+
this.scraperQueue.add(async () => {
|
|
1085
|
+
await this.waitForStorage();
|
|
1086
|
+
if (this.isCacheEnabled) {
|
|
1087
|
+
await this.waitForCache();
|
|
1088
|
+
}
|
|
1089
|
+
if (this.config.enableNavigationHistory) {
|
|
1090
|
+
await this.waitForNavigationHistory();
|
|
1091
|
+
}
|
|
1092
|
+
await this.leadsFinder.parseExternalWebsite(url, method, body, {
|
|
1093
|
+
httpConfig: options,
|
|
1094
|
+
saveCache: this.saveCache.bind(this),
|
|
1095
|
+
saveUrl: this.saveUrl.bind(this),
|
|
1096
|
+
getCache: this.getCache.bind(this),
|
|
1097
|
+
hasUrlInCache: this.hasUrlInCache.bind(this),
|
|
1098
|
+
onEmailDiscovered: this.emailDiscoveredEvents,
|
|
1099
|
+
onEmails: this.emailLeadsEvents,
|
|
1100
|
+
queue: this.scraperQueue,
|
|
1101
|
+
depth: 1,
|
|
1102
|
+
allowCrossDomainTravel: true,
|
|
1103
|
+
emailMetadata
|
|
1104
|
+
}, forceRevisit, true);
|
|
1105
|
+
}).catch((err) => {
|
|
1106
|
+
if (this.config.debug)
|
|
1107
|
+
console.warn("[Crawler] execute2() task error:", err?.message);
|
|
1108
|
+
});
|
|
924
1109
|
}
|
|
925
1110
|
async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl, skipCache, emailMetadata) {
|
|
926
1111
|
try {
|
|
@@ -947,7 +1132,15 @@ class Crawler {
|
|
|
947
1132
|
return;
|
|
948
1133
|
}
|
|
949
1134
|
const requestStartTime = Date.now();
|
|
950
|
-
const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) :
|
|
1135
|
+
const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : method === "GET" ? await this.http.get(url, options) : method === "PATCH" ? await this.http.patch(url, body, options) : method === "POST" ? await this.http.post(url, body, options) : await this.http.put(url, body, options);
|
|
1136
|
+
if (!response) {
|
|
1137
|
+
this.crawlStats.urlsFailed++;
|
|
1138
|
+
this.healthMetrics.recordRequest(Date.now() - requestStartTime, false);
|
|
1139
|
+
if (this.config.debug) {
|
|
1140
|
+
console.log(`[Crawler] Request failed for ${url}`);
|
|
1141
|
+
}
|
|
1142
|
+
return;
|
|
1143
|
+
}
|
|
951
1144
|
if (!cache) {
|
|
952
1145
|
const responseTime = Date.now() - requestStartTime;
|
|
953
1146
|
this.calculateAutoThrottleDelay(domain, responseTime);
|
|
@@ -973,6 +1166,8 @@ class Crawler {
|
|
|
973
1166
|
}
|
|
974
1167
|
}
|
|
975
1168
|
this.crawlStats.urlsVisited++;
|
|
1169
|
+
const finalResponseTime = cache ? 0 : Date.now() - requestStartTime;
|
|
1170
|
+
this.healthMetrics.recordRequest(finalResponseTime, true);
|
|
976
1171
|
if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
|
|
977
1172
|
await this.triggerRedirectHandlers({
|
|
978
1173
|
originalUrl: url,
|
|
@@ -992,7 +1187,7 @@ class Crawler {
|
|
|
992
1187
|
});
|
|
993
1188
|
if (res.contentType && res.contentType.includes("/json")) {
|
|
994
1189
|
if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
|
|
995
|
-
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.
|
|
1190
|
+
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
|
|
996
1191
|
}
|
|
997
1192
|
for (let i = 0;i < this.jsonEvents.length; i++) {
|
|
998
1193
|
const event = this.jsonEvents[i];
|
|
@@ -1007,7 +1202,7 @@ class Crawler {
|
|
|
1007
1202
|
if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
|
|
1008
1203
|
return;
|
|
1009
1204
|
if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
|
|
1010
|
-
this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.
|
|
1205
|
+
this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
|
|
1011
1206
|
}
|
|
1012
1207
|
const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
|
|
1013
1208
|
document.URL = res.finalUrl;
|
|
@@ -1072,23 +1267,44 @@ class Crawler {
|
|
|
1072
1267
|
}
|
|
1073
1268
|
}
|
|
1074
1269
|
async waitForAll() {
|
|
1075
|
-
if (
|
|
1076
|
-
await
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
while (true) {
|
|
1081
|
-
while (this.pendingVisitCount > 0) {
|
|
1270
|
+
if (!this.crawlStarted) {
|
|
1271
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
1272
|
+
const maxWaitForStart = 1000;
|
|
1273
|
+
const startWait = Date.now();
|
|
1274
|
+
while (!this.crawlStarted && Date.now() - startWait < maxWaitForStart) {
|
|
1082
1275
|
await new Promise((resolve) => setTimeout(resolve, 10));
|
|
1083
1276
|
}
|
|
1084
|
-
if (this.
|
|
1085
|
-
|
|
1086
|
-
}
|
|
1087
|
-
if (this.queue.size > 0 || this.queue.pending > 0) {
|
|
1088
|
-
await this.queue.onIdle();
|
|
1277
|
+
if (!this.crawlStarted) {
|
|
1278
|
+
return;
|
|
1089
1279
|
}
|
|
1090
|
-
|
|
1091
|
-
|
|
1280
|
+
}
|
|
1281
|
+
const MIN_DELAY = 50;
|
|
1282
|
+
const MAX_DELAY = 500;
|
|
1283
|
+
let currentDelay = MIN_DELAY;
|
|
1284
|
+
let consecutiveIdleChecks = 0;
|
|
1285
|
+
const REQUIRED_IDLE_CHECKS = 3;
|
|
1286
|
+
while (true) {
|
|
1287
|
+
await this.queue.onIdle();
|
|
1288
|
+
await new Promise((resolve) => setTimeout(resolve, currentDelay));
|
|
1289
|
+
const scraperSize = this.scraperQueue.size;
|
|
1290
|
+
const scraperPending = this.scraperQueue.pending;
|
|
1291
|
+
if (scraperSize === 0 && scraperPending === 0) {
|
|
1292
|
+
const queueSize = this.queue.size;
|
|
1293
|
+
const queuePending = this.queue.pending;
|
|
1294
|
+
if (queueSize === 0 && queuePending === 0) {
|
|
1295
|
+
consecutiveIdleChecks++;
|
|
1296
|
+
if (consecutiveIdleChecks >= REQUIRED_IDLE_CHECKS) {
|
|
1297
|
+
break;
|
|
1298
|
+
}
|
|
1299
|
+
currentDelay = Math.max(MIN_DELAY, currentDelay / 2);
|
|
1300
|
+
} else {
|
|
1301
|
+
consecutiveIdleChecks = 0;
|
|
1302
|
+
currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
|
|
1303
|
+
}
|
|
1304
|
+
} else {
|
|
1305
|
+
consecutiveIdleChecks = 0;
|
|
1306
|
+
currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
|
|
1307
|
+
await this.scraperQueue.onIdle();
|
|
1092
1308
|
}
|
|
1093
1309
|
}
|
|
1094
1310
|
await this.triggerFinishHandlers();
|
|
@@ -1112,7 +1328,18 @@ class Crawler {
|
|
|
1112
1328
|
if (this.isDestroyed)
|
|
1113
1329
|
return;
|
|
1114
1330
|
this.isDestroyed = true;
|
|
1331
|
+
this.removeShutdownHandlers();
|
|
1332
|
+
if (this.cleanupInterval) {
|
|
1333
|
+
clearInterval(this.cleanupInterval);
|
|
1334
|
+
this.cleanupInterval = undefined;
|
|
1335
|
+
}
|
|
1336
|
+
if (this.checkpointInterval) {
|
|
1337
|
+
clearInterval(this.checkpointInterval);
|
|
1338
|
+
this.checkpointInterval = undefined;
|
|
1339
|
+
}
|
|
1340
|
+
this.memoryMonitor.destroy();
|
|
1115
1341
|
this.queue.destroy();
|
|
1342
|
+
this.scraperQueue.destroy();
|
|
1116
1343
|
this.config.destroyLimiters();
|
|
1117
1344
|
this.events.length = 0;
|
|
1118
1345
|
this.jsonEvents.length = 0;
|
|
@@ -1121,6 +1348,10 @@ class Crawler {
|
|
|
1121
1348
|
this.rawResponseEvents.length = 0;
|
|
1122
1349
|
this.emailDiscoveredEvents.length = 0;
|
|
1123
1350
|
this.emailLeadsEvents.length = 0;
|
|
1351
|
+
this.domainResponseTimes.clear();
|
|
1352
|
+
this.domainCurrentDelay.clear();
|
|
1353
|
+
this.urlDepthMap.clear();
|
|
1354
|
+
this.collectedData.clear();
|
|
1124
1355
|
await this.close();
|
|
1125
1356
|
resetGlobalAgentPool();
|
|
1126
1357
|
}
|