rezo 1.0.72 → 1.0.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/dist/adapters/entries/curl.d.ts +13 -2
  2. package/dist/adapters/entries/fetch.d.ts +13 -2
  3. package/dist/adapters/entries/http.d.ts +13 -2
  4. package/dist/adapters/entries/http2.d.ts +13 -2
  5. package/dist/adapters/entries/react-native.d.ts +13 -2
  6. package/dist/adapters/entries/xhr.d.ts +13 -2
  7. package/dist/adapters/index.cjs +6 -6
  8. package/dist/cache/index.cjs +9 -9
  9. package/dist/crawler/crawler-options.cjs +1 -1
  10. package/dist/crawler/crawler-options.js +1 -1
  11. package/dist/crawler/crawler.cjs +320 -89
  12. package/dist/crawler/crawler.js +320 -89
  13. package/dist/crawler/index.cjs +40 -40
  14. package/dist/crawler/plugin/capped-array.cjs +1 -0
  15. package/dist/crawler/plugin/capped-array.js +1 -0
  16. package/dist/crawler/plugin/capped-map.cjs +1 -0
  17. package/dist/crawler/plugin/capped-map.js +1 -0
  18. package/dist/crawler/plugin/file-cacher.cjs +20 -18
  19. package/dist/crawler/plugin/file-cacher.js +20 -18
  20. package/dist/crawler/plugin/health-metrics.cjs +2 -0
  21. package/dist/crawler/plugin/health-metrics.js +2 -0
  22. package/dist/crawler/plugin/index.cjs +1 -1
  23. package/dist/crawler/plugin/index.js +1 -1
  24. package/dist/crawler/plugin/memory-monitor.cjs +1 -0
  25. package/dist/crawler/plugin/memory-monitor.js +1 -0
  26. package/dist/crawler/plugin/navigation-history.cjs +5 -5
  27. package/dist/crawler/plugin/navigation-history.js +3 -3
  28. package/dist/crawler/plugin/result-stream.cjs +5 -0
  29. package/dist/crawler/plugin/result-stream.js +5 -0
  30. package/dist/crawler/plugin/sqlite-utils.cjs +1 -0
  31. package/dist/crawler/plugin/sqlite-utils.js +1 -0
  32. package/dist/crawler/plugin/url-store.cjs +5 -5
  33. package/dist/crawler/plugin/url-store.js +5 -5
  34. package/dist/crawler/scraper.cjs +1 -1
  35. package/dist/crawler/scraper.js +1 -1
  36. package/dist/crawler.d.ts +152 -25
  37. package/dist/entries/crawler.cjs +4 -4
  38. package/dist/errors/rezo-error.cjs +3 -72
  39. package/dist/errors/rezo-error.js +3 -72
  40. package/dist/index.cjs +30 -30
  41. package/dist/index.d.ts +13 -2
  42. package/dist/internal/agents/index.cjs +10 -10
  43. package/dist/platform/browser.d.ts +13 -2
  44. package/dist/platform/bun.d.ts +13 -2
  45. package/dist/platform/deno.d.ts +13 -2
  46. package/dist/platform/node.d.ts +13 -2
  47. package/dist/platform/react-native.d.ts +13 -2
  48. package/dist/platform/worker.d.ts +13 -2
  49. package/dist/proxy/index.cjs +4 -4
  50. package/dist/queue/index.cjs +8 -8
  51. package/dist/queue/queue.cjs +58 -13
  52. package/dist/queue/queue.js +58 -13
  53. package/dist/responses/universal/index.cjs +11 -11
  54. package/dist/utils/agent-pool.cjs +37 -0
  55. package/dist/utils/agent-pool.js +37 -0
  56. package/dist/version.cjs +1 -1
  57. package/dist/version.js +1 -1
  58. package/dist/wget/index.cjs +49 -49
  59. package/dist/wget/index.d.ts +12 -1
  60. package/package.json +1 -1
package/dist/crawler/crawler.cjs
@@ -3,9 +3,13 @@ const { FileCacher } = require('./plugin/file-cacher.cjs');
  const { UrlStore } = require('./plugin/url-store.cjs');
  const { NavigationHistory } = require('./plugin/navigation-history.cjs');
  const { RobotsTxt } = require('./plugin/robots-txt.cjs');
+ const { MemoryMonitor } = require('./plugin/memory-monitor.cjs');
+ const { HealthMetrics } = require('./plugin/health-metrics.cjs');
+ const { CappedMap } = require('./plugin/capped-map.cjs');
+ const { CappedArray } = require('./plugin/capped-array.cjs');
  const { parseHTML } = require("linkedom");
  const path = require("node:path");
- const { Rezo } = require('../adapters/entries/http.cjs');
+ const rezo = require('../adapters/entries/http.cjs');
  const { RezoQueue } = require('../queue/queue.cjs');
  const { Scraper } = require('./scraper.cjs');
  const { CrawlerOptions } = require('./crawler-options.cjs');
@@ -43,6 +47,7 @@ class Crawler {
  emailLeadsEvents = [];
  cacher = null;
  queue;
+ scraperQueue;
  isCacheEnabled;
  config;
  urlStorage;
@@ -56,13 +61,12 @@ class Crawler {
  navigationHistoryInitPromise = null;
  adapterExecutor = null;
  adapterType;
- pendingExecutions = new Set;
- pendingVisitCount = 0;
  isDestroyed = false;
- queueOptions = { concurrency: 1000 };
+ shutdownRequested = false;
+ queueOptions = { concurrency: 100 };
  robotsTxt;
- domainResponseTimes = new Map;
- domainCurrentDelay = new Map;
+ domainResponseTimes = new CappedMap({ maxSize: 500 });
+ domainCurrentDelay = new CappedMap({ maxSize: 500 });
  crawlStats = {
  urlsVisited: 0,
  urlsQueued: 0,
@@ -70,19 +74,40 @@ class Crawler {
  startTime: 0,
  currentDepth: 0
  };
- urlDepthMap = new Map;
+ urlDepthMap = new CappedMap({ maxSize: 50000 });
+ cleanupInterval;
+ checkpointInterval;
+ lastCheckpointTime = 0;
+ memoryMonitor;
+ healthMetrics;
+ originalConcurrency = 100;
+ shutdownHandler = null;
  startHandlers = [];
  finishHandlers = [];
  redirectHandlers = [];
- collectedData = [];
+ collectedData = new CappedArray({
+ maxSize: 1e5,
+ evictionRatio: 0.1,
+ onEviction: (evicted, remaining) => {
+ console.warn(`[Crawler] collectedData auto-evicted ${evicted.length} oldest entries. ${remaining} entries remaining. Consider using exportData() more frequently.`);
+ }
+ });
  crawlStarted = false;
- constructor(crawlerOptions, http = new Rezo) {
+ startHandlersPromise = null;
+ constructor(crawlerOptions, http = rezo.create()) {
  this.http = http;
- this.queue = new RezoQueue({
- concurrency: 1000
- });
  this.config = new CrawlerOptions(crawlerOptions);
  this.adapterType = this.config.adapter;
+ const concurrency = this.config.concurrency;
+ this.queue = new RezoQueue({
+ concurrency
+ });
+ this.originalConcurrency = concurrency;
+ this.scraperQueue = new RezoQueue({
+ concurrency: this.config.scraperConcurrency
+ });
+ this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
+ this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
  const enableCache = this.config.enableCache;
  this.isCacheEnabled = enableCache;
  if (enableCache) {
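
This hunk targets the main leak vectors of long crawls: per-domain throttle state, the URL depth map, and `collectedData` were previously unbounded and lived for the life of the crawl. They are replaced with `CappedMap` and `CappedArray`, whose implementations are not part of this excerpt; the sketch below is an assumption reconstructed only from the constructor options visible here (`maxSize`, `evictionRatio`, `onEviction`) and the `toArray()`/`clear()` calls used later in the file, so the real `plugin/capped-array.cjs` may differ.

```js
// Hypothetical sketch of a FIFO-evicting bounded array, inferred from the
// options visible in this diff; not the actual CappedArray implementation.
class CappedArraySketch {
  constructor({ maxSize, evictionRatio = 0.1, onEviction } = {}) {
    this.maxSize = maxSize;
    this.evictionRatio = evictionRatio;
    this.onEviction = onEviction;
    this.items = [];
  }
  push(item) {
    this.items.push(item);
    if (this.items.length > this.maxSize) {
      // Drop the oldest slice in one batch instead of one element at a time.
      const evictCount = Math.max(1, Math.floor(this.maxSize * this.evictionRatio));
      const evicted = this.items.splice(0, evictCount);
      if (this.onEviction) this.onEviction(evicted, this.items.length);
    }
  }
  toArray() { return [...this.items]; }
  clear() { this.items = []; }
}
```
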
@@ -94,7 +119,7 @@ class Crawler {
  FileCacher.create({
  cacheDir: dbUrl,
  ttl: cacheTTL,
- encryptNamespace: true
+ maxEntries: 1e5
  }).then((storage) => {
  this.cacher = storage;
  this.isCacheReady = true;
@@ -151,6 +176,58 @@ class Crawler {
  if (this.config.baseUrl) {
  this.urlDepthMap.set(this.config.baseUrl, 0);
  }
+ if (this.config.enableSignalHandlers) {
+ this.registerShutdownHandlers();
+ }
+ }
+ registerShutdownHandlers() {
+ if (this.shutdownHandler)
+ return;
+ this.shutdownHandler = () => this.gracefulShutdown();
+ process.on("SIGINT", this.shutdownHandler);
+ process.on("SIGTERM", this.shutdownHandler);
+ }
+ removeShutdownHandlers() {
+ if (this.shutdownHandler) {
+ process.off("SIGINT", this.shutdownHandler);
+ process.off("SIGTERM", this.shutdownHandler);
+ this.shutdownHandler = null;
+ }
+ }
+ async gracefulShutdown() {
+ if (this.shutdownRequested || this.isDestroyed)
+ return;
+ this.shutdownRequested = true;
+ console.log(`
+ [Crawler] Shutdown requested, finishing current tasks...`);
+ this.queue.pause();
+ this.scraperQueue.pause();
+ const timeoutPromise = new Promise((resolve) => {
+ setTimeout(() => {
+ console.log("[Crawler] Shutdown timeout (5s), forcing exit...");
+ resolve();
+ }, 5000);
+ });
+ await Promise.race([
+ Promise.all([this.queue.onIdle(), this.scraperQueue.onIdle()]),
+ timeoutPromise
+ ]);
+ if (this.navigationHistory && this.currentSession) {
+ try {
+ await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
+ await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
+ urlsVisited: this.crawlStats.urlsVisited,
+ urlsQueued: this.queue.size,
+ urlsFailed: this.crawlStats.urlsFailed
+ });
+ console.log(`[Crawler] Session saved: ${this.currentSession.sessionId}`);
+ } catch (err) {
+ console.warn("[Crawler] Failed to save session state:", err);
+ }
+ }
+ await this.destroy();
+ console.log("[Crawler] Graceful shutdown complete");
+ process.exit(0);
  }
  async initializeAdapter() {
  try {
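
Signal handling is opt-in through the new `enableSignalHandlers` option. When enabled, SIGINT/SIGTERM run `gracefulShutdown()`, which pauses both queues, races their `onIdle()` against a 5-second timeout, marks the navigation-history session `"paused"` with its final counters, then destroys the crawler and exits. A hedged usage sketch — the require path is a guess; only the option and method names come from this hunk:

```js
// Hedged usage sketch; 'rezo/crawler' is a hypothetical entry point.
const { Crawler } = require('rezo/crawler');

const crawler = new Crawler({
  baseUrl: 'https://example.com',
  enableNavigationHistory: true, // lets the session be saved as "paused"
  enableSignalHandlers: true     // SIGINT/SIGTERM now drain and exit cleanly
});

// Without enableSignalHandlers, the same path can be wired manually:
// process.once('SIGINT', () => crawler.gracefulShutdown());
```
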
@@ -166,7 +243,6 @@ class Crawler {
  if (!this.isDestroyed)
  return;
  this.queue = new RezoQueue(this.queueOptions);
- this.pendingExecutions.clear();
  this.isDestroyed = false;
  if (this.config.debug) {
  console.log("[Crawler] Restored from destroyed state");
@@ -342,17 +418,110 @@ class Crawler {
  handler(data);
  });
  }
- async waitForCache() {
+ async waitForCache(timeoutMs = 30000) {
  if (this.isCacheReady)
  return;
- await this.sleep(this.rnd(50, 200));
- await this.waitForCache();
+ const start = Date.now();
+ while (!this.isCacheReady) {
+ if (Date.now() - start > timeoutMs) {
+ console.warn("[Crawler] Cache initialization timeout, continuing without cache");
+ this.isCacheReady = true;
+ return;
+ }
+ await new Promise((resolve) => setTimeout(resolve, 100));
+ }
  }
- async waitForStorage() {
+ async waitForStorage(timeoutMs = 30000) {
  if (this.isStorageReady)
  return;
- await this.sleep(this.rnd(50, 200));
- await this.waitForStorage();
+ const start = Date.now();
+ while (!this.isStorageReady) {
+ if (Date.now() - start > timeoutMs) {
+ console.warn("[Crawler] Storage initialization timeout, continuing without URL tracking");
+ this.isStorageReady = true;
+ return;
+ }
+ await new Promise((resolve) => setTimeout(resolve, 100));
+ }
+ }
+ startPeriodicCleanup() {
+ if (this.cleanupInterval)
+ return;
+ this.cleanupInterval = setInterval(() => {
+ const memStatus = this.memoryMonitor.check();
+ if (memStatus === "critical") {
+ this.queue.pause();
+ this.memoryMonitor.forceGC();
+ if (this.config.debug) {
+ const report = this.memoryMonitor.getReport();
+ console.warn(`[Crawler] CRITICAL memory (${report.usagePercent}%), pausing...`);
+ }
+ setTimeout(() => {
+ this.queue.concurrency = Math.max(5, Math.floor(this.originalConcurrency * 0.25));
+ this.queue.start();
+ }, 3000);
+ } else if (memStatus === "warning") {
+ const newConcurrency = Math.max(10, Math.floor(this.originalConcurrency * 0.5));
+ if (this.queue.concurrency > newConcurrency) {
+ this.queue.concurrency = newConcurrency;
+ if (this.config.debug) {
+ const report = this.memoryMonitor.getReport();
+ console.warn(`[Crawler] High memory (${report.usagePercent}%), reducing concurrency to ${newConcurrency}`);
+ }
+ }
+ } else {
+ if (this.queue.concurrency < this.originalConcurrency) {
+ this.queue.concurrency = Math.min(this.originalConcurrency, this.queue.concurrency + 10);
+ }
+ }
+ }, 30000);
+ if (this.cleanupInterval.unref) {
+ this.cleanupInterval.unref();
+ }
+ if (this.config.enableNavigationHistory) {
+ this.startAutoCheckpoint();
+ }
+ }
+ startAutoCheckpoint() {
+ if (this.checkpointInterval)
+ return;
+ const CHECKPOINT_INTERVAL = 5 * 60 * 1000;
+ this.checkpointInterval = setInterval(async () => {
+ if (this.shutdownRequested || this.isDestroyed)
+ return;
+ try {
+ await this.saveCheckpoint();
+ } catch (error) {
+ if (this.config.debug) {
+ console.error("[Crawler] Checkpoint save failed:", error);
+ }
+ }
+ }, CHECKPOINT_INTERVAL);
+ if (this.checkpointInterval.unref) {
+ this.checkpointInterval.unref();
+ }
+ }
+ async saveCheckpoint() {
+ if (!this.navigationHistory || !this.currentSession)
+ return;
+ const now = Date.now();
+ if (now - this.lastCheckpointTime < 60000)
+ return;
+ try {
+ await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
+ urlsVisited: this.crawlStats.urlsVisited,
+ urlsQueued: this.crawlStats.urlsQueued,
+ urlsFailed: this.crawlStats.urlsFailed
+ });
+ this.lastCheckpointTime = now;
+ if (this.config.debug) {
+ console.log(`[Crawler] Checkpoint saved: ${this.crawlStats.urlsVisited} visited, ${this.crawlStats.urlsFailed} failed`);
+ }
+ } catch (error) {
+ if (this.config.debug) {
+ console.error("[Crawler] Failed to save checkpoint:", error);
+ }
+ }
  }
  async saveUrl(url) {
  await this.waitForStorage();
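
`waitForCache`/`waitForStorage` previously recursed with random 50-200 ms sleeps and would spin forever if initialization never completed; they now poll every 100 ms against a 30 s deadline and degrade gracefully (continue without cache or URL tracking). The new 30 s cleanup tick applies the same bounded stance to memory: critical pressure pauses the queue, forces GC, and restarts at 25% concurrency; warning pressure halves concurrency; a healthy reading ramps it back up by 10 per tick. The deadline pattern generalizes; a minimal sketch with illustrative names:

```js
// Generic bounded-wait helper mirroring the pattern this hunk applies to
// waitForCache/waitForStorage: poll a readiness flag against a hard
// deadline instead of unbounded recursion. Names are illustrative.
async function waitUntil(isReady, { timeoutMs = 30000, pollMs = 100, onTimeout } = {}) {
  const start = Date.now();
  while (!isReady()) {
    if (Date.now() - start > timeoutMs) {
      if (onTimeout) onTimeout();
      return false; // gave up; the caller decides how to degrade
    }
    await new Promise((resolve) => setTimeout(resolve, pollMs));
  }
  return true;
}

// e.g. await waitUntil(() => this.isCacheReady, {
//   onTimeout: () => console.warn('cache init timeout, continuing without cache')
// });
```
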
@@ -696,14 +865,14 @@ class Crawler {
  return this;
  }
  getCollectedData() {
- return [...this.collectedData];
+ return this.collectedData.toArray();
  }
  clearCollectedData() {
- this.collectedData = [];
+ this.collectedData.clear();
  return this;
  }
  async exportData(filePath, format = "json") {
- const data = this.collectedData;
+ const data = this.collectedData.toArray();
  if (data.length === 0) {
  if (this.config.debug) {
  console.warn("[Crawler] No data to export");
@@ -764,20 +933,36 @@ class Crawler {
  getStats() {
  return { ...this.crawlStats };
  }
+ getHealthSnapshot() {
+ return this.healthMetrics.getSnapshot(this.queue.size, this.queue.pending);
+ }
+ isHealthy(options) {
+ return this.healthMetrics.isHealthy(options);
+ }
+ getPrometheusMetrics(prefix = "crawler") {
+ return this.healthMetrics.toPrometheusFormat(prefix);
+ }
  async triggerStartHandlers() {
  if (this.crawlStarted)
  return;
+ if (this.startHandlersPromise) {
+ return this.startHandlersPromise;
+ }
  this.crawlStarted = true;
- this.crawlStats.startTime = Date.now();
- for (const handler of this.startHandlers) {
- try {
- this.queue.add(() => handler());
- } catch (error) {
- if (this.config.debug) {
- console.error("[Crawler] onStart handler error:", error);
+ this.startHandlersPromise = (async () => {
+ this.crawlStats.startTime = Date.now();
+ this.startPeriodicCleanup();
+ for (const handler of this.startHandlers) {
+ try {
+ this.queue.add(() => handler());
+ } catch (error) {
+ if (this.config.debug) {
+ console.error("[Crawler] onStart handler error:", error);
+ }
  }
  }
- }
+ })();
+ return this.startHandlersPromise;
  }
  async triggerFinishHandlers() {
  this.crawlStats.endTime = Date.now();
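
1.0.74 exposes the new `HealthMetrics` plugin through three public methods: `getHealthSnapshot()` (which folds in the main queue's size and pending counts), `isHealthy(options)`, and `getPrometheusMetrics(prefix)`. A hedged sketch of exposing them over HTTP — only the method signatures come from this hunk; the snapshot shape and `isHealthy()` option names are assumptions, since `plugin/health-metrics.cjs` is not part of this excerpt:

```js
// Hedged monitoring endpoint sketch built on the methods added in this hunk.
const http = require('node:http');

function serveMetrics(crawler, port = 9100) {
  return http.createServer((req, res) => {
    if (req.url === '/metrics') {
      // Prometheus text exposition, prefixed "crawler_..." by default.
      res.setHeader('Content-Type', 'text/plain; version=0.0.4');
      res.end(crawler.getPrometheusMetrics('crawler'));
    } else if (req.url === '/health') {
      res.statusCode = crawler.isHealthy() ? 200 : 503;
      res.end(JSON.stringify(crawler.getHealthSnapshot()));
    } else {
      res.statusCode = 404;
      res.end();
    }
  }).listen(port);
}
```

Note also that `triggerStartHandlers()` is now idempotent under concurrency: a memoized `startHandlersPromise` means overlapping calls share one execution, and `startPeriodicCleanup()` is wired in at crawl start.
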
@@ -872,55 +1057,55 @@ class Crawler {
  const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
  this.addToNavigationQueue(url, method, body, headersObj);
  }
- if (url.includes(`/www.yellowpages.com/search?`))
- console.log("Visiting: ", url);
+ this.crawlStarted = true;
  if (deepEmailFinder) {
- this.pendingVisitCount++;
- const p = this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
- this.pendingExecutions.add(p);
- p.finally(() => this.pendingExecutions.delete(p));
+ this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
  return this;
  }
- this.pendingVisitCount++;
- const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
- this.pendingExecutions.add(p);
- p.finally(() => this.pendingExecutions.delete(p));
+ this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
  return this;
  }
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata) {
- await this.waitForStorage();
- if (this.isCacheEnabled) {
- await this.waitForCache();
- }
- if (this.config.enableNavigationHistory) {
- await this.waitForNavigationHistory();
- }
- const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata));
- this.pendingVisitCount--;
- task.finally(() => this.pendingExecutions.delete(task));
+ this.queue.add(async () => {
+ await this.waitForStorage();
+ if (this.isCacheEnabled) {
+ await this.waitForCache();
+ }
+ if (this.config.enableNavigationHistory) {
+ await this.waitForNavigationHistory();
+ }
+ await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata);
+ }).catch((err) => {
+ if (this.config.debug)
+ console.warn("[Crawler] execute() task error:", err?.message);
+ });
  }
  async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
- await this.waitForStorage();
- if (this.isCacheEnabled) {
- await this.waitForCache();
- }
- if (this.config.enableNavigationHistory) {
- await this.waitForNavigationHistory();
- }
- this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
- httpConfig: options,
- saveCache: this.saveCache.bind(this),
- saveUrl: this.saveUrl.bind(this),
- getCache: this.getCache.bind(this),
- hasUrlInCache: this.hasUrlInCache.bind(this),
- onEmailDiscovered: this.emailDiscoveredEvents,
- onEmails: this.emailLeadsEvents,
- queue: this.queue,
- depth: 1,
- allowCrossDomainTravel: true,
- emailMetadata
- }, forceRevisit, true)).then();
- this.pendingVisitCount--;
+ this.scraperQueue.add(async () => {
+ await this.waitForStorage();
+ if (this.isCacheEnabled) {
+ await this.waitForCache();
+ }
+ if (this.config.enableNavigationHistory) {
+ await this.waitForNavigationHistory();
+ }
+ await this.leadsFinder.parseExternalWebsite(url, method, body, {
+ httpConfig: options,
+ saveCache: this.saveCache.bind(this),
+ saveUrl: this.saveUrl.bind(this),
+ getCache: this.getCache.bind(this),
+ hasUrlInCache: this.hasUrlInCache.bind(this),
+ onEmailDiscovered: this.emailDiscoveredEvents,
+ onEmails: this.emailLeadsEvents,
+ queue: this.scraperQueue,
+ depth: 1,
+ allowCrossDomainTravel: true,
+ emailMetadata
+ }, forceRevisit, true);
+ }).catch((err) => {
+ if (this.config.debug)
+ console.warn("[Crawler] execute2() task error:", err?.message);
+ });
  }
  async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl, skipCache, emailMetadata) {
  try {
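
`visit()` drops the `pendingExecutions`/`pendingVisitCount` bookkeeping entirely (along with a leftover yellowpages.com debug log): `execute()` and `execute2()` now enqueue self-contained tasks — the storage/cache/history readiness waits moved inside the task body, so they no longer block the caller — and attach a `.catch` so a rejected task logs in debug mode instead of surfacing as an unhandled promise rejection. Deep email-finder work also moves off the main queue onto the dedicated `scraperQueue`. The core pattern, with illustrative names:

```js
// Minimal sketch of the fire-and-forget pattern used by execute()/execute2():
// enqueue a self-contained async task and log (rather than rethrow) its
// rejection, so it can never become an unhandled promise rejection.
function enqueue(queue, task, debug = false) {
  queue.add(task).catch((err) => {
    if (debug) console.warn('[Crawler] task error:', err?.message);
  });
}
```
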
@@ -947,7 +1132,15 @@ class Crawler {
  return;
  }
  const requestStartTime = Date.now();
- const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
+ const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : method === "GET" ? await this.http.get(url, options) : method === "PATCH" ? await this.http.patch(url, body, options) : method === "POST" ? await this.http.post(url, body, options) : await this.http.put(url, body, options);
+ if (!response) {
+ this.crawlStats.urlsFailed++;
+ this.healthMetrics.recordRequest(Date.now() - requestStartTime, false);
+ if (this.config.debug) {
+ console.log(`[Crawler] Request failed for ${url}`);
+ }
+ return;
+ }
  if (!cache) {
  const responseTime = Date.now() - requestStartTime;
  this.calculateAutoThrottleDelay(domain, responseTime);
@@ -973,6 +1166,8 @@ class Crawler {
  }
  }
  this.crawlStats.urlsVisited++;
+ const finalResponseTime = cache ? 0 : Date.now() - requestStartTime;
+ this.healthMetrics.recordRequest(finalResponseTime, true);
  if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
  await this.triggerRedirectHandlers({
  originalUrl: url,
@@ -992,7 +1187,7 @@ class Crawler {
  });
  if (res.contentType && res.contentType.includes("/json")) {
  if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
- this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue, emailMetadata);
+ this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
  }
  for (let i = 0;i < this.jsonEvents.length; i++) {
  const event = this.jsonEvents[i];
@@ -1007,7 +1202,7 @@ class Crawler {
  if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
  return;
  if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
- this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue, emailMetadata);
+ this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
  }
  const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
  document.URL = res.finalUrl;
@@ -1072,23 +1267,44 @@
  }
  }
  async waitForAll() {
- if (this.pendingVisitCount === 0 && this.pendingExecutions.size === 0 && this.queue.size === 0 && this.queue.pending === 0 && !this.crawlStarted) {
- await this.triggerFinishHandlers();
- await this.destroy();
- return;
- }
- while (true) {
- while (this.pendingVisitCount > 0) {
+ if (!this.crawlStarted) {
+ await new Promise((resolve) => setImmediate(resolve));
+ const maxWaitForStart = 1000;
+ const startWait = Date.now();
+ while (!this.crawlStarted && Date.now() - startWait < maxWaitForStart) {
  await new Promise((resolve) => setTimeout(resolve, 10));
  }
- if (this.pendingExecutions.size > 0) {
- await Promise.allSettled([...this.pendingExecutions]);
- }
- if (this.queue.size > 0 || this.queue.pending > 0) {
- await this.queue.onIdle();
+ if (!this.crawlStarted) {
+ return;
  }
- if (this.pendingVisitCount === 0 && this.pendingExecutions.size === 0 && this.queue.size === 0 && this.queue.pending === 0) {
- break;
+ }
+ const MIN_DELAY = 50;
+ const MAX_DELAY = 500;
+ let currentDelay = MIN_DELAY;
+ let consecutiveIdleChecks = 0;
+ const REQUIRED_IDLE_CHECKS = 3;
+ while (true) {
+ await this.queue.onIdle();
+ await new Promise((resolve) => setTimeout(resolve, currentDelay));
+ const scraperSize = this.scraperQueue.size;
+ const scraperPending = this.scraperQueue.pending;
+ if (scraperSize === 0 && scraperPending === 0) {
+ const queueSize = this.queue.size;
+ const queuePending = this.queue.pending;
+ if (queueSize === 0 && queuePending === 0) {
+ consecutiveIdleChecks++;
+ if (consecutiveIdleChecks >= REQUIRED_IDLE_CHECKS) {
+ break;
+ }
+ currentDelay = Math.max(MIN_DELAY, currentDelay / 2);
+ } else {
+ consecutiveIdleChecks = 0;
+ currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
+ }
+ } else {
+ consecutiveIdleChecks = 0;
+ currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
+ await this.scraperQueue.onIdle();
  }
  }
  await this.triggerFinishHandlers();
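
`waitForAll()` now requires three consecutive idle observations of both queues before declaring the crawl finished, with a poll delay that backs off (×1.5 up to 500 ms) while work remains and tightens (÷2 down to 50 ms) as the queues drain. This guards against the race where one queue momentarily reads idle while the other is about to feed it more tasks. A standalone sketch of the settled-idle pattern, generalized to any number of queues — the queue objects are assumed to expose `size`, `pending`, and `onIdle()` as in this diff:

```js
// Sketch of the consecutive-idle-checks drain pattern from waitForAll(),
// with adaptive poll delay. Illustrative helper, not the library API.
async function waitForQueuesToSettle(queues, { required = 3, min = 50, max = 500 } = {}) {
  let delay = min;
  let idleChecks = 0;
  while (idleChecks < required) {
    await Promise.all(queues.map((q) => q.onIdle()));
    await new Promise((resolve) => setTimeout(resolve, delay));
    const idle = queues.every((q) => q.size === 0 && q.pending === 0);
    if (idle) {
      idleChecks++;                      // one more settled observation
      delay = Math.max(min, delay / 2);  // poll faster as we converge
    } else {
      idleChecks = 0;                    // new work appeared; start over
      delay = Math.min(max, delay * 1.5);
    }
  }
}
```
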
@@ -1112,7 +1328,18 @@
  if (this.isDestroyed)
  return;
  this.isDestroyed = true;
+ this.removeShutdownHandlers();
+ if (this.cleanupInterval) {
+ clearInterval(this.cleanupInterval);
+ this.cleanupInterval = undefined;
+ }
+ if (this.checkpointInterval) {
+ clearInterval(this.checkpointInterval);
+ this.checkpointInterval = undefined;
+ }
+ this.memoryMonitor.destroy();
  this.queue.destroy();
+ this.scraperQueue.destroy();
  this.config.destroyLimiters();
  this.events.length = 0;
  this.jsonEvents.length = 0;
@@ -1121,6 +1348,10 @@
  this.rawResponseEvents.length = 0;
  this.emailDiscoveredEvents.length = 0;
  this.emailLeadsEvents.length = 0;
+ this.domainResponseTimes.clear();
+ this.domainCurrentDelay.clear();
+ this.urlDepthMap.clear();
+ this.collectedData.clear();
  await this.close();
  resetGlobalAgentPool();
  }