rezo 1.0.71 → 1.0.73

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/dist/adapters/entries/curl.d.ts +13 -2
  2. package/dist/adapters/entries/fetch.d.ts +13 -2
  3. package/dist/adapters/entries/http.d.ts +13 -2
  4. package/dist/adapters/entries/http2.d.ts +13 -2
  5. package/dist/adapters/entries/react-native.d.ts +13 -2
  6. package/dist/adapters/entries/xhr.d.ts +13 -2
  7. package/dist/adapters/index.cjs +6 -6
  8. package/dist/cache/index.cjs +9 -9
  9. package/dist/crawler/crawler-options.cjs +1 -1
  10. package/dist/crawler/crawler-options.js +1 -1
  11. package/dist/crawler/crawler.cjs +307 -86
  12. package/dist/crawler/crawler.js +307 -86
  13. package/dist/crawler/index.cjs +40 -40
  14. package/dist/crawler/plugin/capped-array.cjs +1 -0
  15. package/dist/crawler/plugin/capped-array.js +1 -0
  16. package/dist/crawler/plugin/capped-map.cjs +1 -0
  17. package/dist/crawler/plugin/capped-map.js +1 -0
  18. package/dist/crawler/plugin/file-cacher.cjs +20 -18
  19. package/dist/crawler/plugin/file-cacher.js +20 -18
  20. package/dist/crawler/plugin/health-metrics.cjs +2 -0
  21. package/dist/crawler/plugin/health-metrics.js +2 -0
  22. package/dist/crawler/plugin/index.cjs +1 -1
  23. package/dist/crawler/plugin/index.js +1 -1
  24. package/dist/crawler/plugin/memory-monitor.cjs +1 -0
  25. package/dist/crawler/plugin/memory-monitor.js +1 -0
  26. package/dist/crawler/plugin/navigation-history.cjs +5 -5
  27. package/dist/crawler/plugin/navigation-history.js +3 -3
  28. package/dist/crawler/plugin/result-stream.cjs +5 -0
  29. package/dist/crawler/plugin/result-stream.js +5 -0
  30. package/dist/crawler/plugin/sqlite-utils.cjs +1 -0
  31. package/dist/crawler/plugin/sqlite-utils.js +1 -0
  32. package/dist/crawler/plugin/url-store.cjs +5 -5
  33. package/dist/crawler/plugin/url-store.js +5 -5
  34. package/dist/crawler/scraper.cjs +1 -1
  35. package/dist/crawler/scraper.js +1 -1
  36. package/dist/crawler.d.ts +148 -25
  37. package/dist/entries/crawler.cjs +4 -4
  38. package/dist/index.cjs +30 -30
  39. package/dist/index.d.ts +13 -2
  40. package/dist/internal/agents/index.cjs +10 -10
  41. package/dist/platform/browser.d.ts +13 -2
  42. package/dist/platform/bun.d.ts +13 -2
  43. package/dist/platform/deno.d.ts +13 -2
  44. package/dist/platform/node.d.ts +13 -2
  45. package/dist/platform/react-native.d.ts +13 -2
  46. package/dist/platform/worker.d.ts +13 -2
  47. package/dist/proxy/index.cjs +4 -4
  48. package/dist/queue/index.cjs +8 -8
  49. package/dist/queue/queue.cjs +58 -13
  50. package/dist/queue/queue.js +58 -13
  51. package/dist/responses/universal/index.cjs +11 -11
  52. package/dist/utils/agent-pool.cjs +2 -0
  53. package/dist/utils/agent-pool.js +2 -0
  54. package/dist/utils/compression.cjs +6 -6
  55. package/dist/utils/compression.js +6 -6
  56. package/dist/version.cjs +1 -1
  57. package/dist/version.js +1 -1
  58. package/dist/wget/index.cjs +49 -49
  59. package/dist/wget/index.d.ts +12 -1
  60. package/package.json +1 -1
@@ -3,6 +3,10 @@ import { FileCacher } from './plugin/file-cacher.js';
3
3
  import { UrlStore } from './plugin/url-store.js';
4
4
  import { NavigationHistory } from './plugin/navigation-history.js';
5
5
  import { RobotsTxt } from './plugin/robots-txt.js';
6
+ import { MemoryMonitor } from './plugin/memory-monitor.js';
7
+ import { HealthMetrics } from './plugin/health-metrics.js';
8
+ import { CappedMap } from './plugin/capped-map.js';
9
+ import { CappedArray } from './plugin/capped-array.js';
6
10
  import { parseHTML } from "linkedom";
7
11
  import path from "node:path";
8
12
  import { Rezo } from '../adapters/entries/http.js';
@@ -43,6 +47,7 @@ export class Crawler {
43
47
  emailLeadsEvents = [];
44
48
  cacher = null;
45
49
  queue;
50
+ scraperQueue;
46
51
  isCacheEnabled;
47
52
  config;
48
53
  urlStorage;
@@ -56,13 +61,12 @@ export class Crawler {
56
61
  navigationHistoryInitPromise = null;
57
62
  adapterExecutor = null;
58
63
  adapterType;
59
- pendingExecutions = new Set;
60
- pendingVisitCount = 0;
61
64
  isDestroyed = false;
62
- queueOptions = { concurrency: 1000 };
65
+ shutdownRequested = false;
66
+ queueOptions = { concurrency: 100 };
63
67
  robotsTxt;
64
- domainResponseTimes = new Map;
65
- domainCurrentDelay = new Map;
68
+ domainResponseTimes = new CappedMap({ maxSize: 500 });
69
+ domainCurrentDelay = new CappedMap({ maxSize: 500 });
66
70
  crawlStats = {
67
71
  urlsVisited: 0,
68
72
  urlsQueued: 0,
@@ -70,19 +74,42 @@ export class Crawler {
70
74
  startTime: 0,
71
75
  currentDepth: 0
72
76
  };
73
- urlDepthMap = new Map;
77
+ urlDepthMap = new CappedMap({ maxSize: 50000 });
78
+ cleanupInterval;
79
+ checkpointInterval;
80
+ lastCheckpointTime = 0;
81
+ memoryMonitor;
82
+ healthMetrics;
83
+ originalConcurrency = 100;
84
+ shutdownHandler = null;
74
85
  startHandlers = [];
75
86
  finishHandlers = [];
76
87
  redirectHandlers = [];
77
- collectedData = [];
88
+ collectedData = new CappedArray({
89
+ maxSize: 1e5,
90
+ evictionRatio: 0.1,
91
+ onEviction: (evicted, remaining) => {
92
+ console.warn(`[Crawler] collectedData auto-evicted ${evicted.length} oldest entries. ${remaining} entries remaining. Consider using exportData() more frequently.`);
93
+ }
94
+ });
78
95
  crawlStarted = false;
96
+ startHandlersPromise = null;
79
97
  constructor(crawlerOptions, http = new Rezo) {
80
98
  this.http = http;
81
- this.queue = new RezoQueue({
82
- concurrency: 1000
83
- });
84
99
  this.config = new CrawlerOptions(crawlerOptions);
85
100
  this.adapterType = this.config.adapter;
101
+ const concurrency = this.config.concurrency;
102
+ this.queue = new RezoQueue({
103
+ concurrency,
104
+ timeout: 60000
105
+ });
106
+ this.originalConcurrency = concurrency;
107
+ this.scraperQueue = new RezoQueue({
108
+ concurrency: this.config.scraperConcurrency,
109
+ timeout: 60000
110
+ });
111
+ this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
112
+ this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
86
113
  const enableCache = this.config.enableCache;
87
114
  this.isCacheEnabled = enableCache;
88
115
  if (enableCache) {
@@ -94,7 +121,7 @@ export class Crawler {
94
121
  FileCacher.create({
95
122
  cacheDir: dbUrl,
96
123
  ttl: cacheTTL,
97
- encryptNamespace: true
124
+ maxEntries: 1e5
98
125
  }).then((storage) => {
99
126
  this.cacher = storage;
100
127
  this.isCacheReady = true;
@@ -151,6 +178,55 @@ export class Crawler {
151
178
  if (this.config.baseUrl) {
152
179
  this.urlDepthMap.set(this.config.baseUrl, 0);
153
180
  }
181
+ this.registerShutdownHandlers();
182
+ }
183
+ registerShutdownHandlers() {
184
+ if (this.shutdownHandler)
185
+ return;
186
+ this.shutdownHandler = () => this.gracefulShutdown();
187
+ process.on("SIGINT", this.shutdownHandler);
188
+ process.on("SIGTERM", this.shutdownHandler);
189
+ }
190
+ removeShutdownHandlers() {
191
+ if (this.shutdownHandler) {
192
+ process.off("SIGINT", this.shutdownHandler);
193
+ process.off("SIGTERM", this.shutdownHandler);
194
+ this.shutdownHandler = null;
195
+ }
196
+ }
197
+ async gracefulShutdown() {
198
+ if (this.shutdownRequested || this.isDestroyed)
199
+ return;
200
+ this.shutdownRequested = true;
201
+ console.log(`
202
+ [Crawler] Shutdown requested, finishing current tasks...`);
203
+ this.queue.pause();
204
+ this.scraperQueue.pause();
205
+ const timeoutPromise = new Promise((resolve) => {
206
+ setTimeout(() => {
207
+ console.log("[Crawler] Shutdown timeout (5s), forcing exit...");
208
+ resolve();
209
+ }, 5000);
210
+ });
211
+ await Promise.race([
212
+ Promise.all([this.queue.onIdle(), this.scraperQueue.onIdle()]),
213
+ timeoutPromise
214
+ ]);
215
+ if (this.navigationHistory && this.currentSession) {
216
+ try {
217
+ await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
218
+ await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
219
+ urlsVisited: this.crawlStats.urlsVisited,
220
+ urlsQueued: this.queue.size,
221
+ urlsFailed: this.crawlStats.urlsFailed
222
+ });
223
+ console.log(`[Crawler] Session saved: ${this.currentSession.sessionId}`);
224
+ } catch (err) {
225
+ console.warn("[Crawler] Failed to save session state:", err);
226
+ }
227
+ }
228
+ await this.destroy();
229
+ console.log("[Crawler] Graceful shutdown complete");
154
230
  }
155
231
  async initializeAdapter() {
156
232
  try {
@@ -166,7 +242,6 @@ export class Crawler {
166
242
  if (!this.isDestroyed)
167
243
  return;
168
244
  this.queue = new RezoQueue(this.queueOptions);
169
- this.pendingExecutions.clear();
170
245
  this.isDestroyed = false;
171
246
  if (this.config.debug) {
172
247
  console.log("[Crawler] Restored from destroyed state");
@@ -342,17 +417,110 @@ export class Crawler {
342
417
  handler(data);
343
418
  });
344
419
  }
345
- async waitForCache() {
420
+ async waitForCache(timeoutMs = 30000) {
346
421
  if (this.isCacheReady)
347
422
  return;
348
- await this.sleep(this.rnd(50, 200));
349
- await this.waitForCache();
423
+ const start = Date.now();
424
+ while (!this.isCacheReady) {
425
+ if (Date.now() - start > timeoutMs) {
426
+ console.warn("[Crawler] Cache initialization timeout, continuing without cache");
427
+ this.isCacheReady = true;
428
+ return;
429
+ }
430
+ await new Promise((resolve) => setTimeout(resolve, 100));
431
+ }
350
432
  }
351
- async waitForStorage() {
433
+ async waitForStorage(timeoutMs = 30000) {
352
434
  if (this.isStorageReady)
353
435
  return;
354
- await this.sleep(this.rnd(50, 200));
355
- await this.waitForStorage();
436
+ const start = Date.now();
437
+ while (!this.isStorageReady) {
438
+ if (Date.now() - start > timeoutMs) {
439
+ console.warn("[Crawler] Storage initialization timeout, continuing without URL tracking");
440
+ this.isStorageReady = true;
441
+ return;
442
+ }
443
+ await new Promise((resolve) => setTimeout(resolve, 100));
444
+ }
445
+ }
446
+ startPeriodicCleanup() {
447
+ if (this.cleanupInterval)
448
+ return;
449
+ this.cleanupInterval = setInterval(() => {
450
+ const memStatus = this.memoryMonitor.check();
451
+ if (memStatus === "critical") {
452
+ this.queue.pause();
453
+ this.memoryMonitor.forceGC();
454
+ if (this.config.debug) {
455
+ const report = this.memoryMonitor.getReport();
456
+ console.warn(`[Crawler] CRITICAL memory (${report.usagePercent}%), pausing...`);
457
+ }
458
+ setTimeout(() => {
459
+ this.queue.concurrency = Math.max(5, Math.floor(this.originalConcurrency * 0.25));
460
+ this.queue.start();
461
+ }, 3000);
462
+ } else if (memStatus === "warning") {
463
+ const newConcurrency = Math.max(10, Math.floor(this.originalConcurrency * 0.5));
464
+ if (this.queue.concurrency > newConcurrency) {
465
+ this.queue.concurrency = newConcurrency;
466
+ if (this.config.debug) {
467
+ const report = this.memoryMonitor.getReport();
468
+ console.warn(`[Crawler] High memory (${report.usagePercent}%), reducing concurrency to ${newConcurrency}`);
469
+ }
470
+ }
471
+ } else {
472
+ if (this.queue.concurrency < this.originalConcurrency) {
473
+ this.queue.concurrency = Math.min(this.originalConcurrency, this.queue.concurrency + 10);
474
+ }
475
+ }
476
+ }, 30000);
477
+ if (this.cleanupInterval.unref) {
478
+ this.cleanupInterval.unref();
479
+ }
480
+ if (this.config.enableNavigationHistory) {
481
+ this.startAutoCheckpoint();
482
+ }
483
+ }
484
+ startAutoCheckpoint() {
485
+ if (this.checkpointInterval)
486
+ return;
487
+ const CHECKPOINT_INTERVAL = 5 * 60 * 1000;
488
+ this.checkpointInterval = setInterval(async () => {
489
+ if (this.shutdownRequested || this.isDestroyed)
490
+ return;
491
+ try {
492
+ await this.saveCheckpoint();
493
+ } catch (error) {
494
+ if (this.config.debug) {
495
+ console.error("[Crawler] Checkpoint save failed:", error);
496
+ }
497
+ }
498
+ }, CHECKPOINT_INTERVAL);
499
+ if (this.checkpointInterval.unref) {
500
+ this.checkpointInterval.unref();
501
+ }
502
+ }
503
+ async saveCheckpoint() {
504
+ if (!this.navigationHistory || !this.currentSession)
505
+ return;
506
+ const now = Date.now();
507
+ if (now - this.lastCheckpointTime < 60000)
508
+ return;
509
+ try {
510
+ await this.navigationHistory.updateSessionStats(this.currentSession.sessionId, {
511
+ urlsVisited: this.crawlStats.urlsVisited,
512
+ urlsQueued: this.crawlStats.urlsQueued,
513
+ urlsFailed: this.crawlStats.urlsFailed
514
+ });
515
+ this.lastCheckpointTime = now;
516
+ if (this.config.debug) {
517
+ console.log(`[Crawler] Checkpoint saved: ${this.crawlStats.urlsVisited} visited, ${this.crawlStats.urlsFailed} failed`);
518
+ }
519
+ } catch (error) {
520
+ if (this.config.debug) {
521
+ console.error("[Crawler] Failed to save checkpoint:", error);
522
+ }
523
+ }
356
524
  }
357
525
  async saveUrl(url) {
358
526
  await this.waitForStorage();
@@ -696,14 +864,14 @@ export class Crawler {
696
864
  return this;
697
865
  }
698
866
  getCollectedData() {
699
- return [...this.collectedData];
867
+ return this.collectedData.toArray();
700
868
  }
701
869
  clearCollectedData() {
702
- this.collectedData = [];
870
+ this.collectedData.clear();
703
871
  return this;
704
872
  }
705
873
  async exportData(filePath, format = "json") {
706
- const data = this.collectedData;
874
+ const data = this.collectedData.toArray();
707
875
  if (data.length === 0) {
708
876
  if (this.config.debug) {
709
877
  console.warn("[Crawler] No data to export");
@@ -764,20 +932,36 @@ export class Crawler {
764
932
  getStats() {
765
933
  return { ...this.crawlStats };
766
934
  }
935
+ getHealthSnapshot() {
936
+ return this.healthMetrics.getSnapshot(this.queue.size, this.queue.pending);
937
+ }
938
+ isHealthy(options) {
939
+ return this.healthMetrics.isHealthy(options);
940
+ }
941
+ getPrometheusMetrics(prefix = "crawler") {
942
+ return this.healthMetrics.toPrometheusFormat(prefix);
943
+ }
767
944
  async triggerStartHandlers() {
768
945
  if (this.crawlStarted)
769
946
  return;
947
+ if (this.startHandlersPromise) {
948
+ return this.startHandlersPromise;
949
+ }
770
950
  this.crawlStarted = true;
771
- this.crawlStats.startTime = Date.now();
772
- for (const handler of this.startHandlers) {
773
- try {
774
- this.queue.add(() => handler());
775
- } catch (error) {
776
- if (this.config.debug) {
777
- console.error("[Crawler] onStart handler error:", error);
951
+ this.startHandlersPromise = (async () => {
952
+ this.crawlStats.startTime = Date.now();
953
+ this.startPeriodicCleanup();
954
+ for (const handler of this.startHandlers) {
955
+ try {
956
+ this.queue.add(() => handler());
957
+ } catch (error) {
958
+ if (this.config.debug) {
959
+ console.error("[Crawler] onStart handler error:", error);
960
+ }
778
961
  }
779
962
  }
780
- }
963
+ })();
964
+ return this.startHandlersPromise;
781
965
  }
782
966
  async triggerFinishHandlers() {
783
967
  this.crawlStats.endTime = Date.now();
@@ -874,53 +1058,55 @@ export class Crawler {
874
1058
  }
875
1059
  if (url.includes(`/www.yellowpages.com/search?`))
876
1060
  console.log("Visiting: ", url);
1061
+ this.crawlStarted = true;
877
1062
  if (deepEmailFinder) {
878
- this.pendingVisitCount++;
879
- const p = this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
880
- this.pendingExecutions.add(p);
881
- p.finally(() => this.pendingExecutions.delete(p));
1063
+ this.execute2(method, url, body, _options, forceRevisit, emailMetadata);
882
1064
  return this;
883
1065
  }
884
- this.pendingVisitCount++;
885
- const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
886
- this.pendingExecutions.add(p);
887
- p.finally(() => this.pendingExecutions.delete(p));
1066
+ this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata);
888
1067
  return this;
889
1068
  }
890
1069
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, skipCache, emailMetadata) {
891
- await this.waitForStorage();
892
- if (this.isCacheEnabled) {
893
- await this.waitForCache();
894
- }
895
- if (this.config.enableNavigationHistory) {
896
- await this.waitForNavigationHistory();
897
- }
898
- const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata));
899
- this.pendingVisitCount--;
900
- task.finally(() => this.pendingExecutions.delete(task));
1070
+ this.queue.add(async () => {
1071
+ await this.waitForStorage();
1072
+ if (this.isCacheEnabled) {
1073
+ await this.waitForCache();
1074
+ }
1075
+ if (this.config.enableNavigationHistory) {
1076
+ await this.waitForNavigationHistory();
1077
+ }
1078
+ await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, 0, undefined, skipCache, emailMetadata);
1079
+ }).catch((err) => {
1080
+ if (this.config.debug)
1081
+ console.warn("[Crawler] execute() task error:", err?.message);
1082
+ });
901
1083
  }
902
1084
  async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
903
- await this.waitForStorage();
904
- if (this.isCacheEnabled) {
905
- await this.waitForCache();
906
- }
907
- if (this.config.enableNavigationHistory) {
908
- await this.waitForNavigationHistory();
909
- }
910
- this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
911
- httpConfig: options,
912
- saveCache: this.saveCache.bind(this),
913
- saveUrl: this.saveUrl.bind(this),
914
- getCache: this.getCache.bind(this),
915
- hasUrlInCache: this.hasUrlInCache.bind(this),
916
- onEmailDiscovered: this.emailDiscoveredEvents,
917
- onEmails: this.emailLeadsEvents,
918
- queue: this.queue,
919
- depth: 1,
920
- allowCrossDomainTravel: true,
921
- emailMetadata
922
- }, forceRevisit, true)).then();
923
- this.pendingVisitCount--;
1085
+ this.scraperQueue.add(async () => {
1086
+ await this.waitForStorage();
1087
+ if (this.isCacheEnabled) {
1088
+ await this.waitForCache();
1089
+ }
1090
+ if (this.config.enableNavigationHistory) {
1091
+ await this.waitForNavigationHistory();
1092
+ }
1093
+ await this.leadsFinder.parseExternalWebsite(url, method, body, {
1094
+ httpConfig: options,
1095
+ saveCache: this.saveCache.bind(this),
1096
+ saveUrl: this.saveUrl.bind(this),
1097
+ getCache: this.getCache.bind(this),
1098
+ hasUrlInCache: this.hasUrlInCache.bind(this),
1099
+ onEmailDiscovered: this.emailDiscoveredEvents,
1100
+ onEmails: this.emailLeadsEvents,
1101
+ queue: this.scraperQueue,
1102
+ depth: 1,
1103
+ allowCrossDomainTravel: true,
1104
+ emailMetadata
1105
+ }, forceRevisit, true);
1106
+ }).catch((err) => {
1107
+ if (this.config.debug)
1108
+ console.warn("[Crawler] execute2() task error:", err?.message);
1109
+ });
924
1110
  }
925
1111
  async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl, skipCache, emailMetadata) {
926
1112
  try {
@@ -948,6 +1134,14 @@ export class Crawler {
948
1134
  }
949
1135
  const requestStartTime = Date.now();
950
1136
  const response = cache && method === "GET" && !skipCache ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
1137
+ if (!response) {
1138
+ this.crawlStats.urlsFailed++;
1139
+ this.healthMetrics.recordRequest(Date.now() - requestStartTime, false);
1140
+ if (this.config.debug) {
1141
+ console.log(`[Crawler] Request failed for ${url}`);
1142
+ }
1143
+ return;
1144
+ }
951
1145
  if (!cache) {
952
1146
  const responseTime = Date.now() - requestStartTime;
953
1147
  this.calculateAutoThrottleDelay(domain, responseTime);
@@ -973,6 +1167,8 @@ export class Crawler {
973
1167
  }
974
1168
  }
975
1169
  this.crawlStats.urlsVisited++;
1170
+ const finalResponseTime = cache ? 0 : Date.now() - requestStartTime;
1171
+ this.healthMetrics.recordRequest(finalResponseTime, true);
976
1172
  if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
977
1173
  await this.triggerRedirectHandlers({
978
1174
  originalUrl: url,
@@ -992,7 +1188,7 @@ export class Crawler {
992
1188
  });
993
1189
  if (res.contentType && res.contentType.includes("/json")) {
994
1190
  if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
995
- this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue, emailMetadata);
1191
+ this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
996
1192
  }
997
1193
  for (let i = 0;i < this.jsonEvents.length; i++) {
998
1194
  const event = this.jsonEvents[i];
@@ -1007,7 +1203,7 @@ export class Crawler {
1007
1203
  if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
1008
1204
  return;
1009
1205
  if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
1010
- this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue, emailMetadata);
1206
+ this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.scraperQueue, emailMetadata);
1011
1207
  }
1012
1208
  const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
1013
1209
  document.URL = res.finalUrl;
@@ -1072,23 +1268,33 @@ export class Crawler {
1072
1268
  }
1073
1269
  }
1074
1270
  async waitForAll() {
1075
- if (this.pendingVisitCount === 0 && this.pendingExecutions.size === 0 && this.queue.size === 0 && this.queue.pending === 0 && !this.crawlStarted) {
1076
- await this.triggerFinishHandlers();
1077
- await this.destroy();
1078
- return;
1079
- }
1271
+ const MIN_DELAY = 50;
1272
+ const MAX_DELAY = 500;
1273
+ let currentDelay = MIN_DELAY;
1274
+ let consecutiveIdleChecks = 0;
1275
+ const REQUIRED_IDLE_CHECKS = 3;
1080
1276
  while (true) {
1081
- while (this.pendingVisitCount > 0) {
1082
- await new Promise((resolve) => setTimeout(resolve, 10));
1083
- }
1084
- if (this.pendingExecutions.size > 0) {
1085
- await Promise.allSettled([...this.pendingExecutions]);
1086
- }
1087
- if (this.queue.size > 0 || this.queue.pending > 0) {
1088
- await this.queue.onIdle();
1089
- }
1090
- if (this.pendingVisitCount === 0 && this.pendingExecutions.size === 0 && this.queue.size === 0 && this.queue.pending === 0) {
1091
- break;
1277
+ await this.queue.onIdle();
1278
+ await new Promise((resolve) => setTimeout(resolve, currentDelay));
1279
+ const scraperSize = this.scraperQueue.size;
1280
+ const scraperPending = this.scraperQueue.pending;
1281
+ if (scraperSize === 0 && scraperPending === 0) {
1282
+ const queueSize = this.queue.size;
1283
+ const queuePending = this.queue.pending;
1284
+ if (queueSize === 0 && queuePending === 0) {
1285
+ consecutiveIdleChecks++;
1286
+ if (consecutiveIdleChecks >= REQUIRED_IDLE_CHECKS) {
1287
+ break;
1288
+ }
1289
+ currentDelay = Math.max(MIN_DELAY, currentDelay / 2);
1290
+ } else {
1291
+ consecutiveIdleChecks = 0;
1292
+ currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
1293
+ }
1294
+ } else {
1295
+ consecutiveIdleChecks = 0;
1296
+ currentDelay = Math.min(MAX_DELAY, currentDelay * 1.5);
1297
+ await this.scraperQueue.onIdle();
1092
1298
  }
1093
1299
  }
1094
1300
  await this.triggerFinishHandlers();
@@ -1112,7 +1318,18 @@ export class Crawler {
1112
1318
  if (this.isDestroyed)
1113
1319
  return;
1114
1320
  this.isDestroyed = true;
1321
+ this.removeShutdownHandlers();
1322
+ if (this.cleanupInterval) {
1323
+ clearInterval(this.cleanupInterval);
1324
+ this.cleanupInterval = undefined;
1325
+ }
1326
+ if (this.checkpointInterval) {
1327
+ clearInterval(this.checkpointInterval);
1328
+ this.checkpointInterval = undefined;
1329
+ }
1330
+ this.memoryMonitor.destroy();
1115
1331
  this.queue.destroy();
1332
+ this.scraperQueue.destroy();
1116
1333
  this.config.destroyLimiters();
1117
1334
  this.events.length = 0;
1118
1335
  this.jsonEvents.length = 0;
@@ -1121,6 +1338,10 @@ export class Crawler {
1121
1338
  this.rawResponseEvents.length = 0;
1122
1339
  this.emailDiscoveredEvents.length = 0;
1123
1340
  this.emailLeadsEvents.length = 0;
1341
+ this.domainResponseTimes.clear();
1342
+ this.domainCurrentDelay.clear();
1343
+ this.urlDepthMap.clear();
1344
+ this.collectedData.clear();
1124
1345
  await this.close();
1125
1346
  resetGlobalAgentPool();
1126
1347
  }
@@ -1,40 +1,40 @@
1
- const _mod_qcm6x2 = require('./crawler.cjs');
2
- exports.Crawler = _mod_qcm6x2.Crawler;;
3
- const _mod_732kra = require('./crawler-options.cjs');
4
- exports.CrawlerOptions = _mod_732kra.CrawlerOptions;;
5
- const _mod_z5v0eg = require('./plugin/robots-txt.cjs');
6
- exports.RobotsTxt = _mod_z5v0eg.RobotsTxt;;
7
- const _mod_cfyi08 = require('./plugin/file-cacher.cjs');
8
- exports.FileCacher = _mod_cfyi08.FileCacher;;
9
- const _mod_noooi1 = require('./plugin/url-store.cjs');
10
- exports.UrlStore = _mod_noooi1.UrlStore;;
11
- const _mod_i5bk7z = require('./plugin/navigation-history.cjs');
12
- exports.NavigationHistory = _mod_i5bk7z.NavigationHistory;;
13
- const _mod_b7gkfk = require('./addon/oxylabs/index.cjs');
14
- exports.Oxylabs = _mod_b7gkfk.Oxylabs;;
15
- const _mod_38cltn = require('./addon/oxylabs/options.cjs');
16
- exports.OXYLABS_BROWSER_TYPES = _mod_38cltn.OXYLABS_BROWSER_TYPES;
17
- exports.OXYLABS_COMMON_LOCALES = _mod_38cltn.OXYLABS_COMMON_LOCALES;
18
- exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_38cltn.OXYLABS_COMMON_GEO_LOCATIONS;
19
- exports.OXYLABS_US_STATES = _mod_38cltn.OXYLABS_US_STATES;
20
- exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_38cltn.OXYLABS_EUROPEAN_COUNTRIES;
21
- exports.OXYLABS_ASIAN_COUNTRIES = _mod_38cltn.OXYLABS_ASIAN_COUNTRIES;
22
- exports.getRandomOxylabsBrowserType = _mod_38cltn.getRandomBrowserType;
23
- exports.getRandomOxylabsLocale = _mod_38cltn.getRandomLocale;
24
- exports.getRandomOxylabsGeoLocation = _mod_38cltn.getRandomGeoLocation;;
25
- const _mod_hg233z = require('./addon/decodo/index.cjs');
26
- exports.Decodo = _mod_hg233z.Decodo;;
27
- const _mod_5tt6b2 = require('./addon/decodo/options.cjs');
28
- exports.DECODO_DEVICE_TYPES = _mod_5tt6b2.DECODO_DEVICE_TYPES;
29
- exports.DECODO_HEADLESS_MODES = _mod_5tt6b2.DECODO_HEADLESS_MODES;
30
- exports.DECODO_COMMON_LOCALES = _mod_5tt6b2.DECODO_COMMON_LOCALES;
31
- exports.DECODO_COMMON_COUNTRIES = _mod_5tt6b2.DECODO_COMMON_COUNTRIES;
32
- exports.DECODO_EUROPEAN_COUNTRIES = _mod_5tt6b2.DECODO_EUROPEAN_COUNTRIES;
33
- exports.DECODO_ASIAN_COUNTRIES = _mod_5tt6b2.DECODO_ASIAN_COUNTRIES;
34
- exports.DECODO_US_STATES = _mod_5tt6b2.DECODO_US_STATES;
35
- exports.DECODO_COMMON_CITIES = _mod_5tt6b2.DECODO_COMMON_CITIES;
36
- exports.getRandomDecodoDeviceType = _mod_5tt6b2.getRandomDeviceType;
37
- exports.getRandomDecodoLocale = _mod_5tt6b2.getRandomLocale;
38
- exports.getRandomDecodoCountry = _mod_5tt6b2.getRandomCountry;
39
- exports.getRandomDecodoCity = _mod_5tt6b2.getRandomCity;
40
- exports.generateDecodoSessionId = _mod_5tt6b2.generateSessionId;;
1
+ const _mod_7qsaj1 = require('./crawler.cjs');
2
+ exports.Crawler = _mod_7qsaj1.Crawler;;
3
+ const _mod_5rlu0e = require('./crawler-options.cjs');
4
+ exports.CrawlerOptions = _mod_5rlu0e.CrawlerOptions;;
5
+ const _mod_skia2o = require('./plugin/robots-txt.cjs');
6
+ exports.RobotsTxt = _mod_skia2o.RobotsTxt;;
7
+ const _mod_w2aaxb = require('./plugin/file-cacher.cjs');
8
+ exports.FileCacher = _mod_w2aaxb.FileCacher;;
9
+ const _mod_f9de8f = require('./plugin/url-store.cjs');
10
+ exports.UrlStore = _mod_f9de8f.UrlStore;;
11
+ const _mod_wqtwlm = require('./plugin/navigation-history.cjs');
12
+ exports.NavigationHistory = _mod_wqtwlm.NavigationHistory;;
13
+ const _mod_hj9vi3 = require('./addon/oxylabs/index.cjs');
14
+ exports.Oxylabs = _mod_hj9vi3.Oxylabs;;
15
+ const _mod_7kzws5 = require('./addon/oxylabs/options.cjs');
16
+ exports.OXYLABS_BROWSER_TYPES = _mod_7kzws5.OXYLABS_BROWSER_TYPES;
17
+ exports.OXYLABS_COMMON_LOCALES = _mod_7kzws5.OXYLABS_COMMON_LOCALES;
18
+ exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_7kzws5.OXYLABS_COMMON_GEO_LOCATIONS;
19
+ exports.OXYLABS_US_STATES = _mod_7kzws5.OXYLABS_US_STATES;
20
+ exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_7kzws5.OXYLABS_EUROPEAN_COUNTRIES;
21
+ exports.OXYLABS_ASIAN_COUNTRIES = _mod_7kzws5.OXYLABS_ASIAN_COUNTRIES;
22
+ exports.getRandomOxylabsBrowserType = _mod_7kzws5.getRandomBrowserType;
23
+ exports.getRandomOxylabsLocale = _mod_7kzws5.getRandomLocale;
24
+ exports.getRandomOxylabsGeoLocation = _mod_7kzws5.getRandomGeoLocation;;
25
+ const _mod_gdeyg9 = require('./addon/decodo/index.cjs');
26
+ exports.Decodo = _mod_gdeyg9.Decodo;;
27
+ const _mod_jbu0zi = require('./addon/decodo/options.cjs');
28
+ exports.DECODO_DEVICE_TYPES = _mod_jbu0zi.DECODO_DEVICE_TYPES;
29
+ exports.DECODO_HEADLESS_MODES = _mod_jbu0zi.DECODO_HEADLESS_MODES;
30
+ exports.DECODO_COMMON_LOCALES = _mod_jbu0zi.DECODO_COMMON_LOCALES;
31
+ exports.DECODO_COMMON_COUNTRIES = _mod_jbu0zi.DECODO_COMMON_COUNTRIES;
32
+ exports.DECODO_EUROPEAN_COUNTRIES = _mod_jbu0zi.DECODO_EUROPEAN_COUNTRIES;
33
+ exports.DECODO_ASIAN_COUNTRIES = _mod_jbu0zi.DECODO_ASIAN_COUNTRIES;
34
+ exports.DECODO_US_STATES = _mod_jbu0zi.DECODO_US_STATES;
35
+ exports.DECODO_COMMON_CITIES = _mod_jbu0zi.DECODO_COMMON_CITIES;
36
+ exports.getRandomDecodoDeviceType = _mod_jbu0zi.getRandomDeviceType;
37
+ exports.getRandomDecodoLocale = _mod_jbu0zi.getRandomLocale;
38
+ exports.getRandomDecodoCountry = _mod_jbu0zi.getRandomCountry;
39
+ exports.getRandomDecodoCity = _mod_jbu0zi.getRandomCity;
40
+ exports.generateDecodoSessionId = _mod_jbu0zi.generateSessionId;;
@@ -0,0 +1 @@
1
/**
 * Insertion-ordered buffer with a size cap.
 *
 * When `push` is called while the buffer already holds `maxSize` items, the
 * oldest `evictionCount` items are removed in one batch (and reported to the
 * optional `onEviction` callback) before the new item is appended.
 */
class CappedArray {
  items = [];
  maxSize;
  evictionCount;
  onEviction;

  /**
   * @param {object} [options]
   * @param {number} [options.maxSize=100000] Item count at which eviction starts.
   * @param {number} [options.evictionRatio=0.1] Fraction of `maxSize` removed per
   *   eviction; the resulting batch size is never smaller than 1.
   * @param {(evicted: any[], remaining: number) => void} [options.onEviction]
   *   Called after each non-empty eviction with the removed items and the
   *   number of items left.
   */
  constructor(options = {}) {
    const requestedSize = options.maxSize ?? 1e5;
    const requestedRatio = options.evictionRatio ?? 0.1;
    // A NaN limit makes `length >= maxSize` always false, and a NaN batch size
    // makes `splice(0, NaN)` remove nothing; either way the cap would silently
    // stop working, so unusable values fall back to the defaults.
    this.maxSize = Number.isNaN(Number(requestedSize)) ? 1e5 : requestedSize;
    const ratio = Number.isNaN(Number(requestedRatio)) ? 0.1 : requestedRatio;
    this.evictionCount = Math.max(1, Math.ceil(this.maxSize * ratio));
    this.onEviction = options.onEviction;
  }

  /**
   * Appends one item, evicting the oldest batch first when at capacity.
   * @returns {number} The buffer length after the append.
   */
  push(item) {
    if (this.items.length >= this.maxSize) {
      this.evict();
    }
    return this.items.push(item);
  }

  /**
   * Appends every item of an iterable through `push`, so eviction can run
   * several times during one call.
   * @returns {number} The buffer length after the last append.
   */
  pushMany(items) {
    for (const item of items) {
      this.push(item);
    }
    return this.items.length;
  }

  /** Returns the item at `index`, or `undefined` when there is none. */
  get(index) {
    return this.items[index];
  }

  /** Current number of stored items. */
  get length() {
    return this.items.length;
  }

  /** Removes every item without invoking `onEviction`. */
  clear() {
    this.items.length = 0;
  }

  /** Returns a shallow copy of the stored items, oldest first. */
  toArray() {
    return [...this.items];
  }

  [Symbol.iterator]() {
    return this.items[Symbol.iterator]();
  }

  forEach(callback) {
    this.items.forEach(callback);
  }

  map(callback) {
    return this.items.map(callback);
  }

  filter(predicate) {
    return this.items.filter(predicate);
  }

  /** Drops the oldest `evictionCount` items and notifies `onEviction`. */
  evict() {
    const evicted = this.items.splice(0, this.evictionCount);
    if (this.onEviction && evicted.length > 0) {
      this.onEviction(evicted, this.items.length);
    }
  }

  /** The configured size limit. */
  get maxCapacity() {
    return this.maxSize;
  }

  /** True when the next `push` will trigger an eviction. */
  get isAtCapacity() {
    return this.items.length >= this.maxSize;
  }
}

exports.CappedArray = CappedArray;
exports.default = CappedArray;
module.exports = Object.assign(CappedArray, exports);
@@ -0,0 +1 @@
1
/**
 * Insertion-ordered buffer with a size cap.
 *
 * When `push` is called while the buffer already holds `maxSize` items, the
 * oldest `evictionCount` items are removed in one batch (and reported to the
 * optional `onEviction` callback) before the new item is appended.
 */
class CappedArray {
  items = [];
  maxSize;
  evictionCount;
  onEviction;

  /**
   * @param {object} [options]
   * @param {number} [options.maxSize=100000] Item count at which eviction starts.
   * @param {number} [options.evictionRatio=0.1] Fraction of `maxSize` removed per
   *   eviction; the resulting batch size is never smaller than 1.
   * @param {(evicted: any[], remaining: number) => void} [options.onEviction]
   *   Called after each non-empty eviction with the removed items and the
   *   number of items left.
   */
  constructor(options = {}) {
    const requestedSize = options.maxSize ?? 1e5;
    const requestedRatio = options.evictionRatio ?? 0.1;
    // A NaN limit makes `length >= maxSize` always false, and a NaN batch size
    // makes `splice(0, NaN)` remove nothing; either way the cap would silently
    // stop working, so unusable values fall back to the defaults.
    this.maxSize = Number.isNaN(Number(requestedSize)) ? 1e5 : requestedSize;
    const ratio = Number.isNaN(Number(requestedRatio)) ? 0.1 : requestedRatio;
    this.evictionCount = Math.max(1, Math.ceil(this.maxSize * ratio));
    this.onEviction = options.onEviction;
  }

  /**
   * Appends one item, evicting the oldest batch first when at capacity.
   * @returns {number} The buffer length after the append.
   */
  push(item) {
    if (this.items.length >= this.maxSize) {
      this.evict();
    }
    return this.items.push(item);
  }

  /**
   * Appends every item of an iterable through `push`, so eviction can run
   * several times during one call.
   * @returns {number} The buffer length after the last append.
   */
  pushMany(items) {
    for (const item of items) {
      this.push(item);
    }
    return this.items.length;
  }

  /** Returns the item at `index`, or `undefined` when there is none. */
  get(index) {
    return this.items[index];
  }

  /** Current number of stored items. */
  get length() {
    return this.items.length;
  }

  /** Removes every item without invoking `onEviction`. */
  clear() {
    this.items.length = 0;
  }

  /** Returns a shallow copy of the stored items, oldest first. */
  toArray() {
    return [...this.items];
  }

  [Symbol.iterator]() {
    return this.items[Symbol.iterator]();
  }

  forEach(callback) {
    this.items.forEach(callback);
  }

  map(callback) {
    return this.items.map(callback);
  }

  filter(predicate) {
    return this.items.filter(predicate);
  }

  /** Drops the oldest `evictionCount` items and notifies `onEviction`. */
  evict() {
    const evicted = this.items.splice(0, this.evictionCount);
    if (this.onEviction && evicted.length > 0) {
      this.onEviction(evicted, this.items.length);
    }
  }

  /** The configured size limit. */
  get maxCapacity() {
    return this.maxSize;
  }

  /** True when the next `push` will trigger an eviction. */
  get isAtCapacity() {
    return this.items.length >= this.maxSize;
  }
}

export { CappedArray as default, CappedArray };
@@ -0,0 +1 @@
1
/**
 * Map wrapper with a size cap.
 *
 * Inserting a new key while the map already holds `maxSize` entries first
 * deletes the `evictionCount` entries that come first in iteration order.
 * `set` on an existing key and `getAndTouch` both move the entry to the end
 * of that order, so recently written/touched keys are evicted last.
 */
class CappedMap {
  map;
  maxSize;
  evictionCount;
  evictionRatio;

  /**
   * @param {object} [options]
   * @param {number} [options.maxSize=10000] Entry count at which eviction starts.
   * @param {number} [options.evictionRatio=0.1] Fraction of `maxSize` removed per
   *   eviction; the resulting batch size is never smaller than 1.
   */
  constructor(options = {}) {
    const requestedSize = options.maxSize ?? 1e4;
    const requestedRatio = options.evictionRatio ?? 0.1;
    // A NaN limit makes `size >= maxSize` always false, and a NaN batch size
    // makes the eviction loop exit immediately; either way the cap would
    // silently stop working, so unusable values fall back to the defaults.
    this.maxSize = Number.isNaN(Number(requestedSize)) ? 1e4 : requestedSize;
    this.evictionRatio = Number.isNaN(Number(requestedRatio)) ? 0.1 : requestedRatio;
    this.evictionCount = Math.max(1, Math.ceil(this.maxSize * this.evictionRatio));
    this.map = new Map();
  }

  /**
   * Stores `value` under `key` and moves the key to the end of the order.
   * Evicts the oldest batch first when the key is new and the map is full.
   * @returns {this} The instance, for chaining.
   */
  set(key, value) {
    if (this.map.has(key)) {
      // Delete first so the re-insert lands at the end of the iteration order.
      this.map.delete(key);
    } else if (this.map.size >= this.maxSize) {
      this.evict();
    }
    this.map.set(key, value);
    return this;
  }

  /** Returns the value for `key` without changing its position. */
  get(key) {
    return this.map.get(key);
  }

  /**
   * Returns the value for `key` and moves the entry to the end of the order.
   * Presence is checked with `has`, so an entry whose stored value is
   * `undefined` is refreshed as well.
   */
  getAndTouch(key) {
    if (!this.map.has(key)) {
      return undefined;
    }
    const value = this.map.get(key);
    this.map.delete(key);
    this.map.set(key, value);
    return value;
  }

  has(key) {
    return this.map.has(key);
  }

  delete(key) {
    return this.map.delete(key);
  }

  clear() {
    this.map.clear();
  }

  /** Current number of entries. */
  get size() {
    return this.map.size;
  }

  entries() {
    return this.map.entries();
  }

  keys() {
    return this.map.keys();
  }

  values() {
    return this.map.values();
  }

  forEach(callback) {
    this.map.forEach(callback);
  }

  /** Deletes up to `evictionCount` entries from the front of the order. */
  evict() {
    const oldestKeys = this.map.keys();
    for (let removed = 0; removed < this.evictionCount; removed++) {
      const step = oldestKeys.next();
      if (step.done) {
        break;
      }
      this.map.delete(step.value);
    }
  }

  /** Returns a shallow copy of the entries as a plain `Map`. */
  toMap() {
    return new Map(this.map);
  }
}

exports.CappedMap = CappedMap;
exports.default = CappedMap;
module.exports = Object.assign(CappedMap, exports);
@@ -0,0 +1 @@
1
/**
 * Map wrapper with a size cap.
 *
 * Inserting a new key while the map already holds `maxSize` entries first
 * deletes the `evictionCount` entries that come first in iteration order.
 * `set` on an existing key and `getAndTouch` both move the entry to the end
 * of that order, so recently written/touched keys are evicted last.
 */
class CappedMap {
  map;
  maxSize;
  evictionCount;
  evictionRatio;

  /**
   * @param {object} [options]
   * @param {number} [options.maxSize=10000] Entry count at which eviction starts.
   * @param {number} [options.evictionRatio=0.1] Fraction of `maxSize` removed per
   *   eviction; the resulting batch size is never smaller than 1.
   */
  constructor(options = {}) {
    const requestedSize = options.maxSize ?? 1e4;
    const requestedRatio = options.evictionRatio ?? 0.1;
    // A NaN limit makes `size >= maxSize` always false, and a NaN batch size
    // makes the eviction loop exit immediately; either way the cap would
    // silently stop working, so unusable values fall back to the defaults.
    this.maxSize = Number.isNaN(Number(requestedSize)) ? 1e4 : requestedSize;
    this.evictionRatio = Number.isNaN(Number(requestedRatio)) ? 0.1 : requestedRatio;
    this.evictionCount = Math.max(1, Math.ceil(this.maxSize * this.evictionRatio));
    this.map = new Map();
  }

  /**
   * Stores `value` under `key` and moves the key to the end of the order.
   * Evicts the oldest batch first when the key is new and the map is full.
   * @returns {this} The instance, for chaining.
   */
  set(key, value) {
    if (this.map.has(key)) {
      // Delete first so the re-insert lands at the end of the iteration order.
      this.map.delete(key);
    } else if (this.map.size >= this.maxSize) {
      this.evict();
    }
    this.map.set(key, value);
    return this;
  }

  /** Returns the value for `key` without changing its position. */
  get(key) {
    return this.map.get(key);
  }

  /**
   * Returns the value for `key` and moves the entry to the end of the order.
   * Presence is checked with `has`, so an entry whose stored value is
   * `undefined` is refreshed as well.
   */
  getAndTouch(key) {
    if (!this.map.has(key)) {
      return undefined;
    }
    const value = this.map.get(key);
    this.map.delete(key);
    this.map.set(key, value);
    return value;
  }

  has(key) {
    return this.map.has(key);
  }

  delete(key) {
    return this.map.delete(key);
  }

  clear() {
    this.map.clear();
  }

  /** Current number of entries. */
  get size() {
    return this.map.size;
  }

  entries() {
    return this.map.entries();
  }

  keys() {
    return this.map.keys();
  }

  values() {
    return this.map.values();
  }

  forEach(callback) {
    this.map.forEach(callback);
  }

  /** Deletes up to `evictionCount` entries from the front of the order. */
  evict() {
    const oldestKeys = this.map.keys();
    for (let removed = 0; removed < this.evictionCount; removed++) {
      const step = oldestKeys.next();
      if (step.done) {
        break;
      }
      this.map.delete(step.value);
    }
  }

  /** Returns a shallow copy of the entries as a plain `Map`. */
  toMap() {
    return new Map(this.map);
  }
}

export { CappedMap as default, CappedMap };