rezo 1.0.43 → 1.0.44

This diff shows the published contents of two package versions as they appear in the public registry, and is provided for informational purposes only.
Files changed (53)
  1. package/dist/adapters/index.cjs +6 -6
  2. package/dist/cache/index.cjs +9 -15
  3. package/dist/cache/index.js +0 -3
  4. package/dist/crawler/addon/decodo/index.cjs +1 -0
  5. package/dist/crawler/addon/decodo/index.js +1 -0
  6. package/dist/crawler/crawler-options.cjs +1 -0
  7. package/dist/crawler/crawler-options.js +1 -0
  8. package/dist/{plugin → crawler}/crawler.cjs +392 -32
  9. package/dist/{plugin → crawler}/crawler.js +392 -32
  10. package/dist/crawler/index.cjs +40 -0
  11. package/dist/{plugin → crawler}/index.js +4 -2
  12. package/dist/crawler/plugin/file-cacher.cjs +19 -0
  13. package/dist/crawler/plugin/file-cacher.js +19 -0
  14. package/dist/crawler/plugin/index.cjs +1 -0
  15. package/dist/crawler/plugin/index.js +1 -0
  16. package/dist/crawler/plugin/navigation-history.cjs +43 -0
  17. package/dist/crawler/plugin/navigation-history.js +43 -0
  18. package/dist/crawler/plugin/robots-txt.cjs +2 -0
  19. package/dist/crawler/plugin/robots-txt.js +2 -0
  20. package/dist/crawler/plugin/url-store.cjs +18 -0
  21. package/dist/crawler/plugin/url-store.js +18 -0
  22. package/dist/crawler.d.ts +315 -172
  23. package/dist/entries/crawler.cjs +5 -5
  24. package/dist/entries/crawler.js +2 -2
  25. package/dist/index.cjs +27 -27
  26. package/dist/internal/agents/index.cjs +10 -10
  27. package/dist/proxy/index.cjs +4 -4
  28. package/dist/queue/index.cjs +8 -8
  29. package/dist/responses/universal/index.cjs +11 -11
  30. package/package.json +2 -6
  31. package/dist/cache/file-cacher.cjs +0 -270
  32. package/dist/cache/file-cacher.js +0 -267
  33. package/dist/cache/navigation-history.cjs +0 -298
  34. package/dist/cache/navigation-history.js +0 -296
  35. package/dist/cache/url-store.cjs +0 -294
  36. package/dist/cache/url-store.js +0 -291
  37. package/dist/plugin/addon/decodo/index.cjs +0 -1
  38. package/dist/plugin/addon/decodo/index.js +0 -1
  39. package/dist/plugin/crawler-options.cjs +0 -1
  40. package/dist/plugin/crawler-options.js +0 -1
  41. package/dist/plugin/index.cjs +0 -36
  42. package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
  43. package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
  44. package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
  45. package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
  46. package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
  47. package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
  48. package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
  49. package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
  50. package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
  51. package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
  52. package/dist/{plugin → crawler}/scraper.cjs +0 -0
  53. package/dist/{plugin → crawler}/scraper.js +0 -0
package/dist/{plugin → crawler}/crawler.cjs
@@ -1,7 +1,8 @@
  const fs = require("node:fs");
- const { FileCacher } = require('../cache/file-cacher.cjs');
- const { UrlStore } = require('../cache/url-store.cjs');
- const { NavigationHistory } = require('../cache/navigation-history.cjs');
+ const { FileCacher } = require('./plugin/file-cacher.cjs');
+ const { UrlStore } = require('./plugin/url-store.cjs');
+ const { NavigationHistory } = require('./plugin/navigation-history.cjs');
+ const { RobotsTxt } = require('./plugin/robots-txt.cjs');
  const { parseHTML } = require("linkedom");
  const path = require("node:path");
  const { Rezo } = require('../core/rezo.cjs');
@@ -54,6 +55,23 @@ class Crawler {
  navigationHistoryInitPromise = null;
  adapterExecutor = null;
  adapterType;
+ pendingExecutions = new Set;
+ robotsTxt;
+ domainResponseTimes = new Map;
+ domainCurrentDelay = new Map;
+ crawlStats = {
+   urlsVisited: 0,
+   urlsQueued: 0,
+   urlsFailed: 0,
+   startTime: 0,
+   currentDepth: 0
+ };
+ urlDepthMap = new Map;
+ startHandlers = [];
+ finishHandlers = [];
+ redirectHandlers = [];
+ collectedData = [];
+ crawlStarted = false;
  constructor(crawlerOptions, http = new Rezo) {
    this.http = http;
    this.queue = new RezoQueue({
@@ -71,7 +89,6 @@ class Crawler {
  fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
  FileCacher.create({
    cacheDir: dbUrl,
-   softDelete: false,
    ttl: cacheTTL,
    encryptNamespace: true
  }).then((storage) => {
@@ -110,6 +127,14 @@ class Crawler {
    }
    this.initializeAdapter();
    this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
+   this.robotsTxt = new RobotsTxt({
+     userAgent: this.config.userAgent || "RezoBot",
+     cacheTTL: 24 * 60 * 60 * 1000
+   });
+   this.crawlStats.startTime = Date.now();
+   if (this.config.baseUrl) {
+     this.urlDepthMap.set(this.config.baseUrl, 0);
+   }
  }
  async initializeAdapter() {
    try {
@@ -148,6 +173,8 @@ class Crawler {
  async waitForNavigationHistory() {
    if (!this.config.enableNavigationHistory)
      return;
+   if (this.isNavigationHistoryReady && this.isSessionReady)
+     return;
    if (this.navigationHistoryInitPromise) {
      await this.navigationHistoryInitPromise;
    }
@@ -361,6 +388,18 @@ class Crawler {
    this.emailLeadsEvents.push(handler);
    return this;
  }
+ onStart(handler) {
+   this.startHandlers.push(handler);
+   return this;
+ }
+ onFinish(handler) {
+   this.finishHandlers.push(handler);
+   return this;
+ }
+ onRedirect(handler) {
+   this.redirectHandlers.push(handler);
+   return this;
+ }
  onRawData(handler) {
    this.rawResponseEvents.push({
      handler: "_onRawResponse",
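
The three hooks above are chainable registrations; their payloads can be read off the trigger* implementations added later in this diff (onFinish receives the crawlStats object, onRedirect an event with originalUrl, finalUrl, redirectCount, and statusCode). A minimal usage sketch, assuming the Crawler class is reachable through the package's crawler entry (the exact import path is not shown in this diff):

    // Hypothetical import path; payload shapes inferred from
    // triggerFinishHandlers and triggerRedirectHandlers further down.
    const { Crawler } = require("rezo/crawler");
    const crawler = new Crawler({ baseUrl: "https://example.com" });
    crawler
      .onStart(() => console.log("crawl started"))
      .onRedirect((e) => console.log(`${e.originalUrl} -> ${e.finalUrl} (${e.statusCode})`))
      .onFinish((stats) => console.log(`visited ${stats.urlsVisited}, failed ${stats.urlsFailed}`));
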
@@ -435,20 +474,25 @@ class Crawler {
    this.queue.add(() => handler(document.body));
  }
  _onAttribute(selection, attribute, handler, document) {
-   selection = typeof attribute === "function" ? selection : null;
-   attribute = typeof attribute === "function" ? selection : attribute;
-   handler = typeof attribute === "function" ? attribute : handler;
-   selection = selection || `[${attribute}]`;
-   const elements = document.querySelectorAll(selection);
+   const isSimpleForm = typeof attribute === "function";
+   const actualAttribute = isSimpleForm ? selection : attribute;
+   const actualHandler = isSimpleForm ? attribute : handler;
+   const actualSelection = isSimpleForm ? `[${selection}]` : selection || `[${attribute}]`;
+   const elements = document.querySelectorAll(actualSelection);
    for (let i = 0;i < elements.length; i++) {
-     if (elements[i].hasAttribute(attribute))
-       this.queue.add(() => handler(elements[i].getAttribute(attribute)));
+     const el = elements[i];
+     if (el.hasAttribute(actualAttribute)) {
+       const value = el.getAttribute(actualAttribute);
+       this.queue.add(() => actualHandler.call(el, value, actualAttribute));
+     }
    }
  }
  _onText(selection, handler, document) {
    const elements = document.querySelectorAll(selection);
    for (let i = 0;i < elements.length; i++) {
-     this.queue.add(() => handler(elements[i].textContent));
+     const el = elements[i];
+     const text = el.textContent;
+     this.queue.add(() => handler.call(el, text));
    }
  }
  _onSelection(selection, handler, document) {
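
Behavioral note on the rewrite above: handlers are now invoked with `this` bound to the matched element (actualHandler.call(el, ...)), attribute handlers receive the attribute name as a second argument, and the two-argument simple form is detected explicitly instead of through the old self-overwriting assignments, which re-tested `attribute` after it had already been reassigned to a string and so could never bind `handler` correctly. A sketch of what the binding enables, assuming public onAttribute/onText registrations mirror these internals; handlers must be plain functions, not arrows, to see `this`:

    // Hypothetical registration names; `this` is the matched element,
    // per the handler.call(el, ...) invocations above.
    crawler.onAttribute("img", "src", function (src, attrName) {
      console.log(this.tagName, attrName, src); // e.g. "IMG src /logo.png"
    });
    crawler.onText("h1", function (text) {
      console.log(this.outerHTML.slice(0, 40), text);
    });
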
@@ -466,8 +510,11 @@ class Crawler {
  _onHref(handler, document) {
    const elements = document.querySelectorAll("a, link");
    for (let i = 0;i < elements.length; i++) {
-     if (elements[i].hasAttribute("href"))
-       this.queue.add(() => handler(new URL(elements[i].getAttribute("href"), document.URL).href));
+     const el = elements[i];
+     if (el.hasAttribute("href")) {
+       const href = new URL(el.getAttribute("href"), document.URL).href;
+       this.queue.add(() => handler.call(el, href));
+     }
    }
  }
  _onAnchor(selection, handler, document) {
@@ -501,6 +548,233 @@ class Crawler {
  _onResponse(handler, response) {
    this.queue.add(() => handler(response));
  }
+ calculateAutoThrottleDelay(domain, responseTime) {
+   if (!this.config.autoThrottle)
+     return 0;
+   let times = this.domainResponseTimes.get(domain) || [];
+   times.push(responseTime);
+   if (times.length > 10) {
+     times = times.slice(-10);
+   }
+   this.domainResponseTimes.set(domain, times);
+   const avgResponseTime = times.reduce((a, b) => a + b, 0) / times.length;
+   const targetDelay = this.config.autoThrottleTargetDelay;
+   const loadFactor = avgResponseTime / 200;
+   let newDelay = Math.round(targetDelay * loadFactor);
+   newDelay = Math.max(this.config.autoThrottleMinDelay, newDelay);
+   newDelay = Math.min(this.config.autoThrottleMaxDelay, newDelay);
+   this.domainCurrentDelay.set(domain, newDelay);
+   if (this.config.debug) {
+     console.log(`[AutoThrottle] ${domain}: avgRT=${avgResponseTime.toFixed(0)}ms, delay=${newDelay}ms`);
+   }
+   return newDelay;
+ }
+ getAutoThrottleDelay(domain) {
+   if (!this.config.autoThrottle)
+     return 0;
+   return this.domainCurrentDelay.get(domain) || this.config.autoThrottleMinDelay;
+ }
+ async handle429Response(url, response) {
+   let retryAfter = 0;
+   const retryAfterHeader = response?.headers?.["retry-after"] || response?.headers?.get?.("retry-after");
+   if (retryAfterHeader) {
+     const parsed = parseInt(retryAfterHeader, 10);
+     if (!isNaN(parsed)) {
+       retryAfter = parsed * 1000;
+     } else {
+       const date = new Date(retryAfterHeader);
+       if (!isNaN(date.getTime())) {
+         retryAfter = date.getTime() - Date.now();
+       }
+     }
+   }
+   if (retryAfter <= 0) {
+     retryAfter = 60000;
+   }
+   const maxWait = this.config.maxWaitOn429;
+   const alwaysWait = this.config.alwaysWaitOn429;
+   if (retryAfter > maxWait && !alwaysWait) {
+     const waitMinutes = Math.round(retryAfter / 60000);
+     const error = new Error(`Rate limited: Server requested wait time of ${waitMinutes} minutes, which exceeds maxWaitOn429 (${Math.round(maxWait / 60000)} minutes). Set alwaysWaitOn429: true to wait regardless.`);
+     error.code = "REZ_RATE_LIMIT_EXCEEDED";
+     error.url = url;
+     error.status = 429;
+     throw error;
+   }
+   if (retryAfter > maxWait && alwaysWait) {
+     const waitMinutes = Math.round(retryAfter / 60000);
+     console.warn(`[Crawler] WARNING: Rate limited on ${url}. Server requested ${waitMinutes} minute wait. Waiting because alwaysWaitOn429 is enabled.`);
+   }
+   if (this.config.debug) {
+     console.log(`[Crawler] 429 Rate Limited: waiting ${Math.round(retryAfter / 1000)}s before retry`);
+   }
+   return { shouldRetry: true, waitTime: retryAfter };
+ }
+ async checkCrawlLimits(url, parentUrl) {
+   if (this.config.maxUrls > 0 && this.crawlStats.urlsVisited >= this.config.maxUrls) {
+     return { allowed: false, reason: `maxUrls limit reached (${this.config.maxUrls})` };
+   }
+   if (this.config.maxDepth > 0) {
+     const parentDepth = parentUrl ? this.urlDepthMap.get(parentUrl) ?? 0 : 0;
+     const urlDepth = this.urlDepthMap.get(url) ?? parentDepth + 1;
+     if (urlDepth > this.config.maxDepth) {
+       return { allowed: false, reason: `maxDepth limit reached (depth ${urlDepth} > ${this.config.maxDepth})` };
+     }
+     if (!this.urlDepthMap.has(url)) {
+       this.urlDepthMap.set(url, urlDepth);
+       this.crawlStats.currentDepth = Math.max(this.crawlStats.currentDepth, urlDepth);
+     }
+   }
+   if (this.config.respectRobotsTxt) {
+     try {
+       if (!this.robotsTxt.isCached(url)) {
+         await this.robotsTxt.fetch(url, async (robotsUrl) => {
+           const response = await this.http.get(robotsUrl, { timeout: 1e4 });
+           return { status: response.status, data: response.data };
+         });
+       }
+       const allowed = this.robotsTxt.isAllowed(url);
+       if (!allowed) {
+         return { allowed: false, reason: "Blocked by robots.txt" };
+       }
+     } catch (error) {
+       if (this.config.debug) {
+         console.warn(`[Crawler] Failed to check robots.txt for ${url}:`, error);
+       }
+     }
+   }
+   return { allowed: true };
+ }
+ shouldFollowLink(element) {
+   if (this.config.followNofollow) {
+     return true;
+   }
+   const rel = element.getAttribute("rel");
+   if (rel && rel.toLowerCase().includes("nofollow")) {
+     return false;
+   }
+   return true;
+ }
+ checkResponseSize(contentLength) {
+   if (this.config.maxResponseSize > 0 && contentLength > this.config.maxResponseSize) {
+     return {
+       allowed: false,
+       reason: `Response size (${contentLength} bytes) exceeds maxResponseSize (${this.config.maxResponseSize} bytes)`
+     };
+   }
+   return { allowed: true };
+ }
+ collect(data) {
+   this.collectedData.push(data);
+   return this;
+ }
+ getCollectedData() {
+   return [...this.collectedData];
+ }
+ clearCollectedData() {
+   this.collectedData = [];
+   return this;
+ }
+ async exportData(filePath, format = "json") {
+   const data = this.collectedData;
+   if (data.length === 0) {
+     if (this.config.debug) {
+       console.warn("[Crawler] No data to export");
+     }
+     return;
+   }
+   let content;
+   switch (format) {
+     case "json":
+       content = JSON.stringify(data, null, 2);
+       break;
+     case "jsonl":
+       content = data.map((item) => JSON.stringify(item)).join("\n");
+       break;
+     case "csv":
+       const keys = new Set;
+       data.forEach((item) => {
+         if (typeof item === "object" && item !== null) {
+           Object.keys(item).forEach((key) => keys.add(key));
+         }
+       });
+       const headers = Array.from(keys);
+       const escapeCSV = (val) => {
+         if (val === null || val === undefined)
+           return "";
+         const str = String(val);
+         if (str.includes(",") || str.includes('"') || str.includes("\n")) {
+           return `"${str.replace(/"/g, '""')}"`;
+         }
+         return str;
+       };
+       const rows = [
+         headers.join(","),
+         ...data.map((item) => {
+           if (typeof item !== "object" || item === null) {
+             return escapeCSV(item);
+           }
+           return headers.map((key) => escapeCSV(item[key])).join(",");
+         })
+       ];
+       content = rows.join("\n");
+       break;
+     default:
+       throw new Error(`Unsupported export format: ${format}`);
+   }
+   const dir = path.dirname(filePath);
+   if (!fs.existsSync(dir)) {
+     fs.mkdirSync(dir, { recursive: true });
+   }
+   fs.writeFileSync(filePath, content, "utf-8");
+   if (this.config.debug) {
+     console.log(`[Crawler] Exported ${data.length} items to ${filePath} (${format})`);
+   }
+ }
+ getStats() {
+   return { ...this.crawlStats };
+ }
+ async triggerStartHandlers() {
+   if (this.crawlStarted)
+     return;
+   this.crawlStarted = true;
+   this.crawlStats.startTime = Date.now();
+   for (const handler of this.startHandlers) {
+     try {
+       await handler();
+     } catch (error) {
+       if (this.config.debug) {
+         console.error("[Crawler] onStart handler error:", error);
+       }
+     }
+   }
+ }
+ async triggerFinishHandlers() {
+   this.crawlStats.endTime = Date.now();
+   for (const handler of this.finishHandlers) {
+     try {
+       await handler(this.crawlStats);
+     } catch (error) {
+       if (this.config.debug) {
+         console.error("[Crawler] onFinish handler error:", error);
+       }
+     }
+   }
+ }
+ async triggerRedirectHandlers(event) {
+   for (const handler of this.redirectHandlers) {
+     try {
+       await handler(event);
+     } catch (error) {
+       if (this.config.debug) {
+         console.error("[Crawler] onRedirect handler error:", error);
+       }
+     }
+   }
+ }
  buildUrl(url, params) {
    if (params) {
      const u = new URL(url, this.config.baseUrl);
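
The block above introduces a set of config-driven behaviors: per-domain auto-throttling (the next delay is autoThrottleTargetDelay * avgResponseTime / 200, averaged over the last 10 responses and clamped to autoThrottleMinDelay/autoThrottleMaxDelay), maxUrls and maxDepth limits tracked through urlDepthMap, robots.txt enforcement with a 24-hour cache, rel="nofollow" handling, and a maxResponseSize cutoff. A hedged configuration sketch; the option names are taken from the config reads above, but their defaults are not visible in this diff:

    // Assumed option names (from this.config reads above); defaults unknown.
    const crawler = new Crawler({
      baseUrl: "https://example.com",   // seeded at depth 0 in urlDepthMap
      autoThrottle: true,
      autoThrottleTargetDelay: 1000,    // ms, scaled by avgResponseTime / 200
      autoThrottleMinDelay: 250,
      autoThrottleMaxDelay: 10000,
      maxUrls: 500,                     // skip once 500 URLs have been visited
      maxDepth: 3,                      // depth relative to baseUrl
      respectRobotsTxt: true,           // fetched via this.http, cached 24 h
      followNofollow: false,            // rel="nofollow" links are skipped
      maxResponseSize: 5 * 1024 * 1024  // skip bodies larger than 5 MB
    });
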
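collect(), getCollectedData(), clearCollectedData(), and exportData() form a small in-memory pipeline: items accumulate in collectedData and can be flushed as JSON, JSONL, or CSV (CSV headers are the union of keys across all collected objects, with standard quote doubling). A minimal sketch, reusing the crawler instance from the sketches above:

    // Sketch: buffer rows during the crawl, then flush to disk.
    crawler.onText("h1", function (text) { // hypothetical registration, as above
      crawler.collect({ title: text });
    });
    await crawler.done();
    await crawler.exportData("./out/pages.csv", "csv"); // or "json" / "jsonl"
    console.log(crawler.getStats()); // snapshot of crawlStats (urlsVisited, urlsQueued, urlsFailed, ...)
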
@@ -570,16 +844,35 @@ class Crawler {
      this.addToNavigationQueue(url, method, body, headersObj);
    }
    if (deepEmailFinder) {
-     this.execute2(method, url, body, _options, forceRevisit).then();
+     const p = this.execute2(method, url, body, _options, forceRevisit);
+     this.pendingExecutions.add(p);
+     p.finally(() => this.pendingExecutions.delete(p));
      return this;
    }
-   this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions).then();
+   const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
+   this.pendingExecutions.add(p);
+   p.finally(() => this.pendingExecutions.delete(p));
    return this;
  }
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
-   this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions)).then();
+   await this.waitForStorage();
+   if (this.isCacheEnabled) {
+     await this.waitForCache();
+   }
+   if (this.config.enableNavigationHistory) {
+     await this.waitForNavigationHistory();
+   }
+   const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
+   task.finally(() => this.pendingExecutions.delete(task));
  }
  async execute2(method, url, body, options = {}, forceRevisit) {
+   await this.waitForStorage();
+   if (this.isCacheEnabled) {
+     await this.waitForCache();
+   }
+   if (this.config.enableNavigationHistory) {
+     await this.waitForNavigationHistory();
+   }
    this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
      httpConfig: options,
      saveCache: this.saveCache.bind(this),
593
886
  allowCrossDomainTravel: true
594
887
  }, forceRevisit, true)).then();
595
888
  }
596
- async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0) {
889
+ async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl) {
597
890
  try {
598
- console.log({
599
- oxylabsOptions: typeof oxylabsOptions,
600
- oxylabsInstanse: typeof oxylabsInstanse,
601
- decodoInstanse: typeof decodoInstanse,
602
- decodoOptions: typeof decodoOptions
603
- });
891
+ await this.triggerStartHandlers();
892
+ const limitCheck = await this.checkCrawlLimits(url, parentUrl);
893
+ if (!limitCheck.allowed) {
894
+ if (this.config.debug) {
895
+ console.log(`[Crawler] Skipping ${url}: ${limitCheck.reason}`);
896
+ }
897
+ return;
898
+ }
899
+ this.crawlStats.urlsQueued++;
900
+ const domain = new URL(url).hostname;
901
+ const delay = this.getAutoThrottleDelay(domain);
902
+ if (delay > 0) {
903
+ await new Promise((resolve) => setTimeout(resolve, delay));
904
+ }
604
905
  const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
605
906
  const cache = await this.getCache(url);
606
907
  if (isVisited && !cache)
607
908
  return;
608
909
  if (isVisited && method !== "GET")
609
910
  return;
911
+ const requestStartTime = Date.now();
610
912
  const response = cache && method === "GET" ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
913
+ if (!cache) {
914
+ const responseTime = Date.now() - requestStartTime;
915
+ this.calculateAutoThrottleDelay(domain, responseTime);
916
+ }
611
917
  const res = {
612
918
  data: response.data || response.content || "",
613
919
  contentType: response.contentType || "",
@@ -619,11 +925,29 @@ class Crawler {
619
925
  cookies: response?.cookies?.serialized || response?.cookies,
620
926
  contentLength: response.contentLength || 0
621
927
  };
928
+ if (res.contentLength && res.contentLength > 0) {
929
+ const sizeCheck = this.checkResponseSize(res.contentLength);
930
+ if (!sizeCheck.allowed) {
931
+ if (this.config.debug) {
932
+ console.log(`[Crawler] Skipping ${url}: ${sizeCheck.reason}`);
933
+ }
934
+ return;
935
+ }
936
+ }
937
+ this.crawlStats.urlsVisited++;
938
+ if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
939
+ await this.triggerRedirectHandlers({
940
+ originalUrl: url,
941
+ finalUrl: res.finalUrl,
942
+ redirectCount: response.redirectCount || 1,
943
+ statusCode: res.status
944
+ });
945
+ }
622
946
  if (!cache)
623
947
  await this.saveCache(url, res);
624
948
  if (!isVisited)
625
949
  await this.saveUrl(url);
626
- this.markUrlVisited(url, {
950
+ await this.markUrlVisited(url, {
627
951
  status: res.status,
628
952
  finalUrl: res.finalUrl,
629
953
  contentType: res.contentType
@@ -655,6 +979,24 @@ class Crawler {
655
979
  }
656
980
  } catch (e) {
657
981
  const error = e;
982
+ if (error?.response?.status === 429 || error?.status === 429) {
983
+ try {
984
+ const { shouldRetry, waitTime } = await this.handle429Response(url, error.response || error);
985
+ if (shouldRetry) {
986
+ await this.sleep(waitTime);
987
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
988
+ }
989
+ } catch (rateLimitError) {
990
+ this.crawlStats.urlsFailed++;
991
+ if (this.config.throwFatalError)
992
+ throw rateLimitError;
993
+ for (let i = 0;i < this.errorEvents.length; i++) {
994
+ const event = this.errorEvents[i];
995
+ this[event.handler](...event.attr, rateLimitError);
996
+ }
997
+ return;
998
+ }
999
+ }
658
1000
  if (error && error.response) {
659
1001
  const status = error.response.status;
660
1002
  const retryDelay = this.config.retryDelay || 1000;
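
The retry path above honors Retry-After as either seconds or an HTTP date, defaults to 60 s when the header is absent, and either sleeps or throws REZ_RATE_LIMIT_EXCEEDED depending on maxWaitOn429/alwaysWaitOn429. A sketch of surfacing that error, assuming an onError registration feeds the errorEvents list consumed here:

    // maxWaitOn429 / alwaysWaitOn429 names come from handle429Response above;
    // onError is an assumed registration method behind errorEvents.
    const crawler = new Crawler({
      baseUrl: "https://example.com",
      maxWaitOn429: 5 * 60 * 1000, // refuse waits longer than 5 minutes
      alwaysWaitOn429: false
    });
    crawler.onError((err) => {
      if (err.code === "REZ_RATE_LIMIT_EXCEEDED") {
        console.warn(`gave up on ${err.url} (HTTP ${err.status})`);
      }
    });
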
@@ -666,16 +1008,17 @@ class Crawler {
      if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
        await this.sleep(retryDelay);
        delete options.proxy;
-       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
+       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
      } else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
        await this.sleep(retryDelay);
-       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
+       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
      } else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
        await this.sleep(retryDelay);
-       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
+       return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
      }
    }
-   this.markUrlVisited(url, {
+   this.crawlStats.urlsFailed++;
+   await this.markUrlVisited(url, {
      status: error?.response?.status || 0,
      errorMessage: e.message || "Unknown error"
    });
@@ -684,7 +1027,6 @@ class Crawler {
    if (this.config.debug) {
      console.log(`Error visiting ${url}: ${e.message}`);
    }
-   console.log(error);
    for (let i = 0;i < this.errorEvents.length; i++) {
      const event = this.errorEvents[i];
      this[event.handler](...event.attr, e);
@@ -692,18 +1034,36 @@ class Crawler {
    }
  }
  async waitForAll() {
+   if (this.pendingExecutions.size > 0) {
+     await Promise.allSettled([...this.pendingExecutions]);
+   }
    await this.queue.onIdle();
+   await this.triggerFinishHandlers();
  }
  async done() {
    return this.waitForAll();
  }
  async close() {
    try {
-     await this.cacher.close();
+     await this.cacher?.close();
    } catch {}
    try {
-     await this.urlStorage.close();
+     await this.urlStorage?.close();
    } catch {}
+   try {
+     await this.navigationHistory?.close();
+   } catch {}
+ }
+ async destroy() {
+   this.queue.clear();
+   this.events.length = 0;
+   this.jsonEvents.length = 0;
+   this.errorEvents.length = 0;
+   this.responseEvents.length = 0;
+   this.rawResponseEvents.length = 0;
+   this.emailDiscoveredEvents.length = 0;
+   this.emailLeadsEvents.length = 0;
+   await this.close();
  }
  }
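
Teardown semantics after this change: waitForAll()/done() first settle everything in pendingExecutions, then wait for the queue to idle, then fire the finish handlers; close() optional-chains each storage and now also closes navigationHistory; the new destroy() clears the queue and every handler list before delegating to close(). A minimal shutdown sketch using only methods shown in this diff:

    // Graceful finish: pending executions settle, queue idles, onFinish fires.
    await crawler.done();
    await crawler.close();
    // Or abandon a crawl outright: clears the queue and all handler lists.
    await crawler.destroy();
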