rezo 1.0.43 → 1.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/dist/adapters/entries/curl.d.ts +115 -0
  2. package/dist/adapters/entries/fetch.d.ts +115 -0
  3. package/dist/adapters/entries/http.d.ts +115 -0
  4. package/dist/adapters/entries/http2.d.ts +115 -0
  5. package/dist/adapters/entries/react-native.d.ts +115 -0
  6. package/dist/adapters/entries/xhr.d.ts +115 -0
  7. package/dist/adapters/fetch.cjs +18 -0
  8. package/dist/adapters/fetch.js +18 -0
  9. package/dist/adapters/http.cjs +18 -0
  10. package/dist/adapters/http.js +18 -0
  11. package/dist/adapters/http2.cjs +18 -0
  12. package/dist/adapters/http2.js +18 -0
  13. package/dist/adapters/index.cjs +6 -6
  14. package/dist/adapters/xhr.cjs +19 -0
  15. package/dist/adapters/xhr.js +19 -0
  16. package/dist/cache/index.cjs +9 -15
  17. package/dist/cache/index.js +0 -3
  18. package/dist/core/hooks.cjs +4 -2
  19. package/dist/core/hooks.js +4 -2
  20. package/dist/crawler/addon/decodo/index.cjs +1 -0
  21. package/dist/crawler/addon/decodo/index.js +1 -0
  22. package/dist/crawler/crawler-options.cjs +1 -0
  23. package/dist/crawler/crawler-options.js +1 -0
  24. package/dist/{plugin → crawler}/crawler.cjs +392 -32
  25. package/dist/{plugin → crawler}/crawler.js +392 -32
  26. package/dist/crawler/index.cjs +40 -0
  27. package/dist/{plugin → crawler}/index.js +4 -2
  28. package/dist/crawler/plugin/file-cacher.cjs +19 -0
  29. package/dist/crawler/plugin/file-cacher.js +19 -0
  30. package/dist/crawler/plugin/index.cjs +1 -0
  31. package/dist/crawler/plugin/index.js +1 -0
  32. package/dist/crawler/plugin/navigation-history.cjs +43 -0
  33. package/dist/crawler/plugin/navigation-history.js +43 -0
  34. package/dist/crawler/plugin/robots-txt.cjs +2 -0
  35. package/dist/crawler/plugin/robots-txt.js +2 -0
  36. package/dist/crawler/plugin/url-store.cjs +18 -0
  37. package/dist/crawler/plugin/url-store.js +18 -0
  38. package/dist/crawler.d.ts +430 -172
  39. package/dist/entries/crawler.cjs +5 -5
  40. package/dist/entries/crawler.js +2 -2
  41. package/dist/index.cjs +27 -27
  42. package/dist/index.d.ts +115 -0
  43. package/dist/internal/agents/index.cjs +10 -10
  44. package/dist/platform/browser.d.ts +115 -0
  45. package/dist/platform/bun.d.ts +115 -0
  46. package/dist/platform/deno.d.ts +115 -0
  47. package/dist/platform/node.d.ts +115 -0
  48. package/dist/platform/react-native.d.ts +115 -0
  49. package/dist/platform/worker.d.ts +115 -0
  50. package/dist/proxy/index.cjs +5 -5
  51. package/dist/proxy/index.js +1 -1
  52. package/dist/queue/index.cjs +8 -8
  53. package/dist/responses/universal/index.cjs +11 -11
  54. package/dist/utils/rate-limit-wait.cjs +217 -0
  55. package/dist/utils/rate-limit-wait.js +208 -0
  56. package/package.json +2 -6
  57. package/dist/cache/file-cacher.cjs +0 -270
  58. package/dist/cache/file-cacher.js +0 -267
  59. package/dist/cache/navigation-history.cjs +0 -298
  60. package/dist/cache/navigation-history.js +0 -296
  61. package/dist/cache/url-store.cjs +0 -294
  62. package/dist/cache/url-store.js +0 -291
  63. package/dist/plugin/addon/decodo/index.cjs +0 -1
  64. package/dist/plugin/addon/decodo/index.js +0 -1
  65. package/dist/plugin/crawler-options.cjs +0 -1
  66. package/dist/plugin/crawler-options.js +0 -1
  67. package/dist/plugin/index.cjs +0 -36
  68. /package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
  69. /package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
  70. /package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
  71. /package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
  72. /package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
  73. /package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
  74. /package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
  75. /package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
  76. /package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
  77. /package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
  78. /package/dist/{plugin → crawler}/scraper.cjs +0 -0
  79. /package/dist/{plugin → crawler}/scraper.js +0 -0
@@ -1,7 +1,8 @@
1
1
  import fs from "node:fs";
2
- import { FileCacher } from '../cache/file-cacher.js';
3
- import { UrlStore } from '../cache/url-store.js';
4
- import { NavigationHistory } from '../cache/navigation-history.js';
2
+ import { FileCacher } from './plugin/file-cacher.js';
3
+ import { UrlStore } from './plugin/url-store.js';
4
+ import { NavigationHistory } from './plugin/navigation-history.js';
5
+ import { RobotsTxt } from './plugin/robots-txt.js';
5
6
  import { parseHTML } from "linkedom";
6
7
  import path from "node:path";
7
8
  import { Rezo } from '../core/rezo.js';
@@ -54,6 +55,23 @@ export class Crawler {
54
55
  navigationHistoryInitPromise = null;
55
56
  adapterExecutor = null;
56
57
  adapterType;
58
+ pendingExecutions = new Set;
59
+ robotsTxt;
60
+ domainResponseTimes = new Map;
61
+ domainCurrentDelay = new Map;
62
+ crawlStats = {
63
+ urlsVisited: 0,
64
+ urlsQueued: 0,
65
+ urlsFailed: 0,
66
+ startTime: 0,
67
+ currentDepth: 0
68
+ };
69
+ urlDepthMap = new Map;
70
+ startHandlers = [];
71
+ finishHandlers = [];
72
+ redirectHandlers = [];
73
+ collectedData = [];
74
+ crawlStarted = false;
57
75
  constructor(crawlerOptions, http = new Rezo) {
58
76
  this.http = http;
59
77
  this.queue = new RezoQueue({
@@ -71,7 +89,6 @@ export class Crawler {
71
89
  fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
72
90
  FileCacher.create({
73
91
  cacheDir: dbUrl,
74
- softDelete: false,
75
92
  ttl: cacheTTL,
76
93
  encryptNamespace: true
77
94
  }).then((storage) => {
@@ -110,6 +127,14 @@ export class Crawler {
110
127
  }
111
128
  this.initializeAdapter();
112
129
  this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
130
+ this.robotsTxt = new RobotsTxt({
131
+ userAgent: this.config.userAgent || "RezoBot",
132
+ cacheTTL: 24 * 60 * 60 * 1000
133
+ });
134
+ this.crawlStats.startTime = Date.now();
135
+ if (this.config.baseUrl) {
136
+ this.urlDepthMap.set(this.config.baseUrl, 0);
137
+ }
113
138
  }
114
139
  async initializeAdapter() {
115
140
  try {
@@ -148,6 +173,8 @@ export class Crawler {
148
173
  async waitForNavigationHistory() {
149
174
  if (!this.config.enableNavigationHistory)
150
175
  return;
176
+ if (this.isNavigationHistoryReady && this.isSessionReady)
177
+ return;
151
178
  if (this.navigationHistoryInitPromise) {
152
179
  await this.navigationHistoryInitPromise;
153
180
  }
@@ -361,6 +388,18 @@ export class Crawler {
361
388
  this.emailLeadsEvents.push(handler);
362
389
  return this;
363
390
  }
391
+ onStart(handler) {
392
+ this.startHandlers.push(handler);
393
+ return this;
394
+ }
395
+ onFinish(handler) {
396
+ this.finishHandlers.push(handler);
397
+ return this;
398
+ }
399
+ onRedirect(handler) {
400
+ this.redirectHandlers.push(handler);
401
+ return this;
402
+ }
364
403
  onRawData(handler) {
365
404
  this.rawResponseEvents.push({
366
405
  handler: "_onRawResponse",
@@ -435,20 +474,25 @@ export class Crawler {
435
474
  this.queue.add(() => handler(document.body));
436
475
  }
437
476
  _onAttribute(selection, attribute, handler, document) {
438
- selection = typeof attribute === "function" ? selection : null;
439
- attribute = typeof attribute === "function" ? selection : attribute;
440
- handler = typeof attribute === "function" ? attribute : handler;
441
- selection = selection || `[${attribute}]`;
442
- const elements = document.querySelectorAll(selection);
477
+ const isSimpleForm = typeof attribute === "function";
478
+ const actualAttribute = isSimpleForm ? selection : attribute;
479
+ const actualHandler = isSimpleForm ? attribute : handler;
480
+ const actualSelection = isSimpleForm ? `[${selection}]` : selection || `[${attribute}]`;
481
+ const elements = document.querySelectorAll(actualSelection);
443
482
  for (let i = 0;i < elements.length; i++) {
444
- if (elements[i].hasAttribute(attribute))
445
- this.queue.add(() => handler(elements[i].getAttribute(attribute)));
483
+ const el = elements[i];
484
+ if (el.hasAttribute(actualAttribute)) {
485
+ const value = el.getAttribute(actualAttribute);
486
+ this.queue.add(() => actualHandler.call(el, value, actualAttribute));
487
+ }
446
488
  }
447
489
  }
448
490
  _onText(selection, handler, document) {
449
491
  const elements = document.querySelectorAll(selection);
450
492
  for (let i = 0;i < elements.length; i++) {
451
- this.queue.add(() => handler(elements[i].textContent));
493
+ const el = elements[i];
494
+ const text = el.textContent;
495
+ this.queue.add(() => handler.call(el, text));
452
496
  }
453
497
  }
454
498
  _onSelection(selection, handler, document) {
@@ -466,8 +510,11 @@ export class Crawler {
466
510
  _onHref(handler, document) {
467
511
  const elements = document.querySelectorAll("a, link");
468
512
  for (let i = 0;i < elements.length; i++) {
469
- if (elements[i].hasAttribute("href"))
470
- this.queue.add(() => handler(new URL(elements[i].getAttribute("href"), document.URL).href));
513
+ const el = elements[i];
514
+ if (el.hasAttribute("href")) {
515
+ const href = new URL(el.getAttribute("href"), document.URL).href;
516
+ this.queue.add(() => handler.call(el, href));
517
+ }
471
518
  }
472
519
  }
473
520
  _onAnchor(selection, handler, document) {
@@ -501,6 +548,233 @@ export class Crawler {
501
548
  _onResponse(handler, response) {
502
549
  this.queue.add(() => handler(response));
503
550
  }
551
+ calculateAutoThrottleDelay(domain, responseTime) {
552
+ if (!this.config.autoThrottle)
553
+ return 0;
554
+ let times = this.domainResponseTimes.get(domain) || [];
555
+ times.push(responseTime);
556
+ if (times.length > 10) {
557
+ times = times.slice(-10);
558
+ }
559
+ this.domainResponseTimes.set(domain, times);
560
+ const avgResponseTime = times.reduce((a, b) => a + b, 0) / times.length;
561
+ const targetDelay = this.config.autoThrottleTargetDelay;
562
+ const loadFactor = avgResponseTime / 200;
563
+ let newDelay = Math.round(targetDelay * loadFactor);
564
+ newDelay = Math.max(this.config.autoThrottleMinDelay, newDelay);
565
+ newDelay = Math.min(this.config.autoThrottleMaxDelay, newDelay);
566
+ this.domainCurrentDelay.set(domain, newDelay);
567
+ if (this.config.debug) {
568
+ console.log(`[AutoThrottle] ${domain}: avgRT=${avgResponseTime.toFixed(0)}ms, delay=${newDelay}ms`);
569
+ }
570
+ return newDelay;
571
+ }
572
+ getAutoThrottleDelay(domain) {
573
+ if (!this.config.autoThrottle)
574
+ return 0;
575
+ return this.domainCurrentDelay.get(domain) || this.config.autoThrottleMinDelay;
576
+ }
577
+ async handle429Response(url, response) {
578
+ let retryAfter = 0;
579
+ const retryAfterHeader = response?.headers?.["retry-after"] || response?.headers?.get?.("retry-after");
580
+ if (retryAfterHeader) {
581
+ const parsed = parseInt(retryAfterHeader, 10);
582
+ if (!isNaN(parsed)) {
583
+ retryAfter = parsed * 1000;
584
+ } else {
585
+ const date = new Date(retryAfterHeader);
586
+ if (!isNaN(date.getTime())) {
587
+ retryAfter = date.getTime() - Date.now();
588
+ }
589
+ }
590
+ }
591
+ if (retryAfter <= 0) {
592
+ retryAfter = 60000;
593
+ }
594
+ const maxWait = this.config.maxWaitOn429;
595
+ const alwaysWait = this.config.alwaysWaitOn429;
596
+ if (retryAfter > maxWait && !alwaysWait) {
597
+ const waitMinutes = Math.round(retryAfter / 60000);
598
+ const error = new Error(`Rate limited: Server requested wait time of ${waitMinutes} minutes, which exceeds maxWaitOn429 (${Math.round(maxWait / 60000)} minutes). Set alwaysWaitOn429: true to wait regardless.`);
599
+ error.code = "REZ_RATE_LIMIT_EXCEEDED";
600
+ error.url = url;
601
+ error.status = 429;
602
+ throw error;
603
+ }
604
+ if (retryAfter > maxWait && alwaysWait) {
605
+ const waitMinutes = Math.round(retryAfter / 60000);
606
+ console.warn(`[Crawler] WARNING: Rate limited on ${url}. Server requested ${waitMinutes} minute wait. Waiting because alwaysWaitOn429 is enabled.`);
607
+ }
608
+ if (this.config.debug) {
609
+ console.log(`[Crawler] 429 Rate Limited: waiting ${Math.round(retryAfter / 1000)}s before retry`);
610
+ }
611
+ return { shouldRetry: true, waitTime: retryAfter };
612
+ }
613
+ async checkCrawlLimits(url, parentUrl) {
614
+ if (this.config.maxUrls > 0 && this.crawlStats.urlsVisited >= this.config.maxUrls) {
615
+ return { allowed: false, reason: `maxUrls limit reached (${this.config.maxUrls})` };
616
+ }
617
+ if (this.config.maxDepth > 0) {
618
+ const parentDepth = parentUrl ? this.urlDepthMap.get(parentUrl) ?? 0 : 0;
619
+ const urlDepth = this.urlDepthMap.get(url) ?? parentDepth + 1;
620
+ if (urlDepth > this.config.maxDepth) {
621
+ return { allowed: false, reason: `maxDepth limit reached (depth ${urlDepth} > ${this.config.maxDepth})` };
622
+ }
623
+ if (!this.urlDepthMap.has(url)) {
624
+ this.urlDepthMap.set(url, urlDepth);
625
+ this.crawlStats.currentDepth = Math.max(this.crawlStats.currentDepth, urlDepth);
626
+ }
627
+ }
628
+ if (this.config.respectRobotsTxt) {
629
+ try {
630
+ if (!this.robotsTxt.isCached(url)) {
631
+ await this.robotsTxt.fetch(url, async (robotsUrl) => {
632
+ const response = await this.http.get(robotsUrl, { timeout: 1e4 });
633
+ return { status: response.status, data: response.data };
634
+ });
635
+ }
636
+ const allowed = this.robotsTxt.isAllowed(url);
637
+ if (!allowed) {
638
+ return { allowed: false, reason: "Blocked by robots.txt" };
639
+ }
640
+ } catch (error) {
641
+ if (this.config.debug) {
642
+ console.warn(`[Crawler] Failed to check robots.txt for ${url}:`, error);
643
+ }
644
+ }
645
+ }
646
+ return { allowed: true };
647
+ }
648
+ shouldFollowLink(element) {
649
+ if (this.config.followNofollow) {
650
+ return true;
651
+ }
652
+ const rel = element.getAttribute("rel");
653
+ if (rel && rel.toLowerCase().includes("nofollow")) {
654
+ return false;
655
+ }
656
+ return true;
657
+ }
658
+ checkResponseSize(contentLength) {
659
+ if (this.config.maxResponseSize > 0 && contentLength > this.config.maxResponseSize) {
660
+ return {
661
+ allowed: false,
662
+ reason: `Response size (${contentLength} bytes) exceeds maxResponseSize (${this.config.maxResponseSize} bytes)`
663
+ };
664
+ }
665
+ return { allowed: true };
666
+ }
667
+ collect(data) {
668
+ this.collectedData.push(data);
669
+ return this;
670
+ }
671
+ getCollectedData() {
672
+ return [...this.collectedData];
673
+ }
674
+ clearCollectedData() {
675
+ this.collectedData = [];
676
+ return this;
677
+ }
678
+ async exportData(filePath, format = "json") {
679
+ const data = this.collectedData;
680
+ if (data.length === 0) {
681
+ if (this.config.debug) {
682
+ console.warn("[Crawler] No data to export");
683
+ }
684
+ return;
685
+ }
686
+ let content;
687
+ switch (format) {
688
+ case "json":
689
+ content = JSON.stringify(data, null, 2);
690
+ break;
691
+ case "jsonl":
692
+ content = data.map((item) => JSON.stringify(item)).join(`
693
+ `);
694
+ break;
695
+ case "csv":
696
+ const keys = new Set;
697
+ data.forEach((item) => {
698
+ if (typeof item === "object" && item !== null) {
699
+ Object.keys(item).forEach((key) => keys.add(key));
700
+ }
701
+ });
702
+ const headers = Array.from(keys);
703
+ const escapeCSV = (val) => {
704
+ if (val === null || val === undefined)
705
+ return "";
706
+ const str = String(val);
707
+ if (str.includes(",") || str.includes('"') || str.includes(`
708
+ `)) {
709
+ return `"${str.replace(/"/g, '""')}"`;
710
+ }
711
+ return str;
712
+ };
713
+ const rows = [
714
+ headers.join(","),
715
+ ...data.map((item) => {
716
+ if (typeof item !== "object" || item === null) {
717
+ return escapeCSV(item);
718
+ }
719
+ return headers.map((key) => escapeCSV(item[key])).join(",");
720
+ })
721
+ ];
722
+ content = rows.join(`
723
+ `);
724
+ break;
725
+ default:
726
+ throw new Error(`Unsupported export format: ${format}`);
727
+ }
728
+ const dir = path.dirname(filePath);
729
+ if (!fs.existsSync(dir)) {
730
+ fs.mkdirSync(dir, { recursive: true });
731
+ }
732
+ fs.writeFileSync(filePath, content, "utf-8");
733
+ if (this.config.debug) {
734
+ console.log(`[Crawler] Exported ${data.length} items to ${filePath} (${format})`);
735
+ }
736
+ }
737
+ getStats() {
738
+ return { ...this.crawlStats };
739
+ }
740
+ async triggerStartHandlers() {
741
+ if (this.crawlStarted)
742
+ return;
743
+ this.crawlStarted = true;
744
+ this.crawlStats.startTime = Date.now();
745
+ for (const handler of this.startHandlers) {
746
+ try {
747
+ await handler();
748
+ } catch (error) {
749
+ if (this.config.debug) {
750
+ console.error("[Crawler] onStart handler error:", error);
751
+ }
752
+ }
753
+ }
754
+ }
755
+ async triggerFinishHandlers() {
756
+ this.crawlStats.endTime = Date.now();
757
+ for (const handler of this.finishHandlers) {
758
+ try {
759
+ await handler(this.crawlStats);
760
+ } catch (error) {
761
+ if (this.config.debug) {
762
+ console.error("[Crawler] onFinish handler error:", error);
763
+ }
764
+ }
765
+ }
766
+ }
767
+ async triggerRedirectHandlers(event) {
768
+ for (const handler of this.redirectHandlers) {
769
+ try {
770
+ await handler(event);
771
+ } catch (error) {
772
+ if (this.config.debug) {
773
+ console.error("[Crawler] onRedirect handler error:", error);
774
+ }
775
+ }
776
+ }
777
+ }
504
778
  buildUrl(url, params) {
505
779
  if (params) {
506
780
  const u = new URL(url, this.config.baseUrl);
@@ -570,16 +844,35 @@ export class Crawler {
570
844
  this.addToNavigationQueue(url, method, body, headersObj);
571
845
  }
572
846
  if (deepEmailFinder) {
573
- this.execute2(method, url, body, _options, forceRevisit).then();
847
+ const p = this.execute2(method, url, body, _options, forceRevisit);
848
+ this.pendingExecutions.add(p);
849
+ p.finally(() => this.pendingExecutions.delete(p));
574
850
  return this;
575
851
  }
576
- this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions).then();
852
+ const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
853
+ this.pendingExecutions.add(p);
854
+ p.finally(() => this.pendingExecutions.delete(p));
577
855
  return this;
578
856
  }
579
857
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
580
- this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions)).then();
858
+ await this.waitForStorage();
859
+ if (this.isCacheEnabled) {
860
+ await this.waitForCache();
861
+ }
862
+ if (this.config.enableNavigationHistory) {
863
+ await this.waitForNavigationHistory();
864
+ }
865
+ const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
866
+ task.finally(() => this.pendingExecutions.delete(task));
581
867
  }
582
868
  async execute2(method, url, body, options = {}, forceRevisit) {
869
+ await this.waitForStorage();
870
+ if (this.isCacheEnabled) {
871
+ await this.waitForCache();
872
+ }
873
+ if (this.config.enableNavigationHistory) {
874
+ await this.waitForNavigationHistory();
875
+ }
583
876
  this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
584
877
  httpConfig: options,
585
878
  saveCache: this.saveCache.bind(this),
@@ -593,21 +886,34 @@ export class Crawler {
593
886
  allowCrossDomainTravel: true
594
887
  }, forceRevisit, true)).then();
595
888
  }
596
- async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0) {
889
+ async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl) {
597
890
  try {
598
- console.log({
599
- oxylabsOptions: typeof oxylabsOptions,
600
- oxylabsInstanse: typeof oxylabsInstanse,
601
- decodoInstanse: typeof decodoInstanse,
602
- decodoOptions: typeof decodoOptions
603
- });
891
+ await this.triggerStartHandlers();
892
+ const limitCheck = await this.checkCrawlLimits(url, parentUrl);
893
+ if (!limitCheck.allowed) {
894
+ if (this.config.debug) {
895
+ console.log(`[Crawler] Skipping ${url}: ${limitCheck.reason}`);
896
+ }
897
+ return;
898
+ }
899
+ this.crawlStats.urlsQueued++;
900
+ const domain = new URL(url).hostname;
901
+ const delay = this.getAutoThrottleDelay(domain);
902
+ if (delay > 0) {
903
+ await new Promise((resolve) => setTimeout(resolve, delay));
904
+ }
604
905
  const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
605
906
  const cache = await this.getCache(url);
606
907
  if (isVisited && !cache)
607
908
  return;
608
909
  if (isVisited && method !== "GET")
609
910
  return;
911
+ const requestStartTime = Date.now();
610
912
  const response = cache && method === "GET" ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
913
+ if (!cache) {
914
+ const responseTime = Date.now() - requestStartTime;
915
+ this.calculateAutoThrottleDelay(domain, responseTime);
916
+ }
611
917
  const res = {
612
918
  data: response.data || response.content || "",
613
919
  contentType: response.contentType || "",
@@ -619,11 +925,29 @@ export class Crawler {
619
925
  cookies: response?.cookies?.serialized || response?.cookies,
620
926
  contentLength: response.contentLength || 0
621
927
  };
928
+ if (res.contentLength && res.contentLength > 0) {
929
+ const sizeCheck = this.checkResponseSize(res.contentLength);
930
+ if (!sizeCheck.allowed) {
931
+ if (this.config.debug) {
932
+ console.log(`[Crawler] Skipping ${url}: ${sizeCheck.reason}`);
933
+ }
934
+ return;
935
+ }
936
+ }
937
+ this.crawlStats.urlsVisited++;
938
+ if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
939
+ await this.triggerRedirectHandlers({
940
+ originalUrl: url,
941
+ finalUrl: res.finalUrl,
942
+ redirectCount: response.redirectCount || 1,
943
+ statusCode: res.status
944
+ });
945
+ }
622
946
  if (!cache)
623
947
  await this.saveCache(url, res);
624
948
  if (!isVisited)
625
949
  await this.saveUrl(url);
626
- this.markUrlVisited(url, {
950
+ await this.markUrlVisited(url, {
627
951
  status: res.status,
628
952
  finalUrl: res.finalUrl,
629
953
  contentType: res.contentType
@@ -655,6 +979,24 @@ export class Crawler {
655
979
  }
656
980
  } catch (e) {
657
981
  const error = e;
982
+ if (error?.response?.status === 429 || error?.status === 429) {
983
+ try {
984
+ const { shouldRetry, waitTime } = await this.handle429Response(url, error.response || error);
985
+ if (shouldRetry) {
986
+ await this.sleep(waitTime);
987
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
988
+ }
989
+ } catch (rateLimitError) {
990
+ this.crawlStats.urlsFailed++;
991
+ if (this.config.throwFatalError)
992
+ throw rateLimitError;
993
+ for (let i = 0;i < this.errorEvents.length; i++) {
994
+ const event = this.errorEvents[i];
995
+ this[event.handler](...event.attr, rateLimitError);
996
+ }
997
+ return;
998
+ }
999
+ }
658
1000
  if (error && error.response) {
659
1001
  const status = error.response.status;
660
1002
  const retryDelay = this.config.retryDelay || 1000;
@@ -666,16 +1008,17 @@ export class Crawler {
666
1008
  if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
667
1009
  await this.sleep(retryDelay);
668
1010
  delete options.proxy;
669
- return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
1011
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
670
1012
  } else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
671
1013
  await this.sleep(retryDelay);
672
- return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
1014
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
673
1015
  } else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
674
1016
  await this.sleep(retryDelay);
675
- return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
1017
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
676
1018
  }
677
1019
  }
678
- this.markUrlVisited(url, {
1020
+ this.crawlStats.urlsFailed++;
1021
+ await this.markUrlVisited(url, {
679
1022
  status: error?.response?.status || 0,
680
1023
  errorMessage: e.message || "Unknown error"
681
1024
  });
@@ -684,7 +1027,6 @@ export class Crawler {
684
1027
  if (this.config.debug) {
685
1028
  console.log(`Error visiting ${url}: ${e.message}`);
686
1029
  }
687
- console.log(error);
688
1030
  for (let i = 0;i < this.errorEvents.length; i++) {
689
1031
  const event = this.errorEvents[i];
690
1032
  this[event.handler](...event.attr, e);
@@ -692,17 +1034,35 @@ export class Crawler {
692
1034
  }
693
1035
  }
694
1036
  async waitForAll() {
1037
+ if (this.pendingExecutions.size > 0) {
1038
+ await Promise.allSettled([...this.pendingExecutions]);
1039
+ }
695
1040
  await this.queue.onIdle();
1041
+ await this.triggerFinishHandlers();
696
1042
  }
697
1043
  async done() {
698
1044
  return this.waitForAll();
699
1045
  }
700
1046
  async close() {
701
1047
  try {
702
- await this.cacher.close();
1048
+ await this.cacher?.close();
703
1049
  } catch {}
704
1050
  try {
705
- await this.urlStorage.close();
1051
+ await this.urlStorage?.close();
706
1052
  } catch {}
1053
+ try {
1054
+ await this.navigationHistory?.close();
1055
+ } catch {}
1056
+ }
1057
+ async destroy() {
1058
+ this.queue.clear();
1059
+ this.events.length = 0;
1060
+ this.jsonEvents.length = 0;
1061
+ this.errorEvents.length = 0;
1062
+ this.responseEvents.length = 0;
1063
+ this.rawResponseEvents.length = 0;
1064
+ this.emailDiscoveredEvents.length = 0;
1065
+ this.emailLeadsEvents.length = 0;
1066
+ await this.close();
707
1067
  }
708
1068
  }
@@ -0,0 +1,40 @@
1
+ const _mod_ohy0ht = require('./crawler.cjs');
2
+ exports.Crawler = _mod_ohy0ht.Crawler;;
3
+ const _mod_lcpyaf = require('./crawler-options.cjs');
4
+ exports.CrawlerOptions = _mod_lcpyaf.CrawlerOptions;;
5
+ const _mod_i89xd7 = require('./plugin/robots-txt.cjs');
6
+ exports.RobotsTxt = _mod_i89xd7.RobotsTxt;;
7
+ const _mod_5bw1j8 = require('./plugin/file-cacher.cjs');
8
+ exports.FileCacher = _mod_5bw1j8.FileCacher;;
9
+ const _mod_fhan3k = require('./plugin/url-store.cjs');
10
+ exports.UrlStore = _mod_fhan3k.UrlStore;;
11
+ const _mod_23xqla = require('./plugin/navigation-history.cjs');
12
+ exports.NavigationHistory = _mod_23xqla.NavigationHistory;;
13
+ const _mod_bugokk = require('./addon/oxylabs/index.cjs');
14
+ exports.Oxylabs = _mod_bugokk.Oxylabs;;
15
+ const _mod_rqu5cf = require('./addon/oxylabs/options.cjs');
16
+ exports.OXYLABS_BROWSER_TYPES = _mod_rqu5cf.OXYLABS_BROWSER_TYPES;
17
+ exports.OXYLABS_COMMON_LOCALES = _mod_rqu5cf.OXYLABS_COMMON_LOCALES;
18
+ exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_rqu5cf.OXYLABS_COMMON_GEO_LOCATIONS;
19
+ exports.OXYLABS_US_STATES = _mod_rqu5cf.OXYLABS_US_STATES;
20
+ exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_rqu5cf.OXYLABS_EUROPEAN_COUNTRIES;
21
+ exports.OXYLABS_ASIAN_COUNTRIES = _mod_rqu5cf.OXYLABS_ASIAN_COUNTRIES;
22
+ exports.getRandomOxylabsBrowserType = _mod_rqu5cf.getRandomBrowserType;
23
+ exports.getRandomOxylabsLocale = _mod_rqu5cf.getRandomLocale;
24
+ exports.getRandomOxylabsGeoLocation = _mod_rqu5cf.getRandomGeoLocation;;
25
+ const _mod_eh5umb = require('./addon/decodo/index.cjs');
26
+ exports.Decodo = _mod_eh5umb.Decodo;;
27
+ const _mod_pa9reb = require('./addon/decodo/options.cjs');
28
+ exports.DECODO_DEVICE_TYPES = _mod_pa9reb.DECODO_DEVICE_TYPES;
29
+ exports.DECODO_HEADLESS_MODES = _mod_pa9reb.DECODO_HEADLESS_MODES;
30
+ exports.DECODO_COMMON_LOCALES = _mod_pa9reb.DECODO_COMMON_LOCALES;
31
+ exports.DECODO_COMMON_COUNTRIES = _mod_pa9reb.DECODO_COMMON_COUNTRIES;
32
+ exports.DECODO_EUROPEAN_COUNTRIES = _mod_pa9reb.DECODO_EUROPEAN_COUNTRIES;
33
+ exports.DECODO_ASIAN_COUNTRIES = _mod_pa9reb.DECODO_ASIAN_COUNTRIES;
34
+ exports.DECODO_US_STATES = _mod_pa9reb.DECODO_US_STATES;
35
+ exports.DECODO_COMMON_CITIES = _mod_pa9reb.DECODO_COMMON_CITIES;
36
+ exports.getRandomDecodoDeviceType = _mod_pa9reb.getRandomDeviceType;
37
+ exports.getRandomDecodoLocale = _mod_pa9reb.getRandomLocale;
38
+ exports.getRandomDecodoCountry = _mod_pa9reb.getRandomCountry;
39
+ exports.getRandomDecodoCity = _mod_pa9reb.getRandomCity;
40
+ exports.generateDecodoSessionId = _mod_pa9reb.generateSessionId;;
@@ -1,7 +1,9 @@
1
1
  export { Crawler } from './crawler.js';
2
2
  export { CrawlerOptions } from './crawler-options.js';
3
- export { FileCacher } from '../cache/file-cacher.js';
4
- export { UrlStore } from '../cache/url-store.js';
3
+ export { RobotsTxt } from './plugin/robots-txt.js';
4
+ export { FileCacher } from './plugin/file-cacher.js';
5
+ export { UrlStore } from './plugin/url-store.js';
6
+ export { NavigationHistory } from './plugin/navigation-history.js';
5
7
  export { Oxylabs } from './addon/oxylabs/index.js';
6
8
  export {
7
9
  OXYLABS_BROWSER_TYPES,
@@ -0,0 +1,19 @@
1
+ var x=Object.create;var{getPrototypeOf:A,defineProperty:d,getOwnPropertyNames:b}=Object;var m=Object.prototype.hasOwnProperty;var p=(e,r,c)=>{c=e!=null?x(A(e)):{};let t=r||!e||!e.__esModule?d(c,"default",{value:e,enumerable:!0}):c;for(let s of b(e))if(!m.call(t,s))d(t,s,{get:()=>e[s],enumerable:!0});return t};var y=require("node:fs"),R=require("node:path"),{createHash:O}=require("node:crypto"),f=require("node:zlib"),D=typeof globalThis.Bun<"u",E=typeof f.zstdCompressSync==="function";async function w(e){if(D){let{Database:t}=await import("bun:sqlite"),s=new t(e);return{run:(a,...n)=>s.run(a,...n),get:(a,...n)=>s.query(a).get(...n),all:(a,...n)=>s.query(a).all(...n),exec:(a)=>s.exec(a),close:()=>s.close()}}let{DatabaseSync:r}=await import("node:sqlite"),c=new r(e);return{run:(t,...s)=>{if(s.length===0)c.exec(t);else c.prepare(t).run(...s)},get:(t,...s)=>{return c.prepare(t).get(...s)},all:(t,...s)=>{return c.prepare(t).all(...s)},exec:(t)=>c.exec(t),close:()=>c.close()}}function T(e){if(E)return f.zstdCompressSync(e);return e}function g(e){if(E)return f.zstdDecompressSync(e);return e}class l{databases=new Map;options;cacheDir;closed=!1;constructor(e={}){if(this.options={cacheDir:e.cacheDir||"/tmp/rezo-crawler/cache",ttl:e.ttl||604800000,compression:e.compression??!1,encryptNamespace:e.encryptNamespace??!1,maxEntries:e.maxEntries??0},this.cacheDir=R.resolve(this.options.cacheDir),!y.existsSync(this.cacheDir))y.mkdirSync(this.cacheDir,{recursive:!0})}static async create(e={}){return new l(e)}async getDatabase(e){let r=this.options.encryptNamespace?O("md5").update(e).digest("hex"):e.replace(/[^a-zA-Z0-9_-]/g,"_");if(this.databases.has(r))return this.databases.get(r);let c=R.join(this.cacheDir,`${r}.db`),t=await w(c);return t.exec("PRAGMA journal_mode = WAL"),t.exec("PRAGMA synchronous = NORMAL"),t.exec("PRAGMA cache_size = -64000"),t.exec("PRAGMA temp_store = MEMORY"),t.exec("PRAGMA mmap_size = 268435456"),t.exec(`
2
+ CREATE TABLE IF NOT EXISTS cache (
3
+ key TEXT PRIMARY KEY,
4
+ value BLOB NOT NULL,
5
+ expiresAt INTEGER NOT NULL,
6
+ createdAt INTEGER NOT NULL,
7
+ compressed INTEGER DEFAULT 0
8
+ ) WITHOUT ROWID
9
+ `),t.exec("CREATE INDEX IF NOT EXISTS idx_expires ON cache(expiresAt)"),this.databases.set(r,t),t}async set(e,r,c,t="default"){if(this.closed)throw Error("FileCacher is closed");let s=await this.getDatabase(t),a=Date.now(),n=a+(c??this.options.ttl),o=Buffer.from(JSON.stringify(r),"utf-8"),i=0;if(this.options.compression&&E)try{o=T(o),i=1}catch{}if(s.run(`
10
+ INSERT OR REPLACE INTO cache (key, value, expiresAt, createdAt, compressed)
11
+ VALUES (?, ?, ?, ?, ?)
12
+ `,e,Buffer.from(o).toString("base64"),n,a,i),this.options.maxEntries>0){let u=s.get("SELECT COUNT(*) as cnt FROM cache");if(u&&u.cnt>this.options.maxEntries){let h=u.cnt-this.options.maxEntries;s.run(`
13
+ DELETE FROM cache WHERE key IN (
14
+ SELECT key FROM cache ORDER BY createdAt ASC LIMIT ?
15
+ )
16
+ `,h)}}}async setMany(e,r="default"){if(this.closed)throw Error("FileCacher is closed");if(e.length===0)return;let c=await this.getDatabase(r),t=Date.now(),s=t+this.options.ttl;c.exec("BEGIN TRANSACTION");try{for(let a of e){let n=a.ttl?t+a.ttl:s,o=Buffer.from(JSON.stringify(a.value),"utf-8"),i=0;if(this.options.compression&&E)try{o=T(o),i=1}catch{}c.run(`
17
+ INSERT OR REPLACE INTO cache (key, value, expiresAt, createdAt, compressed)
18
+ VALUES (?, ?, ?, ?, ?)
19
+ `,a.key,Buffer.from(o).toString("base64"),n,t,i)}c.exec("COMMIT")}catch(a){throw c.exec("ROLLBACK"),a}}async get(e,r="default"){if(this.closed)throw Error("FileCacher is closed");let c=await this.getDatabase(r),t=c.get("SELECT value, expiresAt, compressed FROM cache WHERE key = ?",e);if(!t)return null;if(t.expiresAt<Date.now())return c.run("DELETE FROM cache WHERE key = ?",e),null;let s=Buffer.from(t.value,"base64");if(t.compressed)try{s=g(s)}catch{return null}try{return JSON.parse(Buffer.from(s).toString("utf-8"))}catch{return null}}async has(e,r="default"){if(this.closed)return!1;let t=(await this.getDatabase(r)).get("SELECT expiresAt FROM cache WHERE key = ?",e);if(!t)return!1;return t.expiresAt>=Date.now()}async hasMany(e,r="default"){if(this.closed)return new Set;if(e.length===0)return new Set;let c=await this.getDatabase(r),t=Date.now(),s=new Set,a=500;for(let n=0;n<e.length;n+=a){let o=e.slice(n,n+a),i=o.map(()=>"?").join(","),u=c.all(`SELECT key, expiresAt FROM cache WHERE key IN (${i})`,...o);for(let h of u)if(h.expiresAt>=t)s.add(h.key)}return s}async delete(e,r="default"){if(this.closed)return!1;return(await this.getDatabase(r)).run("DELETE FROM cache WHERE key = ?",e),!0}async clear(e="default"){if(this.closed)return;(await this.getDatabase(e)).exec("DELETE FROM cache")}async cleanup(e="default"){if(this.closed)return 0;let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache");r.run("DELETE FROM cache WHERE expiresAt < ?",c);let s=r.get("SELECT COUNT(*) as cnt FROM cache");return(t?.cnt||0)-(s?.cnt||0)}async stats(e="default"){if(this.closed)return{count:0,expired:0};let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache"),s=r.get("SELECT COUNT(*) as cnt FROM cache WHERE expiresAt < ?",c);return{count:t?.cnt||0,expired:s?.cnt||0}}async close(){if(this.closed)return;this.closed=!0;for(let e of this.databases.values())try{e.close()}catch{}this.databases.clear()}get isClosed(){return 
this.closed}get directory(){return this.cacheDir}}exports.FileCacher=l;exports.default=l;module.exports=Object.assign(l,exports);