rezo 1.0.43 → 1.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +115 -0
- package/dist/adapters/entries/fetch.d.ts +115 -0
- package/dist/adapters/entries/http.d.ts +115 -0
- package/dist/adapters/entries/http2.d.ts +115 -0
- package/dist/adapters/entries/react-native.d.ts +115 -0
- package/dist/adapters/entries/xhr.d.ts +115 -0
- package/dist/adapters/fetch.cjs +18 -0
- package/dist/adapters/fetch.js +18 -0
- package/dist/adapters/http.cjs +18 -0
- package/dist/adapters/http.js +18 -0
- package/dist/adapters/http2.cjs +18 -0
- package/dist/adapters/http2.js +18 -0
- package/dist/adapters/index.cjs +6 -6
- package/dist/adapters/xhr.cjs +19 -0
- package/dist/adapters/xhr.js +19 -0
- package/dist/cache/index.cjs +9 -15
- package/dist/cache/index.js +0 -3
- package/dist/core/hooks.cjs +4 -2
- package/dist/core/hooks.js +4 -2
- package/dist/crawler/addon/decodo/index.cjs +1 -0
- package/dist/crawler/addon/decodo/index.js +1 -0
- package/dist/crawler/crawler-options.cjs +1 -0
- package/dist/crawler/crawler-options.js +1 -0
- package/dist/{plugin → crawler}/crawler.cjs +392 -32
- package/dist/{plugin → crawler}/crawler.js +392 -32
- package/dist/crawler/index.cjs +40 -0
- package/dist/{plugin → crawler}/index.js +4 -2
- package/dist/crawler/plugin/file-cacher.cjs +19 -0
- package/dist/crawler/plugin/file-cacher.js +19 -0
- package/dist/crawler/plugin/index.cjs +1 -0
- package/dist/crawler/plugin/index.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +43 -0
- package/dist/crawler/plugin/navigation-history.js +43 -0
- package/dist/crawler/plugin/robots-txt.cjs +2 -0
- package/dist/crawler/plugin/robots-txt.js +2 -0
- package/dist/crawler/plugin/url-store.cjs +18 -0
- package/dist/crawler/plugin/url-store.js +18 -0
- package/dist/crawler.d.ts +430 -172
- package/dist/entries/crawler.cjs +5 -5
- package/dist/entries/crawler.js +2 -2
- package/dist/index.cjs +27 -27
- package/dist/index.d.ts +115 -0
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +115 -0
- package/dist/platform/bun.d.ts +115 -0
- package/dist/platform/deno.d.ts +115 -0
- package/dist/platform/node.d.ts +115 -0
- package/dist/platform/react-native.d.ts +115 -0
- package/dist/platform/worker.d.ts +115 -0
- package/dist/proxy/index.cjs +5 -5
- package/dist/proxy/index.js +1 -1
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/rate-limit-wait.cjs +217 -0
- package/dist/utils/rate-limit-wait.js +208 -0
- package/package.json +2 -6
- package/dist/cache/file-cacher.cjs +0 -270
- package/dist/cache/file-cacher.js +0 -267
- package/dist/cache/navigation-history.cjs +0 -298
- package/dist/cache/navigation-history.js +0 -296
- package/dist/cache/url-store.cjs +0 -294
- package/dist/cache/url-store.js +0 -291
- package/dist/plugin/addon/decodo/index.cjs +0 -1
- package/dist/plugin/addon/decodo/index.js +0 -1
- package/dist/plugin/crawler-options.cjs +0 -1
- package/dist/plugin/crawler-options.js +0 -1
- package/dist/plugin/index.cjs +0 -36
- /package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
- /package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
- /package/dist/{plugin → crawler}/scraper.cjs +0 -0
- /package/dist/{plugin → crawler}/scraper.js +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
const fs = require("node:fs");
|
|
2
|
-
const { FileCacher } = require('
|
|
3
|
-
const { UrlStore } = require('
|
|
4
|
-
const { NavigationHistory } = require('
|
|
2
|
+
const { FileCacher } = require('./plugin/file-cacher.cjs');
|
|
3
|
+
const { UrlStore } = require('./plugin/url-store.cjs');
|
|
4
|
+
const { NavigationHistory } = require('./plugin/navigation-history.cjs');
|
|
5
|
+
const { RobotsTxt } = require('./plugin/robots-txt.cjs');
|
|
5
6
|
const { parseHTML } = require("linkedom");
|
|
6
7
|
const path = require("node:path");
|
|
7
8
|
const { Rezo } = require('../core/rezo.cjs');
|
|
@@ -54,6 +55,23 @@ class Crawler {
|
|
|
54
55
|
navigationHistoryInitPromise = null;
|
|
55
56
|
adapterExecutor = null;
|
|
56
57
|
adapterType;
|
|
58
|
+
pendingExecutions = new Set;
|
|
59
|
+
robotsTxt;
|
|
60
|
+
domainResponseTimes = new Map;
|
|
61
|
+
domainCurrentDelay = new Map;
|
|
62
|
+
crawlStats = {
|
|
63
|
+
urlsVisited: 0,
|
|
64
|
+
urlsQueued: 0,
|
|
65
|
+
urlsFailed: 0,
|
|
66
|
+
startTime: 0,
|
|
67
|
+
currentDepth: 0
|
|
68
|
+
};
|
|
69
|
+
urlDepthMap = new Map;
|
|
70
|
+
startHandlers = [];
|
|
71
|
+
finishHandlers = [];
|
|
72
|
+
redirectHandlers = [];
|
|
73
|
+
collectedData = [];
|
|
74
|
+
crawlStarted = false;
|
|
57
75
|
constructor(crawlerOptions, http = new Rezo) {
|
|
58
76
|
this.http = http;
|
|
59
77
|
this.queue = new RezoQueue({
|
|
@@ -71,7 +89,6 @@ class Crawler {
|
|
|
71
89
|
fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
|
|
72
90
|
FileCacher.create({
|
|
73
91
|
cacheDir: dbUrl,
|
|
74
|
-
softDelete: false,
|
|
75
92
|
ttl: cacheTTL,
|
|
76
93
|
encryptNamespace: true
|
|
77
94
|
}).then((storage) => {
|
|
@@ -110,6 +127,14 @@ class Crawler {
|
|
|
110
127
|
}
|
|
111
128
|
this.initializeAdapter();
|
|
112
129
|
this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
|
|
130
|
+
this.robotsTxt = new RobotsTxt({
|
|
131
|
+
userAgent: this.config.userAgent || "RezoBot",
|
|
132
|
+
cacheTTL: 24 * 60 * 60 * 1000
|
|
133
|
+
});
|
|
134
|
+
this.crawlStats.startTime = Date.now();
|
|
135
|
+
if (this.config.baseUrl) {
|
|
136
|
+
this.urlDepthMap.set(this.config.baseUrl, 0);
|
|
137
|
+
}
|
|
113
138
|
}
|
|
114
139
|
async initializeAdapter() {
|
|
115
140
|
try {
|
|
@@ -148,6 +173,8 @@ class Crawler {
|
|
|
148
173
|
async waitForNavigationHistory() {
|
|
149
174
|
if (!this.config.enableNavigationHistory)
|
|
150
175
|
return;
|
|
176
|
+
if (this.isNavigationHistoryReady && this.isSessionReady)
|
|
177
|
+
return;
|
|
151
178
|
if (this.navigationHistoryInitPromise) {
|
|
152
179
|
await this.navigationHistoryInitPromise;
|
|
153
180
|
}
|
|
@@ -361,6 +388,18 @@ class Crawler {
|
|
|
361
388
|
this.emailLeadsEvents.push(handler);
|
|
362
389
|
return this;
|
|
363
390
|
}
|
|
391
|
+
onStart(handler) {
|
|
392
|
+
this.startHandlers.push(handler);
|
|
393
|
+
return this;
|
|
394
|
+
}
|
|
395
|
+
onFinish(handler) {
|
|
396
|
+
this.finishHandlers.push(handler);
|
|
397
|
+
return this;
|
|
398
|
+
}
|
|
399
|
+
onRedirect(handler) {
|
|
400
|
+
this.redirectHandlers.push(handler);
|
|
401
|
+
return this;
|
|
402
|
+
}
|
|
364
403
|
onRawData(handler) {
|
|
365
404
|
this.rawResponseEvents.push({
|
|
366
405
|
handler: "_onRawResponse",
|
|
@@ -435,20 +474,25 @@ class Crawler {
|
|
|
435
474
|
this.queue.add(() => handler(document.body));
|
|
436
475
|
}
|
|
437
476
|
_onAttribute(selection, attribute, handler, document) {
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
const elements = document.querySelectorAll(
|
|
477
|
+
const isSimpleForm = typeof attribute === "function";
|
|
478
|
+
const actualAttribute = isSimpleForm ? selection : attribute;
|
|
479
|
+
const actualHandler = isSimpleForm ? attribute : handler;
|
|
480
|
+
const actualSelection = isSimpleForm ? `[${selection}]` : selection || `[${attribute}]`;
|
|
481
|
+
const elements = document.querySelectorAll(actualSelection);
|
|
443
482
|
for (let i = 0;i < elements.length; i++) {
|
|
444
|
-
|
|
445
|
-
|
|
483
|
+
const el = elements[i];
|
|
484
|
+
if (el.hasAttribute(actualAttribute)) {
|
|
485
|
+
const value = el.getAttribute(actualAttribute);
|
|
486
|
+
this.queue.add(() => actualHandler.call(el, value, actualAttribute));
|
|
487
|
+
}
|
|
446
488
|
}
|
|
447
489
|
}
|
|
448
490
|
_onText(selection, handler, document) {
|
|
449
491
|
const elements = document.querySelectorAll(selection);
|
|
450
492
|
for (let i = 0;i < elements.length; i++) {
|
|
451
|
-
|
|
493
|
+
const el = elements[i];
|
|
494
|
+
const text = el.textContent;
|
|
495
|
+
this.queue.add(() => handler.call(el, text));
|
|
452
496
|
}
|
|
453
497
|
}
|
|
454
498
|
_onSelection(selection, handler, document) {
|
|
@@ -466,8 +510,11 @@ class Crawler {
|
|
|
466
510
|
_onHref(handler, document) {
|
|
467
511
|
const elements = document.querySelectorAll("a, link");
|
|
468
512
|
for (let i = 0;i < elements.length; i++) {
|
|
469
|
-
|
|
470
|
-
|
|
513
|
+
const el = elements[i];
|
|
514
|
+
if (el.hasAttribute("href")) {
|
|
515
|
+
const href = new URL(el.getAttribute("href"), document.URL).href;
|
|
516
|
+
this.queue.add(() => handler.call(el, href));
|
|
517
|
+
}
|
|
471
518
|
}
|
|
472
519
|
}
|
|
473
520
|
_onAnchor(selection, handler, document) {
|
|
@@ -501,6 +548,233 @@ class Crawler {
|
|
|
501
548
|
_onResponse(handler, response) {
|
|
502
549
|
this.queue.add(() => handler(response));
|
|
503
550
|
}
|
|
551
|
+
calculateAutoThrottleDelay(domain, responseTime) {
|
|
552
|
+
if (!this.config.autoThrottle)
|
|
553
|
+
return 0;
|
|
554
|
+
let times = this.domainResponseTimes.get(domain) || [];
|
|
555
|
+
times.push(responseTime);
|
|
556
|
+
if (times.length > 10) {
|
|
557
|
+
times = times.slice(-10);
|
|
558
|
+
}
|
|
559
|
+
this.domainResponseTimes.set(domain, times);
|
|
560
|
+
const avgResponseTime = times.reduce((a, b) => a + b, 0) / times.length;
|
|
561
|
+
const targetDelay = this.config.autoThrottleTargetDelay;
|
|
562
|
+
const loadFactor = avgResponseTime / 200;
|
|
563
|
+
let newDelay = Math.round(targetDelay * loadFactor);
|
|
564
|
+
newDelay = Math.max(this.config.autoThrottleMinDelay, newDelay);
|
|
565
|
+
newDelay = Math.min(this.config.autoThrottleMaxDelay, newDelay);
|
|
566
|
+
this.domainCurrentDelay.set(domain, newDelay);
|
|
567
|
+
if (this.config.debug) {
|
|
568
|
+
console.log(`[AutoThrottle] ${domain}: avgRT=${avgResponseTime.toFixed(0)}ms, delay=${newDelay}ms`);
|
|
569
|
+
}
|
|
570
|
+
return newDelay;
|
|
571
|
+
}
|
|
572
|
+
getAutoThrottleDelay(domain) {
|
|
573
|
+
if (!this.config.autoThrottle)
|
|
574
|
+
return 0;
|
|
575
|
+
return this.domainCurrentDelay.get(domain) || this.config.autoThrottleMinDelay;
|
|
576
|
+
}
|
|
577
|
+
async handle429Response(url, response) {
|
|
578
|
+
let retryAfter = 0;
|
|
579
|
+
const retryAfterHeader = response?.headers?.["retry-after"] || response?.headers?.get?.("retry-after");
|
|
580
|
+
if (retryAfterHeader) {
|
|
581
|
+
const parsed = parseInt(retryAfterHeader, 10);
|
|
582
|
+
if (!isNaN(parsed)) {
|
|
583
|
+
retryAfter = parsed * 1000;
|
|
584
|
+
} else {
|
|
585
|
+
const date = new Date(retryAfterHeader);
|
|
586
|
+
if (!isNaN(date.getTime())) {
|
|
587
|
+
retryAfter = date.getTime() - Date.now();
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
if (retryAfter <= 0) {
|
|
592
|
+
retryAfter = 60000;
|
|
593
|
+
}
|
|
594
|
+
const maxWait = this.config.maxWaitOn429;
|
|
595
|
+
const alwaysWait = this.config.alwaysWaitOn429;
|
|
596
|
+
if (retryAfter > maxWait && !alwaysWait) {
|
|
597
|
+
const waitMinutes = Math.round(retryAfter / 60000);
|
|
598
|
+
const error = new Error(`Rate limited: Server requested wait time of ${waitMinutes} minutes, which exceeds maxWaitOn429 (${Math.round(maxWait / 60000)} minutes). Set alwaysWaitOn429: true to wait regardless.`);
|
|
599
|
+
error.code = "REZ_RATE_LIMIT_EXCEEDED";
|
|
600
|
+
error.url = url;
|
|
601
|
+
error.status = 429;
|
|
602
|
+
throw error;
|
|
603
|
+
}
|
|
604
|
+
if (retryAfter > maxWait && alwaysWait) {
|
|
605
|
+
const waitMinutes = Math.round(retryAfter / 60000);
|
|
606
|
+
console.warn(`[Crawler] WARNING: Rate limited on ${url}. Server requested ${waitMinutes} minute wait. Waiting because alwaysWaitOn429 is enabled.`);
|
|
607
|
+
}
|
|
608
|
+
if (this.config.debug) {
|
|
609
|
+
console.log(`[Crawler] 429 Rate Limited: waiting ${Math.round(retryAfter / 1000)}s before retry`);
|
|
610
|
+
}
|
|
611
|
+
return { shouldRetry: true, waitTime: retryAfter };
|
|
612
|
+
}
|
|
613
|
+
async checkCrawlLimits(url, parentUrl) {
|
|
614
|
+
if (this.config.maxUrls > 0 && this.crawlStats.urlsVisited >= this.config.maxUrls) {
|
|
615
|
+
return { allowed: false, reason: `maxUrls limit reached (${this.config.maxUrls})` };
|
|
616
|
+
}
|
|
617
|
+
if (this.config.maxDepth > 0) {
|
|
618
|
+
const parentDepth = parentUrl ? this.urlDepthMap.get(parentUrl) ?? 0 : 0;
|
|
619
|
+
const urlDepth = this.urlDepthMap.get(url) ?? parentDepth + 1;
|
|
620
|
+
if (urlDepth > this.config.maxDepth) {
|
|
621
|
+
return { allowed: false, reason: `maxDepth limit reached (depth ${urlDepth} > ${this.config.maxDepth})` };
|
|
622
|
+
}
|
|
623
|
+
if (!this.urlDepthMap.has(url)) {
|
|
624
|
+
this.urlDepthMap.set(url, urlDepth);
|
|
625
|
+
this.crawlStats.currentDepth = Math.max(this.crawlStats.currentDepth, urlDepth);
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
if (this.config.respectRobotsTxt) {
|
|
629
|
+
try {
|
|
630
|
+
if (!this.robotsTxt.isCached(url)) {
|
|
631
|
+
await this.robotsTxt.fetch(url, async (robotsUrl) => {
|
|
632
|
+
const response = await this.http.get(robotsUrl, { timeout: 1e4 });
|
|
633
|
+
return { status: response.status, data: response.data };
|
|
634
|
+
});
|
|
635
|
+
}
|
|
636
|
+
const allowed = this.robotsTxt.isAllowed(url);
|
|
637
|
+
if (!allowed) {
|
|
638
|
+
return { allowed: false, reason: "Blocked by robots.txt" };
|
|
639
|
+
}
|
|
640
|
+
} catch (error) {
|
|
641
|
+
if (this.config.debug) {
|
|
642
|
+
console.warn(`[Crawler] Failed to check robots.txt for ${url}:`, error);
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
return { allowed: true };
|
|
647
|
+
}
|
|
648
|
+
shouldFollowLink(element) {
|
|
649
|
+
if (this.config.followNofollow) {
|
|
650
|
+
return true;
|
|
651
|
+
}
|
|
652
|
+
const rel = element.getAttribute("rel");
|
|
653
|
+
if (rel && rel.toLowerCase().includes("nofollow")) {
|
|
654
|
+
return false;
|
|
655
|
+
}
|
|
656
|
+
return true;
|
|
657
|
+
}
|
|
658
|
+
checkResponseSize(contentLength) {
|
|
659
|
+
if (this.config.maxResponseSize > 0 && contentLength > this.config.maxResponseSize) {
|
|
660
|
+
return {
|
|
661
|
+
allowed: false,
|
|
662
|
+
reason: `Response size (${contentLength} bytes) exceeds maxResponseSize (${this.config.maxResponseSize} bytes)`
|
|
663
|
+
};
|
|
664
|
+
}
|
|
665
|
+
return { allowed: true };
|
|
666
|
+
}
|
|
667
|
+
collect(data) {
|
|
668
|
+
this.collectedData.push(data);
|
|
669
|
+
return this;
|
|
670
|
+
}
|
|
671
|
+
getCollectedData() {
|
|
672
|
+
return [...this.collectedData];
|
|
673
|
+
}
|
|
674
|
+
clearCollectedData() {
|
|
675
|
+
this.collectedData = [];
|
|
676
|
+
return this;
|
|
677
|
+
}
|
|
678
|
+
async exportData(filePath, format = "json") {
|
|
679
|
+
const data = this.collectedData;
|
|
680
|
+
if (data.length === 0) {
|
|
681
|
+
if (this.config.debug) {
|
|
682
|
+
console.warn("[Crawler] No data to export");
|
|
683
|
+
}
|
|
684
|
+
return;
|
|
685
|
+
}
|
|
686
|
+
let content;
|
|
687
|
+
switch (format) {
|
|
688
|
+
case "json":
|
|
689
|
+
content = JSON.stringify(data, null, 2);
|
|
690
|
+
break;
|
|
691
|
+
case "jsonl":
|
|
692
|
+
content = data.map((item) => JSON.stringify(item)).join(`
|
|
693
|
+
`);
|
|
694
|
+
break;
|
|
695
|
+
case "csv":
|
|
696
|
+
const keys = new Set;
|
|
697
|
+
data.forEach((item) => {
|
|
698
|
+
if (typeof item === "object" && item !== null) {
|
|
699
|
+
Object.keys(item).forEach((key) => keys.add(key));
|
|
700
|
+
}
|
|
701
|
+
});
|
|
702
|
+
const headers = Array.from(keys);
|
|
703
|
+
const escapeCSV = (val) => {
|
|
704
|
+
if (val === null || val === undefined)
|
|
705
|
+
return "";
|
|
706
|
+
const str = String(val);
|
|
707
|
+
if (str.includes(",") || str.includes('"') || str.includes(`
|
|
708
|
+
`)) {
|
|
709
|
+
return `"${str.replace(/"/g, '""')}"`;
|
|
710
|
+
}
|
|
711
|
+
return str;
|
|
712
|
+
};
|
|
713
|
+
const rows = [
|
|
714
|
+
headers.join(","),
|
|
715
|
+
...data.map((item) => {
|
|
716
|
+
if (typeof item !== "object" || item === null) {
|
|
717
|
+
return escapeCSV(item);
|
|
718
|
+
}
|
|
719
|
+
return headers.map((key) => escapeCSV(item[key])).join(",");
|
|
720
|
+
})
|
|
721
|
+
];
|
|
722
|
+
content = rows.join(`
|
|
723
|
+
`);
|
|
724
|
+
break;
|
|
725
|
+
default:
|
|
726
|
+
throw new Error(`Unsupported export format: ${format}`);
|
|
727
|
+
}
|
|
728
|
+
const dir = path.dirname(filePath);
|
|
729
|
+
if (!fs.existsSync(dir)) {
|
|
730
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
731
|
+
}
|
|
732
|
+
fs.writeFileSync(filePath, content, "utf-8");
|
|
733
|
+
if (this.config.debug) {
|
|
734
|
+
console.log(`[Crawler] Exported ${data.length} items to ${filePath} (${format})`);
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
getStats() {
|
|
738
|
+
return { ...this.crawlStats };
|
|
739
|
+
}
|
|
740
|
+
async triggerStartHandlers() {
|
|
741
|
+
if (this.crawlStarted)
|
|
742
|
+
return;
|
|
743
|
+
this.crawlStarted = true;
|
|
744
|
+
this.crawlStats.startTime = Date.now();
|
|
745
|
+
for (const handler of this.startHandlers) {
|
|
746
|
+
try {
|
|
747
|
+
await handler();
|
|
748
|
+
} catch (error) {
|
|
749
|
+
if (this.config.debug) {
|
|
750
|
+
console.error("[Crawler] onStart handler error:", error);
|
|
751
|
+
}
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
async triggerFinishHandlers() {
|
|
756
|
+
this.crawlStats.endTime = Date.now();
|
|
757
|
+
for (const handler of this.finishHandlers) {
|
|
758
|
+
try {
|
|
759
|
+
await handler(this.crawlStats);
|
|
760
|
+
} catch (error) {
|
|
761
|
+
if (this.config.debug) {
|
|
762
|
+
console.error("[Crawler] onFinish handler error:", error);
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
async triggerRedirectHandlers(event) {
|
|
768
|
+
for (const handler of this.redirectHandlers) {
|
|
769
|
+
try {
|
|
770
|
+
await handler(event);
|
|
771
|
+
} catch (error) {
|
|
772
|
+
if (this.config.debug) {
|
|
773
|
+
console.error("[Crawler] onRedirect handler error:", error);
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
}
|
|
504
778
|
buildUrl(url, params) {
|
|
505
779
|
if (params) {
|
|
506
780
|
const u = new URL(url, this.config.baseUrl);
|
|
@@ -570,16 +844,35 @@ class Crawler {
|
|
|
570
844
|
this.addToNavigationQueue(url, method, body, headersObj);
|
|
571
845
|
}
|
|
572
846
|
if (deepEmailFinder) {
|
|
573
|
-
this.execute2(method, url, body, _options, forceRevisit)
|
|
847
|
+
const p = this.execute2(method, url, body, _options, forceRevisit);
|
|
848
|
+
this.pendingExecutions.add(p);
|
|
849
|
+
p.finally(() => this.pendingExecutions.delete(p));
|
|
574
850
|
return this;
|
|
575
851
|
}
|
|
576
|
-
this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions)
|
|
852
|
+
const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
|
|
853
|
+
this.pendingExecutions.add(p);
|
|
854
|
+
p.finally(() => this.pendingExecutions.delete(p));
|
|
577
855
|
return this;
|
|
578
856
|
}
|
|
579
857
|
async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
|
|
580
|
-
|
|
858
|
+
await this.waitForStorage();
|
|
859
|
+
if (this.isCacheEnabled) {
|
|
860
|
+
await this.waitForCache();
|
|
861
|
+
}
|
|
862
|
+
if (this.config.enableNavigationHistory) {
|
|
863
|
+
await this.waitForNavigationHistory();
|
|
864
|
+
}
|
|
865
|
+
const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
|
|
866
|
+
task.finally(() => this.pendingExecutions.delete(task));
|
|
581
867
|
}
|
|
582
868
|
async execute2(method, url, body, options = {}, forceRevisit) {
|
|
869
|
+
await this.waitForStorage();
|
|
870
|
+
if (this.isCacheEnabled) {
|
|
871
|
+
await this.waitForCache();
|
|
872
|
+
}
|
|
873
|
+
if (this.config.enableNavigationHistory) {
|
|
874
|
+
await this.waitForNavigationHistory();
|
|
875
|
+
}
|
|
583
876
|
this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
|
|
584
877
|
httpConfig: options,
|
|
585
878
|
saveCache: this.saveCache.bind(this),
|
|
@@ -593,21 +886,34 @@ class Crawler {
|
|
|
593
886
|
allowCrossDomainTravel: true
|
|
594
887
|
}, forceRevisit, true)).then();
|
|
595
888
|
}
|
|
596
|
-
async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0) {
|
|
889
|
+
async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl) {
|
|
597
890
|
try {
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
891
|
+
await this.triggerStartHandlers();
|
|
892
|
+
const limitCheck = await this.checkCrawlLimits(url, parentUrl);
|
|
893
|
+
if (!limitCheck.allowed) {
|
|
894
|
+
if (this.config.debug) {
|
|
895
|
+
console.log(`[Crawler] Skipping ${url}: ${limitCheck.reason}`);
|
|
896
|
+
}
|
|
897
|
+
return;
|
|
898
|
+
}
|
|
899
|
+
this.crawlStats.urlsQueued++;
|
|
900
|
+
const domain = new URL(url).hostname;
|
|
901
|
+
const delay = this.getAutoThrottleDelay(domain);
|
|
902
|
+
if (delay > 0) {
|
|
903
|
+
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
904
|
+
}
|
|
604
905
|
const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
|
|
605
906
|
const cache = await this.getCache(url);
|
|
606
907
|
if (isVisited && !cache)
|
|
607
908
|
return;
|
|
608
909
|
if (isVisited && method !== "GET")
|
|
609
910
|
return;
|
|
911
|
+
const requestStartTime = Date.now();
|
|
610
912
|
const response = cache && method === "GET" ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
|
|
913
|
+
if (!cache) {
|
|
914
|
+
const responseTime = Date.now() - requestStartTime;
|
|
915
|
+
this.calculateAutoThrottleDelay(domain, responseTime);
|
|
916
|
+
}
|
|
611
917
|
const res = {
|
|
612
918
|
data: response.data || response.content || "",
|
|
613
919
|
contentType: response.contentType || "",
|
|
@@ -619,11 +925,29 @@ class Crawler {
|
|
|
619
925
|
cookies: response?.cookies?.serialized || response?.cookies,
|
|
620
926
|
contentLength: response.contentLength || 0
|
|
621
927
|
};
|
|
928
|
+
if (res.contentLength && res.contentLength > 0) {
|
|
929
|
+
const sizeCheck = this.checkResponseSize(res.contentLength);
|
|
930
|
+
if (!sizeCheck.allowed) {
|
|
931
|
+
if (this.config.debug) {
|
|
932
|
+
console.log(`[Crawler] Skipping ${url}: ${sizeCheck.reason}`);
|
|
933
|
+
}
|
|
934
|
+
return;
|
|
935
|
+
}
|
|
936
|
+
}
|
|
937
|
+
this.crawlStats.urlsVisited++;
|
|
938
|
+
if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
|
|
939
|
+
await this.triggerRedirectHandlers({
|
|
940
|
+
originalUrl: url,
|
|
941
|
+
finalUrl: res.finalUrl,
|
|
942
|
+
redirectCount: response.redirectCount || 1,
|
|
943
|
+
statusCode: res.status
|
|
944
|
+
});
|
|
945
|
+
}
|
|
622
946
|
if (!cache)
|
|
623
947
|
await this.saveCache(url, res);
|
|
624
948
|
if (!isVisited)
|
|
625
949
|
await this.saveUrl(url);
|
|
626
|
-
this.markUrlVisited(url, {
|
|
950
|
+
await this.markUrlVisited(url, {
|
|
627
951
|
status: res.status,
|
|
628
952
|
finalUrl: res.finalUrl,
|
|
629
953
|
contentType: res.contentType
|
|
@@ -655,6 +979,24 @@ class Crawler {
|
|
|
655
979
|
}
|
|
656
980
|
} catch (e) {
|
|
657
981
|
const error = e;
|
|
982
|
+
if (error?.response?.status === 429 || error?.status === 429) {
|
|
983
|
+
try {
|
|
984
|
+
const { shouldRetry, waitTime } = await this.handle429Response(url, error.response || error);
|
|
985
|
+
if (shouldRetry) {
|
|
986
|
+
await this.sleep(waitTime);
|
|
987
|
+
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
|
|
988
|
+
}
|
|
989
|
+
} catch (rateLimitError) {
|
|
990
|
+
this.crawlStats.urlsFailed++;
|
|
991
|
+
if (this.config.throwFatalError)
|
|
992
|
+
throw rateLimitError;
|
|
993
|
+
for (let i = 0;i < this.errorEvents.length; i++) {
|
|
994
|
+
const event = this.errorEvents[i];
|
|
995
|
+
this[event.handler](...event.attr, rateLimitError);
|
|
996
|
+
}
|
|
997
|
+
return;
|
|
998
|
+
}
|
|
999
|
+
}
|
|
658
1000
|
if (error && error.response) {
|
|
659
1001
|
const status = error.response.status;
|
|
660
1002
|
const retryDelay = this.config.retryDelay || 1000;
|
|
@@ -666,16 +1008,17 @@ class Crawler {
|
|
|
666
1008
|
if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
|
|
667
1009
|
await this.sleep(retryDelay);
|
|
668
1010
|
delete options.proxy;
|
|
669
|
-
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
|
|
1011
|
+
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
|
|
670
1012
|
} else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
|
|
671
1013
|
await this.sleep(retryDelay);
|
|
672
|
-
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
|
|
1014
|
+
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
|
|
673
1015
|
} else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
|
|
674
1016
|
await this.sleep(retryDelay);
|
|
675
|
-
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
|
|
1017
|
+
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
|
|
676
1018
|
}
|
|
677
1019
|
}
|
|
678
|
-
this.
|
|
1020
|
+
this.crawlStats.urlsFailed++;
|
|
1021
|
+
await this.markUrlVisited(url, {
|
|
679
1022
|
status: error?.response?.status || 0,
|
|
680
1023
|
errorMessage: e.message || "Unknown error"
|
|
681
1024
|
});
|
|
@@ -684,7 +1027,6 @@ class Crawler {
|
|
|
684
1027
|
if (this.config.debug) {
|
|
685
1028
|
console.log(`Error visiting ${url}: ${e.message}`);
|
|
686
1029
|
}
|
|
687
|
-
console.log(error);
|
|
688
1030
|
for (let i = 0;i < this.errorEvents.length; i++) {
|
|
689
1031
|
const event = this.errorEvents[i];
|
|
690
1032
|
this[event.handler](...event.attr, e);
|
|
@@ -692,18 +1034,36 @@ class Crawler {
|
|
|
692
1034
|
}
|
|
693
1035
|
}
|
|
694
1036
|
async waitForAll() {
|
|
1037
|
+
if (this.pendingExecutions.size > 0) {
|
|
1038
|
+
await Promise.allSettled([...this.pendingExecutions]);
|
|
1039
|
+
}
|
|
695
1040
|
await this.queue.onIdle();
|
|
1041
|
+
await this.triggerFinishHandlers();
|
|
696
1042
|
}
|
|
697
1043
|
async done() {
|
|
698
1044
|
return this.waitForAll();
|
|
699
1045
|
}
|
|
700
1046
|
async close() {
|
|
701
1047
|
try {
|
|
702
|
-
await this.cacher
|
|
1048
|
+
await this.cacher?.close();
|
|
703
1049
|
} catch {}
|
|
704
1050
|
try {
|
|
705
|
-
await this.urlStorage
|
|
1051
|
+
await this.urlStorage?.close();
|
|
706
1052
|
} catch {}
|
|
1053
|
+
try {
|
|
1054
|
+
await this.navigationHistory?.close();
|
|
1055
|
+
} catch {}
|
|
1056
|
+
}
|
|
1057
|
+
async destroy() {
|
|
1058
|
+
this.queue.clear();
|
|
1059
|
+
this.events.length = 0;
|
|
1060
|
+
this.jsonEvents.length = 0;
|
|
1061
|
+
this.errorEvents.length = 0;
|
|
1062
|
+
this.responseEvents.length = 0;
|
|
1063
|
+
this.rawResponseEvents.length = 0;
|
|
1064
|
+
this.emailDiscoveredEvents.length = 0;
|
|
1065
|
+
this.emailLeadsEvents.length = 0;
|
|
1066
|
+
await this.close();
|
|
707
1067
|
}
|
|
708
1068
|
}
|
|
709
1069
|
|