rezo 1.0.41 → 1.0.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/curl.cjs +143 -32
- package/dist/adapters/curl.js +143 -32
- package/dist/adapters/entries/curl.d.ts +65 -0
- package/dist/adapters/entries/fetch.d.ts +65 -0
- package/dist/adapters/entries/http.d.ts +65 -0
- package/dist/adapters/entries/http2.d.ts +65 -0
- package/dist/adapters/entries/react-native.d.ts +65 -0
- package/dist/adapters/entries/xhr.d.ts +65 -0
- package/dist/adapters/fetch.cjs +98 -12
- package/dist/adapters/fetch.js +98 -12
- package/dist/adapters/http.cjs +26 -14
- package/dist/adapters/http.js +26 -14
- package/dist/adapters/http2.cjs +756 -227
- package/dist/adapters/http2.js +756 -227
- package/dist/adapters/index.cjs +6 -6
- package/dist/adapters/xhr.cjs +94 -2
- package/dist/adapters/xhr.js +94 -2
- package/dist/cache/dns-cache.cjs +5 -3
- package/dist/cache/dns-cache.js +5 -3
- package/dist/cache/file-cacher.cjs +7 -1
- package/dist/cache/file-cacher.js +7 -1
- package/dist/cache/index.cjs +15 -13
- package/dist/cache/index.js +1 -0
- package/dist/cache/navigation-history.cjs +298 -0
- package/dist/cache/navigation-history.js +296 -0
- package/dist/cache/url-store.cjs +7 -1
- package/dist/cache/url-store.js +7 -1
- package/dist/core/rezo.cjs +7 -0
- package/dist/core/rezo.js +7 -0
- package/dist/crawler.d.ts +196 -11
- package/dist/entries/crawler.cjs +5 -5
- package/dist/index.cjs +27 -24
- package/dist/index.d.ts +73 -0
- package/dist/index.js +1 -0
- package/dist/internal/agents/base.cjs +113 -0
- package/dist/internal/agents/base.js +110 -0
- package/dist/internal/agents/http-proxy.cjs +89 -0
- package/dist/internal/agents/http-proxy.js +86 -0
- package/dist/internal/agents/https-proxy.cjs +176 -0
- package/dist/internal/agents/https-proxy.js +173 -0
- package/dist/internal/agents/index.cjs +10 -0
- package/dist/internal/agents/index.js +5 -0
- package/dist/internal/agents/socks-client.cjs +571 -0
- package/dist/internal/agents/socks-client.js +567 -0
- package/dist/internal/agents/socks-proxy.cjs +75 -0
- package/dist/internal/agents/socks-proxy.js +72 -0
- package/dist/platform/browser.d.ts +65 -0
- package/dist/platform/bun.d.ts +65 -0
- package/dist/platform/deno.d.ts +65 -0
- package/dist/platform/node.d.ts +65 -0
- package/dist/platform/react-native.d.ts +65 -0
- package/dist/platform/worker.d.ts +65 -0
- package/dist/plugin/crawler-options.cjs +1 -1
- package/dist/plugin/crawler-options.js +1 -1
- package/dist/plugin/crawler.cjs +192 -1
- package/dist/plugin/crawler.js +192 -1
- package/dist/plugin/index.cjs +36 -36
- package/dist/proxy/index.cjs +18 -16
- package/dist/proxy/index.js +17 -12
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/buildError.cjs +11 -2
- package/dist/responses/buildError.js +11 -2
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +1 -17
- package/dist/utils/agent-pool.js +1 -17
- package/dist/utils/curl.cjs +317 -0
- package/dist/utils/curl.js +314 -0
- package/package.json +1 -1
package/dist/plugin/crawler.cjs
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
const fs = require("node:fs");
|
|
2
2
|
const { FileCacher } = require('../cache/file-cacher.cjs');
|
|
3
3
|
const { UrlStore } = require('../cache/url-store.cjs');
|
|
4
|
+
const { NavigationHistory } = require('../cache/navigation-history.cjs');
|
|
4
5
|
const { parseHTML } = require("linkedom");
|
|
5
6
|
const path = require("node:path");
|
|
7
|
+
const { Rezo } = require('../core/rezo.cjs');
|
|
6
8
|
const { RezoQueue } = require('../queue/queue.cjs');
|
|
7
9
|
const { Scraper } = require('./scraper.cjs');
|
|
8
10
|
const { CrawlerOptions } = require('./crawler-options.cjs');
|
|
11
|
+
const { loadAdapter } = require('../adapters/picker.cjs');
|
|
9
12
|
String.prototype.addBaseUrl = function(url) {
|
|
10
13
|
url = url instanceof URL ? url.href : url;
|
|
11
14
|
const html = this.replace(/<base\b[^>]*?>/gi, "");
|
|
@@ -44,12 +47,20 @@ class Crawler {
|
|
|
44
47
|
isStorageReady = false;
|
|
45
48
|
isCacheReady = false;
|
|
46
49
|
leadsFinder;
|
|
47
|
-
|
|
50
|
+
navigationHistory = null;
|
|
51
|
+
isNavigationHistoryReady = false;
|
|
52
|
+
isSessionReady = false;
|
|
53
|
+
currentSession = null;
|
|
54
|
+
navigationHistoryInitPromise = null;
|
|
55
|
+
adapterExecutor = null;
|
|
56
|
+
adapterType;
|
|
57
|
+
constructor(crawlerOptions, http = new Rezo) {
|
|
48
58
|
this.http = http;
|
|
49
59
|
this.queue = new RezoQueue({
|
|
50
60
|
concurrency: 1000
|
|
51
61
|
});
|
|
52
62
|
this.config = new CrawlerOptions(crawlerOptions);
|
|
63
|
+
this.adapterType = this.config.adapter;
|
|
53
64
|
const enableCache = this.config.enableCache;
|
|
54
65
|
this.isCacheEnabled = enableCache;
|
|
55
66
|
if (enableCache) {
|
|
@@ -91,8 +102,172 @@ class Crawler {
|
|
|
91
102
|
this.isStorageReady = true;
|
|
92
103
|
});
|
|
93
104
|
}
|
|
105
|
+
if (this.config.enableNavigationHistory) {
|
|
106
|
+
const navHistoryDir = path.resolve(this.config.cacheDir, "navigation");
|
|
107
|
+
if (!fs.existsSync(navHistoryDir))
|
|
108
|
+
fs.mkdirSync(navHistoryDir, { recursive: true });
|
|
109
|
+
this.navigationHistoryInitPromise = this.initializeNavigationHistory(navHistoryDir);
|
|
110
|
+
}
|
|
111
|
+
this.initializeAdapter();
|
|
94
112
|
this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
|
|
95
113
|
}
|
|
114
|
+
async initializeAdapter() {
|
|
115
|
+
try {
|
|
116
|
+
const adapterModule = await loadAdapter(this.adapterType);
|
|
117
|
+
this.adapterExecutor = adapterModule.executeRequest.bind(adapterModule);
|
|
118
|
+
} catch (error) {
|
|
119
|
+
if (this.config.debug) {
|
|
120
|
+
console.warn(`[Crawler] Failed to load adapter '${this.adapterType}', falling back to http instance`);
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
async initializeNavigationHistory(navHistoryDir) {
|
|
125
|
+
try {
|
|
126
|
+
const history = await NavigationHistory.create({
|
|
127
|
+
storeDir: navHistoryDir,
|
|
128
|
+
dbFileName: "navigation.db"
|
|
129
|
+
});
|
|
130
|
+
this.navigationHistory = history;
|
|
131
|
+
this.isNavigationHistoryReady = true;
|
|
132
|
+
const session = await history.getSession(this.config.sessionId);
|
|
133
|
+
if (session && (session.status === "running" || session.status === "paused")) {
|
|
134
|
+
this.currentSession = session;
|
|
135
|
+
await history.updateSessionStatus(this.config.sessionId, "running");
|
|
136
|
+
} else if (!session) {
|
|
137
|
+
this.currentSession = await history.createSession(this.config.sessionId, this.config.baseUrl, { adapter: this.adapterType });
|
|
138
|
+
}
|
|
139
|
+
this.isSessionReady = true;
|
|
140
|
+
} catch (error) {
|
|
141
|
+
if (this.config.debug) {
|
|
142
|
+
console.error(`[Crawler] Failed to initialize navigation history:`, error);
|
|
143
|
+
}
|
|
144
|
+
this.isNavigationHistoryReady = false;
|
|
145
|
+
this.isSessionReady = false;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
async waitForNavigationHistory() {
|
|
149
|
+
if (!this.config.enableNavigationHistory)
|
|
150
|
+
return;
|
|
151
|
+
if (this.navigationHistoryInitPromise) {
|
|
152
|
+
await this.navigationHistoryInitPromise;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
async ensureNavigationHistoryReady() {
|
|
156
|
+
if (!this.config.enableNavigationHistory)
|
|
157
|
+
return null;
|
|
158
|
+
await this.waitForNavigationHistory();
|
|
159
|
+
return this.navigationHistory;
|
|
160
|
+
}
|
|
161
|
+
async addToNavigationQueue(url, method, body, headers) {
|
|
162
|
+
const history = await this.ensureNavigationHistoryReady();
|
|
163
|
+
if (!history || !this.currentSession)
|
|
164
|
+
return;
|
|
165
|
+
try {
|
|
166
|
+
await history.addToQueue(this.currentSession.sessionId, url, {
|
|
167
|
+
method,
|
|
168
|
+
body,
|
|
169
|
+
headers
|
|
170
|
+
});
|
|
171
|
+
} catch (error) {
|
|
172
|
+
if (this.config.debug) {
|
|
173
|
+
console.warn(`[Crawler] Failed to add URL to navigation queue: ${url}`, error);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
async markUrlVisited(url, result) {
|
|
178
|
+
const history = await this.ensureNavigationHistoryReady();
|
|
179
|
+
if (!history || !this.currentSession)
|
|
180
|
+
return;
|
|
181
|
+
try {
|
|
182
|
+
await history.markVisited(this.currentSession.sessionId, url, result);
|
|
183
|
+
} catch (error) {
|
|
184
|
+
if (this.config.debug) {
|
|
185
|
+
console.warn(`[Crawler] Failed to mark URL as visited: ${url}`, error);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
getSession() {
|
|
190
|
+
return this.currentSession;
|
|
191
|
+
}
|
|
192
|
+
getSessionId() {
|
|
193
|
+
return this.config.sessionId;
|
|
194
|
+
}
|
|
195
|
+
async resume(sessionId) {
|
|
196
|
+
if (!this.config.enableNavigationHistory) {
|
|
197
|
+
throw new Error("Navigation history is not enabled. Set enableNavigationHistory: true in options.");
|
|
198
|
+
}
|
|
199
|
+
await this.waitForNavigationHistory();
|
|
200
|
+
if (!this.navigationHistory) {
|
|
201
|
+
throw new Error("Navigation history failed to initialize.");
|
|
202
|
+
}
|
|
203
|
+
await this.waitForStorage();
|
|
204
|
+
if (this.isCacheEnabled) {
|
|
205
|
+
await this.waitForCache();
|
|
206
|
+
}
|
|
207
|
+
const targetSessionId = sessionId || this.config.sessionId;
|
|
208
|
+
const session = await this.navigationHistory.getSession(targetSessionId);
|
|
209
|
+
if (!session) {
|
|
210
|
+
throw new Error(`Session '${targetSessionId}' not found`);
|
|
211
|
+
}
|
|
212
|
+
if (session.status === "completed") {
|
|
213
|
+
throw new Error(`Session '${targetSessionId}' is already completed`);
|
|
214
|
+
}
|
|
215
|
+
this.currentSession = session;
|
|
216
|
+
await this.navigationHistory.updateSessionStatus(targetSessionId, "running");
|
|
217
|
+
const queuedUrls = await this.navigationHistory.getAllQueuedUrls(targetSessionId);
|
|
218
|
+
if (this.config.debug) {
|
|
219
|
+
console.log(`[Crawler] Resuming session '${targetSessionId}' with ${queuedUrls.length} queued URLs`);
|
|
220
|
+
}
|
|
221
|
+
const scheduledUrls = new Set;
|
|
222
|
+
for (const item of queuedUrls) {
|
|
223
|
+
if (scheduledUrls.has(item.url)) {
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
226
|
+
scheduledUrls.add(item.url);
|
|
227
|
+
const body = item.body ? JSON.parse(item.body) : undefined;
|
|
228
|
+
const headers = item.headers ? JSON.parse(item.headers) : undefined;
|
|
229
|
+
this.visit(item.url, {
|
|
230
|
+
method: item.method,
|
|
231
|
+
body,
|
|
232
|
+
headers,
|
|
233
|
+
forceRevisit: false
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
return this;
|
|
237
|
+
}
|
|
238
|
+
async getResumableSessions() {
|
|
239
|
+
if (!this.config.enableNavigationHistory) {
|
|
240
|
+
return [];
|
|
241
|
+
}
|
|
242
|
+
await this.waitForNavigationHistory();
|
|
243
|
+
if (!this.navigationHistory) {
|
|
244
|
+
return [];
|
|
245
|
+
}
|
|
246
|
+
return this.navigationHistory.getResumableSessions();
|
|
247
|
+
}
|
|
248
|
+
async pause() {
|
|
249
|
+
await this.waitForNavigationHistory();
|
|
250
|
+
if (!this.navigationHistory || !this.currentSession) {
|
|
251
|
+
return;
|
|
252
|
+
}
|
|
253
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
|
|
254
|
+
this.currentSession.status = "paused";
|
|
255
|
+
}
|
|
256
|
+
async complete() {
|
|
257
|
+
await this.waitForNavigationHistory();
|
|
258
|
+
if (!this.navigationHistory || !this.currentSession) {
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "completed");
|
|
262
|
+
this.currentSession.status = "completed";
|
|
263
|
+
}
|
|
264
|
+
getAdapterType() {
|
|
265
|
+
return this.adapterType;
|
|
266
|
+
}
|
|
267
|
+
async setAdapter(adapter) {
|
|
268
|
+
this.adapterType = adapter;
|
|
269
|
+
await this.initializeAdapter();
|
|
270
|
+
}
|
|
96
271
|
rawResponseHandler(data) {
|
|
97
272
|
if (this.rawResponseEvents.length === 0)
|
|
98
273
|
return;
|
|
@@ -390,6 +565,10 @@ class Crawler {
|
|
|
390
565
|
};
|
|
391
566
|
decodoInstanse = this.config.getAdapter(url, "decodo", false, useOxylabsRotation) || undefined;
|
|
392
567
|
}
|
|
568
|
+
if (this.config.enableNavigationHistory) {
|
|
569
|
+
const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
|
|
570
|
+
this.addToNavigationQueue(url, method, body, headersObj);
|
|
571
|
+
}
|
|
393
572
|
if (deepEmailFinder) {
|
|
394
573
|
this.execute2(method, url, body, _options, forceRevisit).then();
|
|
395
574
|
return this;
|
|
@@ -444,6 +623,11 @@ class Crawler {
|
|
|
444
623
|
await this.saveCache(url, res);
|
|
445
624
|
if (!isVisited)
|
|
446
625
|
await this.saveUrl(url);
|
|
626
|
+
this.markUrlVisited(url, {
|
|
627
|
+
status: res.status,
|
|
628
|
+
finalUrl: res.finalUrl,
|
|
629
|
+
contentType: res.contentType
|
|
630
|
+
});
|
|
447
631
|
if (res.contentType && res.contentType.includes("/json")) {
|
|
448
632
|
if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
|
|
449
633
|
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
|
|
@@ -491,6 +675,10 @@ class Crawler {
|
|
|
491
675
|
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
|
|
492
676
|
}
|
|
493
677
|
}
|
|
678
|
+
this.markUrlVisited(url, {
|
|
679
|
+
status: error?.response?.status || 0,
|
|
680
|
+
errorMessage: e.message || "Unknown error"
|
|
681
|
+
});
|
|
494
682
|
if (this.config.throwFatalError)
|
|
495
683
|
throw e;
|
|
496
684
|
if (this.config.debug) {
|
|
@@ -506,6 +694,9 @@ class Crawler {
|
|
|
506
694
|
async waitForAll() {
|
|
507
695
|
await this.queue.onIdle();
|
|
508
696
|
}
|
|
697
|
+
async done() {
|
|
698
|
+
return this.waitForAll();
|
|
699
|
+
}
|
|
509
700
|
async close() {
|
|
510
701
|
try {
|
|
511
702
|
await this.cacher.close();
|
package/dist/plugin/crawler.js
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import fs from "node:fs";
|
|
2
2
|
import { FileCacher } from '../cache/file-cacher.js';
|
|
3
3
|
import { UrlStore } from '../cache/url-store.js';
|
|
4
|
+
import { NavigationHistory } from '../cache/navigation-history.js';
|
|
4
5
|
import { parseHTML } from "linkedom";
|
|
5
6
|
import path from "node:path";
|
|
7
|
+
import { Rezo } from '../core/rezo.js';
|
|
6
8
|
import { RezoQueue } from '../queue/queue.js';
|
|
7
9
|
import { Scraper } from './scraper.js';
|
|
8
10
|
import { CrawlerOptions } from './crawler-options.js';
|
|
11
|
+
import { loadAdapter } from '../adapters/picker.js';
|
|
9
12
|
String.prototype.addBaseUrl = function(url) {
|
|
10
13
|
url = url instanceof URL ? url.href : url;
|
|
11
14
|
const html = this.replace(/<base\b[^>]*?>/gi, "");
|
|
@@ -44,12 +47,20 @@ export class Crawler {
|
|
|
44
47
|
isStorageReady = false;
|
|
45
48
|
isCacheReady = false;
|
|
46
49
|
leadsFinder;
|
|
47
|
-
|
|
50
|
+
navigationHistory = null;
|
|
51
|
+
isNavigationHistoryReady = false;
|
|
52
|
+
isSessionReady = false;
|
|
53
|
+
currentSession = null;
|
|
54
|
+
navigationHistoryInitPromise = null;
|
|
55
|
+
adapterExecutor = null;
|
|
56
|
+
adapterType;
|
|
57
|
+
constructor(crawlerOptions, http = new Rezo) {
|
|
48
58
|
this.http = http;
|
|
49
59
|
this.queue = new RezoQueue({
|
|
50
60
|
concurrency: 1000
|
|
51
61
|
});
|
|
52
62
|
this.config = new CrawlerOptions(crawlerOptions);
|
|
63
|
+
this.adapterType = this.config.adapter;
|
|
53
64
|
const enableCache = this.config.enableCache;
|
|
54
65
|
this.isCacheEnabled = enableCache;
|
|
55
66
|
if (enableCache) {
|
|
@@ -91,8 +102,172 @@ export class Crawler {
|
|
|
91
102
|
this.isStorageReady = true;
|
|
92
103
|
});
|
|
93
104
|
}
|
|
105
|
+
if (this.config.enableNavigationHistory) {
|
|
106
|
+
const navHistoryDir = path.resolve(this.config.cacheDir, "navigation");
|
|
107
|
+
if (!fs.existsSync(navHistoryDir))
|
|
108
|
+
fs.mkdirSync(navHistoryDir, { recursive: true });
|
|
109
|
+
this.navigationHistoryInitPromise = this.initializeNavigationHistory(navHistoryDir);
|
|
110
|
+
}
|
|
111
|
+
this.initializeAdapter();
|
|
94
112
|
this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
|
|
95
113
|
}
|
|
114
|
+
async initializeAdapter() {
|
|
115
|
+
try {
|
|
116
|
+
const adapterModule = await loadAdapter(this.adapterType);
|
|
117
|
+
this.adapterExecutor = adapterModule.executeRequest.bind(adapterModule);
|
|
118
|
+
} catch (error) {
|
|
119
|
+
if (this.config.debug) {
|
|
120
|
+
console.warn(`[Crawler] Failed to load adapter '${this.adapterType}', falling back to http instance`);
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
async initializeNavigationHistory(navHistoryDir) {
|
|
125
|
+
try {
|
|
126
|
+
const history = await NavigationHistory.create({
|
|
127
|
+
storeDir: navHistoryDir,
|
|
128
|
+
dbFileName: "navigation.db"
|
|
129
|
+
});
|
|
130
|
+
this.navigationHistory = history;
|
|
131
|
+
this.isNavigationHistoryReady = true;
|
|
132
|
+
const session = await history.getSession(this.config.sessionId);
|
|
133
|
+
if (session && (session.status === "running" || session.status === "paused")) {
|
|
134
|
+
this.currentSession = session;
|
|
135
|
+
await history.updateSessionStatus(this.config.sessionId, "running");
|
|
136
|
+
} else if (!session) {
|
|
137
|
+
this.currentSession = await history.createSession(this.config.sessionId, this.config.baseUrl, { adapter: this.adapterType });
|
|
138
|
+
}
|
|
139
|
+
this.isSessionReady = true;
|
|
140
|
+
} catch (error) {
|
|
141
|
+
if (this.config.debug) {
|
|
142
|
+
console.error(`[Crawler] Failed to initialize navigation history:`, error);
|
|
143
|
+
}
|
|
144
|
+
this.isNavigationHistoryReady = false;
|
|
145
|
+
this.isSessionReady = false;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
async waitForNavigationHistory() {
|
|
149
|
+
if (!this.config.enableNavigationHistory)
|
|
150
|
+
return;
|
|
151
|
+
if (this.navigationHistoryInitPromise) {
|
|
152
|
+
await this.navigationHistoryInitPromise;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
async ensureNavigationHistoryReady() {
|
|
156
|
+
if (!this.config.enableNavigationHistory)
|
|
157
|
+
return null;
|
|
158
|
+
await this.waitForNavigationHistory();
|
|
159
|
+
return this.navigationHistory;
|
|
160
|
+
}
|
|
161
|
+
async addToNavigationQueue(url, method, body, headers) {
|
|
162
|
+
const history = await this.ensureNavigationHistoryReady();
|
|
163
|
+
if (!history || !this.currentSession)
|
|
164
|
+
return;
|
|
165
|
+
try {
|
|
166
|
+
await history.addToQueue(this.currentSession.sessionId, url, {
|
|
167
|
+
method,
|
|
168
|
+
body,
|
|
169
|
+
headers
|
|
170
|
+
});
|
|
171
|
+
} catch (error) {
|
|
172
|
+
if (this.config.debug) {
|
|
173
|
+
console.warn(`[Crawler] Failed to add URL to navigation queue: ${url}`, error);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
async markUrlVisited(url, result) {
|
|
178
|
+
const history = await this.ensureNavigationHistoryReady();
|
|
179
|
+
if (!history || !this.currentSession)
|
|
180
|
+
return;
|
|
181
|
+
try {
|
|
182
|
+
await history.markVisited(this.currentSession.sessionId, url, result);
|
|
183
|
+
} catch (error) {
|
|
184
|
+
if (this.config.debug) {
|
|
185
|
+
console.warn(`[Crawler] Failed to mark URL as visited: ${url}`, error);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
getSession() {
|
|
190
|
+
return this.currentSession;
|
|
191
|
+
}
|
|
192
|
+
getSessionId() {
|
|
193
|
+
return this.config.sessionId;
|
|
194
|
+
}
|
|
195
|
+
async resume(sessionId) {
|
|
196
|
+
if (!this.config.enableNavigationHistory) {
|
|
197
|
+
throw new Error("Navigation history is not enabled. Set enableNavigationHistory: true in options.");
|
|
198
|
+
}
|
|
199
|
+
await this.waitForNavigationHistory();
|
|
200
|
+
if (!this.navigationHistory) {
|
|
201
|
+
throw new Error("Navigation history failed to initialize.");
|
|
202
|
+
}
|
|
203
|
+
await this.waitForStorage();
|
|
204
|
+
if (this.isCacheEnabled) {
|
|
205
|
+
await this.waitForCache();
|
|
206
|
+
}
|
|
207
|
+
const targetSessionId = sessionId || this.config.sessionId;
|
|
208
|
+
const session = await this.navigationHistory.getSession(targetSessionId);
|
|
209
|
+
if (!session) {
|
|
210
|
+
throw new Error(`Session '${targetSessionId}' not found`);
|
|
211
|
+
}
|
|
212
|
+
if (session.status === "completed") {
|
|
213
|
+
throw new Error(`Session '${targetSessionId}' is already completed`);
|
|
214
|
+
}
|
|
215
|
+
this.currentSession = session;
|
|
216
|
+
await this.navigationHistory.updateSessionStatus(targetSessionId, "running");
|
|
217
|
+
const queuedUrls = await this.navigationHistory.getAllQueuedUrls(targetSessionId);
|
|
218
|
+
if (this.config.debug) {
|
|
219
|
+
console.log(`[Crawler] Resuming session '${targetSessionId}' with ${queuedUrls.length} queued URLs`);
|
|
220
|
+
}
|
|
221
|
+
const scheduledUrls = new Set;
|
|
222
|
+
for (const item of queuedUrls) {
|
|
223
|
+
if (scheduledUrls.has(item.url)) {
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
226
|
+
scheduledUrls.add(item.url);
|
|
227
|
+
const body = item.body ? JSON.parse(item.body) : undefined;
|
|
228
|
+
const headers = item.headers ? JSON.parse(item.headers) : undefined;
|
|
229
|
+
this.visit(item.url, {
|
|
230
|
+
method: item.method,
|
|
231
|
+
body,
|
|
232
|
+
headers,
|
|
233
|
+
forceRevisit: false
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
return this;
|
|
237
|
+
}
|
|
238
|
+
async getResumableSessions() {
|
|
239
|
+
if (!this.config.enableNavigationHistory) {
|
|
240
|
+
return [];
|
|
241
|
+
}
|
|
242
|
+
await this.waitForNavigationHistory();
|
|
243
|
+
if (!this.navigationHistory) {
|
|
244
|
+
return [];
|
|
245
|
+
}
|
|
246
|
+
return this.navigationHistory.getResumableSessions();
|
|
247
|
+
}
|
|
248
|
+
async pause() {
|
|
249
|
+
await this.waitForNavigationHistory();
|
|
250
|
+
if (!this.navigationHistory || !this.currentSession) {
|
|
251
|
+
return;
|
|
252
|
+
}
|
|
253
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
|
|
254
|
+
this.currentSession.status = "paused";
|
|
255
|
+
}
|
|
256
|
+
async complete() {
|
|
257
|
+
await this.waitForNavigationHistory();
|
|
258
|
+
if (!this.navigationHistory || !this.currentSession) {
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "completed");
|
|
262
|
+
this.currentSession.status = "completed";
|
|
263
|
+
}
|
|
264
|
+
getAdapterType() {
|
|
265
|
+
return this.adapterType;
|
|
266
|
+
}
|
|
267
|
+
async setAdapter(adapter) {
|
|
268
|
+
this.adapterType = adapter;
|
|
269
|
+
await this.initializeAdapter();
|
|
270
|
+
}
|
|
96
271
|
rawResponseHandler(data) {
|
|
97
272
|
if (this.rawResponseEvents.length === 0)
|
|
98
273
|
return;
|
|
@@ -390,6 +565,10 @@ export class Crawler {
|
|
|
390
565
|
};
|
|
391
566
|
decodoInstanse = this.config.getAdapter(url, "decodo", false, useOxylabsRotation) || undefined;
|
|
392
567
|
}
|
|
568
|
+
if (this.config.enableNavigationHistory) {
|
|
569
|
+
const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
|
|
570
|
+
this.addToNavigationQueue(url, method, body, headersObj);
|
|
571
|
+
}
|
|
393
572
|
if (deepEmailFinder) {
|
|
394
573
|
this.execute2(method, url, body, _options, forceRevisit).then();
|
|
395
574
|
return this;
|
|
@@ -444,6 +623,11 @@ export class Crawler {
|
|
|
444
623
|
await this.saveCache(url, res);
|
|
445
624
|
if (!isVisited)
|
|
446
625
|
await this.saveUrl(url);
|
|
626
|
+
this.markUrlVisited(url, {
|
|
627
|
+
status: res.status,
|
|
628
|
+
finalUrl: res.finalUrl,
|
|
629
|
+
contentType: res.contentType
|
|
630
|
+
});
|
|
447
631
|
if (res.contentType && res.contentType.includes("/json")) {
|
|
448
632
|
if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
|
|
449
633
|
this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
|
|
@@ -491,6 +675,10 @@ export class Crawler {
|
|
|
491
675
|
return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
|
|
492
676
|
}
|
|
493
677
|
}
|
|
678
|
+
this.markUrlVisited(url, {
|
|
679
|
+
status: error?.response?.status || 0,
|
|
680
|
+
errorMessage: e.message || "Unknown error"
|
|
681
|
+
});
|
|
494
682
|
if (this.config.throwFatalError)
|
|
495
683
|
throw e;
|
|
496
684
|
if (this.config.debug) {
|
|
@@ -506,6 +694,9 @@ export class Crawler {
|
|
|
506
694
|
async waitForAll() {
|
|
507
695
|
await this.queue.onIdle();
|
|
508
696
|
}
|
|
697
|
+
async done() {
|
|
698
|
+
return this.waitForAll();
|
|
699
|
+
}
|
|
509
700
|
async close() {
|
|
510
701
|
try {
|
|
511
702
|
await this.cacher.close();
|
package/dist/plugin/index.cjs
CHANGED
|
@@ -1,36 +1,36 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Crawler =
|
|
3
|
-
const
|
|
4
|
-
exports.CrawlerOptions =
|
|
5
|
-
const
|
|
6
|
-
exports.FileCacher =
|
|
7
|
-
const
|
|
8
|
-
exports.UrlStore =
|
|
9
|
-
const
|
|
10
|
-
exports.Oxylabs =
|
|
11
|
-
const
|
|
12
|
-
exports.OXYLABS_BROWSER_TYPES =
|
|
13
|
-
exports.OXYLABS_COMMON_LOCALES =
|
|
14
|
-
exports.OXYLABS_COMMON_GEO_LOCATIONS =
|
|
15
|
-
exports.OXYLABS_US_STATES =
|
|
16
|
-
exports.OXYLABS_EUROPEAN_COUNTRIES =
|
|
17
|
-
exports.OXYLABS_ASIAN_COUNTRIES =
|
|
18
|
-
exports.getRandomOxylabsBrowserType =
|
|
19
|
-
exports.getRandomOxylabsLocale =
|
|
20
|
-
exports.getRandomOxylabsGeoLocation =
|
|
21
|
-
const
|
|
22
|
-
exports.Decodo =
|
|
23
|
-
const
|
|
24
|
-
exports.DECODO_DEVICE_TYPES =
|
|
25
|
-
exports.DECODO_HEADLESS_MODES =
|
|
26
|
-
exports.DECODO_COMMON_LOCALES =
|
|
27
|
-
exports.DECODO_COMMON_COUNTRIES =
|
|
28
|
-
exports.DECODO_EUROPEAN_COUNTRIES =
|
|
29
|
-
exports.DECODO_ASIAN_COUNTRIES =
|
|
30
|
-
exports.DECODO_US_STATES =
|
|
31
|
-
exports.DECODO_COMMON_CITIES =
|
|
32
|
-
exports.getRandomDecodoDeviceType =
|
|
33
|
-
exports.getRandomDecodoLocale =
|
|
34
|
-
exports.getRandomDecodoCountry =
|
|
35
|
-
exports.getRandomDecodoCity =
|
|
36
|
-
exports.generateDecodoSessionId =
|
|
1
|
+
const _mod_pwsonp = require('./crawler.cjs');
|
|
2
|
+
exports.Crawler = _mod_pwsonp.Crawler;;
|
|
3
|
+
const _mod_f0a514 = require('./crawler-options.cjs');
|
|
4
|
+
exports.CrawlerOptions = _mod_f0a514.CrawlerOptions;;
|
|
5
|
+
const _mod_0j6c45 = require('../cache/file-cacher.cjs');
|
|
6
|
+
exports.FileCacher = _mod_0j6c45.FileCacher;;
|
|
7
|
+
const _mod_0ys1f7 = require('../cache/url-store.cjs');
|
|
8
|
+
exports.UrlStore = _mod_0ys1f7.UrlStore;;
|
|
9
|
+
const _mod_bs9mae = require('./addon/oxylabs/index.cjs');
|
|
10
|
+
exports.Oxylabs = _mod_bs9mae.Oxylabs;;
|
|
11
|
+
const _mod_afta57 = require('./addon/oxylabs/options.cjs');
|
|
12
|
+
exports.OXYLABS_BROWSER_TYPES = _mod_afta57.OXYLABS_BROWSER_TYPES;
|
|
13
|
+
exports.OXYLABS_COMMON_LOCALES = _mod_afta57.OXYLABS_COMMON_LOCALES;
|
|
14
|
+
exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_afta57.OXYLABS_COMMON_GEO_LOCATIONS;
|
|
15
|
+
exports.OXYLABS_US_STATES = _mod_afta57.OXYLABS_US_STATES;
|
|
16
|
+
exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_afta57.OXYLABS_EUROPEAN_COUNTRIES;
|
|
17
|
+
exports.OXYLABS_ASIAN_COUNTRIES = _mod_afta57.OXYLABS_ASIAN_COUNTRIES;
|
|
18
|
+
exports.getRandomOxylabsBrowserType = _mod_afta57.getRandomBrowserType;
|
|
19
|
+
exports.getRandomOxylabsLocale = _mod_afta57.getRandomLocale;
|
|
20
|
+
exports.getRandomOxylabsGeoLocation = _mod_afta57.getRandomGeoLocation;;
|
|
21
|
+
const _mod_8woeb1 = require('./addon/decodo/index.cjs');
|
|
22
|
+
exports.Decodo = _mod_8woeb1.Decodo;;
|
|
23
|
+
const _mod_rpxa81 = require('./addon/decodo/options.cjs');
|
|
24
|
+
exports.DECODO_DEVICE_TYPES = _mod_rpxa81.DECODO_DEVICE_TYPES;
|
|
25
|
+
exports.DECODO_HEADLESS_MODES = _mod_rpxa81.DECODO_HEADLESS_MODES;
|
|
26
|
+
exports.DECODO_COMMON_LOCALES = _mod_rpxa81.DECODO_COMMON_LOCALES;
|
|
27
|
+
exports.DECODO_COMMON_COUNTRIES = _mod_rpxa81.DECODO_COMMON_COUNTRIES;
|
|
28
|
+
exports.DECODO_EUROPEAN_COUNTRIES = _mod_rpxa81.DECODO_EUROPEAN_COUNTRIES;
|
|
29
|
+
exports.DECODO_ASIAN_COUNTRIES = _mod_rpxa81.DECODO_ASIAN_COUNTRIES;
|
|
30
|
+
exports.DECODO_US_STATES = _mod_rpxa81.DECODO_US_STATES;
|
|
31
|
+
exports.DECODO_COMMON_CITIES = _mod_rpxa81.DECODO_COMMON_CITIES;
|
|
32
|
+
exports.getRandomDecodoDeviceType = _mod_rpxa81.getRandomDeviceType;
|
|
33
|
+
exports.getRandomDecodoLocale = _mod_rpxa81.getRandomLocale;
|
|
34
|
+
exports.getRandomDecodoCountry = _mod_rpxa81.getRandomCountry;
|
|
35
|
+
exports.getRandomDecodoCity = _mod_rpxa81.getRandomCity;
|
|
36
|
+
exports.generateDecodoSessionId = _mod_rpxa81.generateSessionId;;
|
package/dist/proxy/index.cjs
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
const { SocksProxyAgent
|
|
2
|
-
const { HttpsProxyAgent: RezoHttpsSocks } = require("https-proxy-agent");
|
|
3
|
-
const { HttpProxyAgent: RezoHttpSocks } = require("http-proxy-agent");
|
|
1
|
+
const { Agent, HttpProxyAgent, HttpsProxyAgent, SocksProxyAgent } = require('../internal/agents.cjs');
|
|
4
2
|
const { parseProxyString } = require('./parse.cjs');
|
|
5
|
-
const
|
|
6
|
-
exports.ProxyManager =
|
|
7
|
-
const
|
|
8
|
-
exports.parseProxyString =
|
|
3
|
+
const _mod_wpx4sa = require('./manager.cjs');
|
|
4
|
+
exports.ProxyManager = _mod_wpx4sa.ProxyManager;;
|
|
5
|
+
const _mod_138ems = require('./parse.cjs');
|
|
6
|
+
exports.parseProxyString = _mod_138ems.parseProxyString;;
|
|
9
7
|
function createOptions(uri, opts) {
|
|
10
8
|
if (uri instanceof URL || typeof uri === "string") {
|
|
11
9
|
return {
|
|
@@ -26,29 +24,29 @@ function rezoProxy(uri, over, opts) {
|
|
|
26
24
|
if (typeof over === "string") {
|
|
27
25
|
const config = createOptions(uri, opts);
|
|
28
26
|
if (over === "http") {
|
|
29
|
-
return new
|
|
27
|
+
return new HttpProxyAgent(config.uri, config.opts);
|
|
30
28
|
}
|
|
31
|
-
return new
|
|
29
|
+
return new HttpsProxyAgent(config.uri, { ...config.opts, rejectUnauthorized: config.opts?.rejectUnauthorized ?? false });
|
|
32
30
|
} else {
|
|
33
31
|
const isHttp = uri.startsWith("http:");
|
|
34
32
|
const isHttps = uri.startsWith("https:");
|
|
35
33
|
const isSocks = uri.startsWith("sock");
|
|
36
34
|
if (isSocks) {
|
|
37
35
|
const config = createOptions(uri, over || opts);
|
|
38
|
-
return new
|
|
36
|
+
return new SocksProxyAgent(config.uri, config.opts);
|
|
39
37
|
}
|
|
40
38
|
if (isHttp) {
|
|
41
39
|
const config = createOptions(uri, over || opts);
|
|
42
|
-
return new
|
|
40
|
+
return new HttpProxyAgent(config.uri, config.opts);
|
|
43
41
|
}
|
|
44
42
|
if (isHttps) {
|
|
45
43
|
const config = createOptions(uri, over || opts);
|
|
46
|
-
return new
|
|
44
|
+
return new HttpsProxyAgent(config.uri, { ...config.opts, rejectUnauthorized: config.opts?.rejectUnauthorized ?? false });
|
|
47
45
|
}
|
|
48
46
|
const proxy = parseProxyString(uri);
|
|
49
47
|
if (proxy) {
|
|
50
48
|
const config = createOptions(proxy, over || opts);
|
|
51
|
-
return new
|
|
49
|
+
return new SocksProxyAgent(config.uri, config.opts);
|
|
52
50
|
}
|
|
53
51
|
throw new Error("Invalid proxy protocol");
|
|
54
52
|
}
|
|
@@ -57,12 +55,16 @@ function rezoProxy(uri, over, opts) {
|
|
|
57
55
|
delete uri.client;
|
|
58
56
|
const config = createOptions(uri, opts);
|
|
59
57
|
if (over === "http") {
|
|
60
|
-
return new
|
|
58
|
+
return new HttpProxyAgent(config.uri, config.opts);
|
|
61
59
|
}
|
|
62
|
-
return new
|
|
60
|
+
return new HttpsProxyAgent(config.uri, { ...config.opts, rejectUnauthorized: config.opts?.rejectUnauthorized ?? false });
|
|
63
61
|
}
|
|
64
62
|
const config = createOptions(uri, opts);
|
|
65
|
-
return new
|
|
63
|
+
return new SocksProxyAgent(config.uri, config.opts);
|
|
66
64
|
}
|
|
67
65
|
|
|
66
|
+
exports.Agent = Agent;
|
|
67
|
+
exports.HttpProxyAgent = HttpProxyAgent;
|
|
68
|
+
exports.HttpsProxyAgent = HttpsProxyAgent;
|
|
69
|
+
exports.SocksProxyAgent = SocksProxyAgent;
|
|
68
70
|
exports.rezoProxy = rezoProxy;
|