rezo 1.0.42 → 1.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89):
  1. package/dist/adapters/curl.cjs +131 -29
  2. package/dist/adapters/curl.js +131 -29
  3. package/dist/adapters/entries/curl.d.ts +65 -0
  4. package/dist/adapters/entries/fetch.d.ts +65 -0
  5. package/dist/adapters/entries/http.d.ts +65 -0
  6. package/dist/adapters/entries/http2.d.ts +65 -0
  7. package/dist/adapters/entries/react-native.d.ts +65 -0
  8. package/dist/adapters/entries/xhr.d.ts +65 -0
  9. package/dist/adapters/http2.cjs +209 -22
  10. package/dist/adapters/http2.js +209 -22
  11. package/dist/adapters/index.cjs +6 -6
  12. package/dist/cache/index.cjs +9 -13
  13. package/dist/cache/index.js +0 -2
  14. package/dist/core/rezo.cjs +7 -0
  15. package/dist/core/rezo.js +7 -0
  16. package/dist/crawler/addon/decodo/index.cjs +1 -0
  17. package/dist/crawler/addon/decodo/index.js +1 -0
  18. package/dist/crawler/crawler-options.cjs +1 -0
  19. package/dist/crawler/crawler-options.js +1 -0
  20. package/dist/crawler/crawler.cjs +1070 -0
  21. package/dist/crawler/crawler.js +1068 -0
  22. package/dist/crawler/index.cjs +40 -0
  23. package/dist/{plugin → crawler}/index.js +4 -2
  24. package/dist/crawler/plugin/file-cacher.cjs +19 -0
  25. package/dist/crawler/plugin/file-cacher.js +19 -0
  26. package/dist/crawler/plugin/index.cjs +1 -0
  27. package/dist/crawler/plugin/index.js +1 -0
  28. package/dist/crawler/plugin/navigation-history.cjs +43 -0
  29. package/dist/crawler/plugin/navigation-history.js +43 -0
  30. package/dist/crawler/plugin/robots-txt.cjs +2 -0
  31. package/dist/crawler/plugin/robots-txt.js +2 -0
  32. package/dist/crawler/plugin/url-store.cjs +18 -0
  33. package/dist/crawler/plugin/url-store.js +18 -0
  34. package/dist/crawler.d.ts +511 -183
  35. package/dist/entries/crawler.cjs +5 -5
  36. package/dist/entries/crawler.js +2 -2
  37. package/dist/index.cjs +27 -24
  38. package/dist/index.d.ts +73 -0
  39. package/dist/index.js +1 -0
  40. package/dist/internal/agents/base.cjs +113 -0
  41. package/dist/internal/agents/base.js +110 -0
  42. package/dist/internal/agents/http-proxy.cjs +89 -0
  43. package/dist/internal/agents/http-proxy.js +86 -0
  44. package/dist/internal/agents/https-proxy.cjs +176 -0
  45. package/dist/internal/agents/https-proxy.js +173 -0
  46. package/dist/internal/agents/index.cjs +10 -0
  47. package/dist/internal/agents/index.js +5 -0
  48. package/dist/internal/agents/socks-client.cjs +571 -0
  49. package/dist/internal/agents/socks-client.js +567 -0
  50. package/dist/internal/agents/socks-proxy.cjs +75 -0
  51. package/dist/internal/agents/socks-proxy.js +72 -0
  52. package/dist/platform/browser.d.ts +65 -0
  53. package/dist/platform/bun.d.ts +65 -0
  54. package/dist/platform/deno.d.ts +65 -0
  55. package/dist/platform/node.d.ts +65 -0
  56. package/dist/platform/react-native.d.ts +65 -0
  57. package/dist/platform/worker.d.ts +65 -0
  58. package/dist/proxy/index.cjs +18 -16
  59. package/dist/proxy/index.js +17 -12
  60. package/dist/queue/index.cjs +8 -8
  61. package/dist/responses/buildError.cjs +11 -2
  62. package/dist/responses/buildError.js +11 -2
  63. package/dist/responses/universal/index.cjs +11 -11
  64. package/dist/utils/curl.cjs +317 -0
  65. package/dist/utils/curl.js +314 -0
  66. package/package.json +2 -6
  67. package/dist/cache/file-cacher.cjs +0 -264
  68. package/dist/cache/file-cacher.js +0 -261
  69. package/dist/cache/url-store.cjs +0 -288
  70. package/dist/cache/url-store.js +0 -285
  71. package/dist/plugin/addon/decodo/index.cjs +0 -1
  72. package/dist/plugin/addon/decodo/index.js +0 -1
  73. package/dist/plugin/crawler-options.cjs +0 -1
  74. package/dist/plugin/crawler-options.js +0 -1
  75. package/dist/plugin/crawler.cjs +0 -519
  76. package/dist/plugin/crawler.js +0 -517
  77. package/dist/plugin/index.cjs +0 -36
  78. /package/dist/{plugin → crawler}/addon/decodo/options.cjs +0 -0
  79. /package/dist/{plugin → crawler}/addon/decodo/options.js +0 -0
  80. /package/dist/{plugin → crawler}/addon/decodo/types.cjs +0 -0
  81. /package/dist/{plugin → crawler}/addon/decodo/types.js +0 -0
  82. /package/dist/{plugin → crawler}/addon/oxylabs/index.cjs +0 -0
  83. /package/dist/{plugin → crawler}/addon/oxylabs/index.js +0 -0
  84. /package/dist/{plugin → crawler}/addon/oxylabs/options.cjs +0 -0
  85. /package/dist/{plugin → crawler}/addon/oxylabs/options.js +0 -0
  86. /package/dist/{plugin → crawler}/addon/oxylabs/types.cjs +0 -0
  87. /package/dist/{plugin → crawler}/addon/oxylabs/types.js +0 -0
  88. /package/dist/{plugin → crawler}/scraper.cjs +0 -0
  89. /package/dist/{plugin → crawler}/scraper.js +0 -0
@@ -0,0 +1,1070 @@
1
+ const fs = require("node:fs");
2
+ const { FileCacher } = require('./plugin/file-cacher.cjs');
3
+ const { UrlStore } = require('./plugin/url-store.cjs');
4
+ const { NavigationHistory } = require('./plugin/navigation-history.cjs');
5
+ const { RobotsTxt } = require('./plugin/robots-txt.cjs');
6
+ const { parseHTML } = require("linkedom");
7
+ const path = require("node:path");
8
+ const { Rezo } = require('../core/rezo.cjs');
9
+ const { RezoQueue } = require('../queue/queue.cjs');
10
+ const { Scraper } = require('./scraper.cjs');
11
+ const { CrawlerOptions } = require('./crawler-options.cjs');
12
+ const { loadAdapter } = require('../adapters/picker.cjs');
13
/**
 * Returns a copy of this HTML string in which every existing `<base>` tag has
 * been removed and a single `<base href="…">` pointing at `url` has been
 * inserted.
 *
 * Insertion point, in priority order:
 *   1. directly after the opening `<head>` tag;
 *   2. otherwise a new `<head>` block placed in front of the opening `<body>` tag;
 *   3. otherwise a new `<head>` block placed after the opening `<html>` tag.
 * When none of these tags is present the original string is returned unchanged
 * (existing `<base>` tags are kept in that case).
 *
 * @param {string|URL} url - Base URL to inject; a `URL` instance is reduced to its `href`.
 * @returns {string} The rewritten HTML (always a primitive string).
 */
String.prototype.addBaseUrl = function(url) {
  // A double quote would terminate the attribute value early, so encode it.
  const href = String(url instanceof URL ? url.href : url).replace(/"/g, "&quot;");
  const html = this.replace(/<base\b[^>]*?>/gi, "");
  const baseElement = `<base href="${href}">`;
  const headBlock = `<head>\n${baseElement}\n</head>\n`;

  // The tag name must be followed by whitespace or `>` so that `<header>` is
  // not mistaken for `<head>`.
  const headTag = /<head(?:\s[^>]*)?>/i;
  const bodyTag = /<body(?:\s[^>]*)?>/i;
  const htmlTag = /<html(?:\s[^>]*)?>/i;

  // Function replacers are used throughout: with a string replacement, `$&`,
  // `$1`, `$$`… occurring inside the URL would be expanded as patterns.
  if (headTag.test(html)) {
    return html.replace(headTag, (match) => `${match}\n${baseElement}`);
  }
  if (bodyTag.test(html)) {
    return html.replace(bodyTag, (match) => headBlock + match);
  }
  if (htmlTag.test(html)) {
    return html.replace(htmlTag, (match) => `${match}\n${headBlock}`);
  }
  // In sloppy-mode code `this` is a boxed String object; hand back a primitive.
  return String(this);
};
33
+
34
+ class Crawler {
35
+ http;
36
+ events = [];
37
+ jsonEvents = [];
38
+ errorEvents = [];
39
+ responseEvents = [];
40
+ rawResponseEvents = [];
41
+ emailDiscoveredEvents = [];
42
+ emailLeadsEvents = [];
43
+ cacher = null;
44
+ queue;
45
+ isCacheEnabled;
46
+ config;
47
+ urlStorage;
48
+ isStorageReady = false;
49
+ isCacheReady = false;
50
+ leadsFinder;
51
+ navigationHistory = null;
52
+ isNavigationHistoryReady = false;
53
+ isSessionReady = false;
54
+ currentSession = null;
55
+ navigationHistoryInitPromise = null;
56
+ adapterExecutor = null;
57
+ adapterType;
58
+ pendingExecutions = new Set;
59
+ robotsTxt;
60
+ domainResponseTimes = new Map;
61
+ domainCurrentDelay = new Map;
62
+ crawlStats = {
63
+ urlsVisited: 0,
64
+ urlsQueued: 0,
65
+ urlsFailed: 0,
66
+ startTime: 0,
67
+ currentDepth: 0
68
+ };
69
+ urlDepthMap = new Map;
70
+ startHandlers = [];
71
+ finishHandlers = [];
72
+ redirectHandlers = [];
73
+ collectedData = [];
74
+ crawlStarted = false;
75
+ constructor(crawlerOptions, http = new Rezo) {
76
+ this.http = http;
77
+ this.queue = new RezoQueue({
78
+ concurrency: 1000
79
+ });
80
+ this.config = new CrawlerOptions(crawlerOptions);
81
+ this.adapterType = this.config.adapter;
82
+ const enableCache = this.config.enableCache;
83
+ this.isCacheEnabled = enableCache;
84
+ if (enableCache) {
85
+ const cacheDir = this.config.cacheDir;
86
+ const cacheTTL = this.config.cacheTTL;
87
+ const dbUrl = cacheDir && (cacheDir.startsWith("./") || cacheDir.startsWith("/")) ? `${cacheDir}${cacheDir.endsWith("/") ? "" : "/"}` : cacheDir ? `./${cacheDir}${cacheDir.endsWith("/") ? "" : "/"}` : `./cache/`;
88
+ if (!fs.existsSync(path.dirname(dbUrl)))
89
+ fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
90
+ FileCacher.create({
91
+ cacheDir: dbUrl,
92
+ ttl: cacheTTL,
93
+ encryptNamespace: true
94
+ }).then((storage) => {
95
+ this.cacher = storage;
96
+ this.isCacheReady = true;
97
+ });
98
+ const dit = path.resolve(cacheDir, "urls");
99
+ if (!fs.existsSync(dit))
100
+ fs.mkdirSync(dit, { recursive: true });
101
+ UrlStore.create({
102
+ storeDir: dit,
103
+ dbFileName: ".url_cache.db",
104
+ ttl: 1000 * 60 * 60 * 24 * 7
105
+ }).then((storage) => {
106
+ this.urlStorage = storage;
107
+ this.isStorageReady = true;
108
+ });
109
+ } else {
110
+ const dit = path.resolve(this.config.cacheDir, "./cache/urls");
111
+ if (!fs.existsSync(dit))
112
+ fs.mkdirSync(dit, { recursive: true });
113
+ UrlStore.create({
114
+ storeDir: dit,
115
+ dbFileName: ".url_cache.db",
116
+ ttl: 1000 * 60 * 60 * 24 * 7
117
+ }).then((storage) => {
118
+ this.urlStorage = storage;
119
+ this.isStorageReady = true;
120
+ });
121
+ }
122
+ if (this.config.enableNavigationHistory) {
123
+ const navHistoryDir = path.resolve(this.config.cacheDir, "navigation");
124
+ if (!fs.existsSync(navHistoryDir))
125
+ fs.mkdirSync(navHistoryDir, { recursive: true });
126
+ this.navigationHistoryInitPromise = this.initializeNavigationHistory(navHistoryDir);
127
+ }
128
+ this.initializeAdapter();
129
+ this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
130
+ this.robotsTxt = new RobotsTxt({
131
+ userAgent: this.config.userAgent || "RezoBot",
132
+ cacheTTL: 24 * 60 * 60 * 1000
133
+ });
134
+ this.crawlStats.startTime = Date.now();
135
+ if (this.config.baseUrl) {
136
+ this.urlDepthMap.set(this.config.baseUrl, 0);
137
+ }
138
+ }
139
+ async initializeAdapter() {
140
+ try {
141
+ const adapterModule = await loadAdapter(this.adapterType);
142
+ this.adapterExecutor = adapterModule.executeRequest.bind(adapterModule);
143
+ } catch (error) {
144
+ if (this.config.debug) {
145
+ console.warn(`[Crawler] Failed to load adapter '${this.adapterType}', falling back to http instance`);
146
+ }
147
+ }
148
+ }
149
+ async initializeNavigationHistory(navHistoryDir) {
150
+ try {
151
+ const history = await NavigationHistory.create({
152
+ storeDir: navHistoryDir,
153
+ dbFileName: "navigation.db"
154
+ });
155
+ this.navigationHistory = history;
156
+ this.isNavigationHistoryReady = true;
157
+ const session = await history.getSession(this.config.sessionId);
158
+ if (session && (session.status === "running" || session.status === "paused")) {
159
+ this.currentSession = session;
160
+ await history.updateSessionStatus(this.config.sessionId, "running");
161
+ } else if (!session) {
162
+ this.currentSession = await history.createSession(this.config.sessionId, this.config.baseUrl, { adapter: this.adapterType });
163
+ }
164
+ this.isSessionReady = true;
165
+ } catch (error) {
166
+ if (this.config.debug) {
167
+ console.error(`[Crawler] Failed to initialize navigation history:`, error);
168
+ }
169
+ this.isNavigationHistoryReady = false;
170
+ this.isSessionReady = false;
171
+ }
172
+ }
173
+ async waitForNavigationHistory() {
174
+ if (!this.config.enableNavigationHistory)
175
+ return;
176
+ if (this.isNavigationHistoryReady && this.isSessionReady)
177
+ return;
178
+ if (this.navigationHistoryInitPromise) {
179
+ await this.navigationHistoryInitPromise;
180
+ }
181
+ }
182
+ async ensureNavigationHistoryReady() {
183
+ if (!this.config.enableNavigationHistory)
184
+ return null;
185
+ await this.waitForNavigationHistory();
186
+ return this.navigationHistory;
187
+ }
188
+ async addToNavigationQueue(url, method, body, headers) {
189
+ const history = await this.ensureNavigationHistoryReady();
190
+ if (!history || !this.currentSession)
191
+ return;
192
+ try {
193
+ await history.addToQueue(this.currentSession.sessionId, url, {
194
+ method,
195
+ body,
196
+ headers
197
+ });
198
+ } catch (error) {
199
+ if (this.config.debug) {
200
+ console.warn(`[Crawler] Failed to add URL to navigation queue: ${url}`, error);
201
+ }
202
+ }
203
+ }
204
+ async markUrlVisited(url, result) {
205
+ const history = await this.ensureNavigationHistoryReady();
206
+ if (!history || !this.currentSession)
207
+ return;
208
+ try {
209
+ await history.markVisited(this.currentSession.sessionId, url, result);
210
+ } catch (error) {
211
+ if (this.config.debug) {
212
+ console.warn(`[Crawler] Failed to mark URL as visited: ${url}`, error);
213
+ }
214
+ }
215
+ }
216
+ getSession() {
217
+ return this.currentSession;
218
+ }
219
+ getSessionId() {
220
+ return this.config.sessionId;
221
+ }
222
+ async resume(sessionId) {
223
+ if (!this.config.enableNavigationHistory) {
224
+ throw new Error("Navigation history is not enabled. Set enableNavigationHistory: true in options.");
225
+ }
226
+ await this.waitForNavigationHistory();
227
+ if (!this.navigationHistory) {
228
+ throw new Error("Navigation history failed to initialize.");
229
+ }
230
+ await this.waitForStorage();
231
+ if (this.isCacheEnabled) {
232
+ await this.waitForCache();
233
+ }
234
+ const targetSessionId = sessionId || this.config.sessionId;
235
+ const session = await this.navigationHistory.getSession(targetSessionId);
236
+ if (!session) {
237
+ throw new Error(`Session '${targetSessionId}' not found`);
238
+ }
239
+ if (session.status === "completed") {
240
+ throw new Error(`Session '${targetSessionId}' is already completed`);
241
+ }
242
+ this.currentSession = session;
243
+ await this.navigationHistory.updateSessionStatus(targetSessionId, "running");
244
+ const queuedUrls = await this.navigationHistory.getAllQueuedUrls(targetSessionId);
245
+ if (this.config.debug) {
246
+ console.log(`[Crawler] Resuming session '${targetSessionId}' with ${queuedUrls.length} queued URLs`);
247
+ }
248
+ const scheduledUrls = new Set;
249
+ for (const item of queuedUrls) {
250
+ if (scheduledUrls.has(item.url)) {
251
+ continue;
252
+ }
253
+ scheduledUrls.add(item.url);
254
+ const body = item.body ? JSON.parse(item.body) : undefined;
255
+ const headers = item.headers ? JSON.parse(item.headers) : undefined;
256
+ this.visit(item.url, {
257
+ method: item.method,
258
+ body,
259
+ headers,
260
+ forceRevisit: false
261
+ });
262
+ }
263
+ return this;
264
+ }
265
+ async getResumableSessions() {
266
+ if (!this.config.enableNavigationHistory) {
267
+ return [];
268
+ }
269
+ await this.waitForNavigationHistory();
270
+ if (!this.navigationHistory) {
271
+ return [];
272
+ }
273
+ return this.navigationHistory.getResumableSessions();
274
+ }
275
+ async pause() {
276
+ await this.waitForNavigationHistory();
277
+ if (!this.navigationHistory || !this.currentSession) {
278
+ return;
279
+ }
280
+ await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "paused");
281
+ this.currentSession.status = "paused";
282
+ }
283
+ async complete() {
284
+ await this.waitForNavigationHistory();
285
+ if (!this.navigationHistory || !this.currentSession) {
286
+ return;
287
+ }
288
+ await this.navigationHistory.updateSessionStatus(this.currentSession.sessionId, "completed");
289
+ this.currentSession.status = "completed";
290
+ }
291
+ getAdapterType() {
292
+ return this.adapterType;
293
+ }
294
+ async setAdapter(adapter) {
295
+ this.adapterType = adapter;
296
+ await this.initializeAdapter();
297
+ }
298
+ rawResponseHandler(data) {
299
+ if (this.rawResponseEvents.length === 0)
300
+ return;
301
+ const isBuffer = Buffer.isBuffer(data);
302
+ if (!isBuffer) {
303
+ if (data instanceof ArrayBuffer) {
304
+ data = Buffer.from(new Uint8Array(data));
305
+ } else if (data instanceof Uint8Array) {
306
+ data = Buffer.from(data);
307
+ } else if (typeof data === "string") {
308
+ data = Buffer.from(data, "utf8");
309
+ } else if (typeof data === "object") {
310
+ data = Buffer.from(JSON.stringify(data), "utf8");
311
+ }
312
+ }
313
+ this.rawResponseEvents.forEach((e) => {
314
+ const handler = e.attr[0];
315
+ handler(data);
316
+ });
317
+ }
318
+ async waitForCache() {
319
+ if (this.isCacheReady)
320
+ return;
321
+ await this.sleep(this.rnd(50, 200));
322
+ await this.waitForCache();
323
+ }
324
+ async waitForStorage() {
325
+ if (this.isStorageReady)
326
+ return;
327
+ await this.sleep(this.rnd(50, 200));
328
+ await this.waitForStorage();
329
+ }
330
+ async saveUrl(url) {
331
+ await this.waitForStorage();
332
+ await this.urlStorage.set(url);
333
+ }
334
+ async hasUrlInCache(url) {
335
+ await this.waitForStorage();
336
+ return await this.urlStorage.has(url);
337
+ }
338
+ async saveCache(url, value) {
339
+ if (!this.isCacheEnabled)
340
+ return;
341
+ await this.waitForCache();
342
+ return this.cacher.set(url, value, this.config.cacheTTL, this.getNamespace(url));
343
+ }
344
+ getNamespace(url) {
345
+ try {
346
+ return new URL(url).hostname;
347
+ } catch {
348
+ return;
349
+ }
350
+ }
351
+ async hasCache(url) {
352
+ if (!this.isCacheEnabled)
353
+ return false;
354
+ await this.waitForCache();
355
+ return this.cacher.has(url, this.getNamespace(url));
356
+ }
357
+ async getCache(url) {
358
+ if (!this.isCacheEnabled)
359
+ return null;
360
+ await this.waitForCache();
361
+ return this.cacher.get(url, this.getNamespace(url));
362
+ }
363
+ sleep(ms) {
364
+ return new Promise((resolve) => setTimeout(resolve, ms));
365
+ }
366
+ rnd(min = 0, max = Number.MAX_VALUE) {
367
+ return Math.floor(Math.random() * (max - min + 1)) + min;
368
+ }
369
+ onError(handler) {
370
+ this.errorEvents.push({
371
+ handler: "_onError",
372
+ attr: [handler]
373
+ });
374
+ return this;
375
+ }
376
+ onJson(handler) {
377
+ this.jsonEvents.push({
378
+ handler: "_onJson",
379
+ attr: [handler]
380
+ });
381
+ return this;
382
+ }
383
+ onEmailDiscovered(handler) {
384
+ this.emailDiscoveredEvents.push(handler);
385
+ return this;
386
+ }
387
+ onEmailLeads(handler) {
388
+ this.emailLeadsEvents.push(handler);
389
+ return this;
390
+ }
391
+ onStart(handler) {
392
+ this.startHandlers.push(handler);
393
+ return this;
394
+ }
395
+ onFinish(handler) {
396
+ this.finishHandlers.push(handler);
397
+ return this;
398
+ }
399
+ onRedirect(handler) {
400
+ this.redirectHandlers.push(handler);
401
+ return this;
402
+ }
403
+ onRawData(handler) {
404
+ this.rawResponseEvents.push({
405
+ handler: "_onRawResponse",
406
+ attr: [handler]
407
+ });
408
+ return this;
409
+ }
410
+ onDocument(handler) {
411
+ this.events.push({
412
+ handler: "_onDocument",
413
+ attr: [handler]
414
+ });
415
+ return this;
416
+ }
417
+ onBody(handler) {
418
+ this.events.push({
419
+ handler: "_onBody",
420
+ attr: [handler]
421
+ });
422
+ return this;
423
+ }
424
+ onElement(handler) {
425
+ this.events.push({
426
+ handler: "_onElement",
427
+ attr: [handler]
428
+ });
429
+ return this;
430
+ }
431
+ onAnchor(selection, handler) {
432
+ this.events.push({
433
+ handler: "_onAnchor",
434
+ attr: [selection, handler]
435
+ });
436
+ return this;
437
+ }
438
+ onHref(handler) {
439
+ this.events.push({
440
+ handler: "_onHref",
441
+ attr: [handler]
442
+ });
443
+ return this;
444
+ }
445
+ onSelection(selection, handler) {
446
+ this.events.push({
447
+ handler: "_onSelection",
448
+ attr: [selection, handler]
449
+ });
450
+ return this;
451
+ }
452
+ onResponse(handler) {
453
+ this.responseEvents.push({
454
+ handler: "_onResponse",
455
+ attr: [handler]
456
+ });
457
+ return this;
458
+ }
459
+ onAttribute(selection, attribute, handler) {
460
+ this.events.push({
461
+ handler: "_onAttribute",
462
+ attr: [selection, attribute, handler]
463
+ });
464
+ return this;
465
+ }
466
+ onText(selection, handler) {
467
+ this.events.push({
468
+ handler: "_onText",
469
+ attr: [selection, handler]
470
+ });
471
+ return this;
472
+ }
473
+ _onBody(handler, document) {
474
+ this.queue.add(() => handler(document.body));
475
+ }
476
+ _onAttribute(selection, attribute, handler, document) {
477
+ const isSimpleForm = typeof attribute === "function";
478
+ const actualAttribute = isSimpleForm ? selection : attribute;
479
+ const actualHandler = isSimpleForm ? attribute : handler;
480
+ const actualSelection = isSimpleForm ? `[${selection}]` : selection || `[${attribute}]`;
481
+ const elements = document.querySelectorAll(actualSelection);
482
+ for (let i = 0;i < elements.length; i++) {
483
+ const el = elements[i];
484
+ if (el.hasAttribute(actualAttribute)) {
485
+ const value = el.getAttribute(actualAttribute);
486
+ this.queue.add(() => actualHandler.call(el, value, actualAttribute));
487
+ }
488
+ }
489
+ }
490
+ _onText(selection, handler, document) {
491
+ const elements = document.querySelectorAll(selection);
492
+ for (let i = 0;i < elements.length; i++) {
493
+ const el = elements[i];
494
+ const text = el.textContent;
495
+ this.queue.add(() => handler.call(el, text));
496
+ }
497
+ }
498
+ _onSelection(selection, handler, document) {
499
+ const elements = document.querySelectorAll(selection);
500
+ for (let i = 0;i < elements.length; i++) {
501
+ this.queue.add(() => handler(elements[i]));
502
+ }
503
+ }
504
+ _onElement(handler, document) {
505
+ const elements = document.querySelectorAll("*");
506
+ for (let i = 0;i < elements.length; i++) {
507
+ this.queue.add(() => handler(elements[i]));
508
+ }
509
+ }
510
+ _onHref(handler, document) {
511
+ const elements = document.querySelectorAll("a, link");
512
+ for (let i = 0;i < elements.length; i++) {
513
+ const el = elements[i];
514
+ if (el.hasAttribute("href")) {
515
+ const href = new URL(el.getAttribute("href"), document.URL).href;
516
+ this.queue.add(() => handler.call(el, href));
517
+ }
518
+ }
519
+ }
520
+ _onAnchor(selection, handler, document) {
521
+ handler = typeof selection === "function" ? selection : handler;
522
+ selection = typeof selection === "function" ? "a" : selection;
523
+ const elements = document.querySelectorAll(selection);
524
+ for (let i = 0;i < elements.length; i++) {
525
+ if (elements[i]?.href && document.baseURI)
526
+ elements[i].href = new URL(elements[i].getAttribute("href"), document.baseURI).href;
527
+ this.queue.add(() => handler(elements[i]));
528
+ }
529
+ }
530
+ _onDocument(handler, document) {
531
+ this.queue.add(() => handler(document));
532
+ }
533
+ _onJson(handler, json) {
534
+ this.queue.add(() => handler(json));
535
+ }
536
+ _onError(handler, error) {
537
+ this.queue.add(() => handler(error));
538
+ }
539
+ async _onEmailDiscovered(handler, email) {
540
+ await handler(email);
541
+ }
542
+ async _onEmailLeads(handler, emails) {
543
+ await handler(emails);
544
+ }
545
+ _onRawResponse(handler, rawResponse) {
546
+ this.queue.add(() => handler(rawResponse));
547
+ }
548
+ _onResponse(handler, response) {
549
+ this.queue.add(() => handler(response));
550
+ }
551
+ calculateAutoThrottleDelay(domain, responseTime) {
552
+ if (!this.config.autoThrottle)
553
+ return 0;
554
+ let times = this.domainResponseTimes.get(domain) || [];
555
+ times.push(responseTime);
556
+ if (times.length > 10) {
557
+ times = times.slice(-10);
558
+ }
559
+ this.domainResponseTimes.set(domain, times);
560
+ const avgResponseTime = times.reduce((a, b) => a + b, 0) / times.length;
561
+ const targetDelay = this.config.autoThrottleTargetDelay;
562
+ const loadFactor = avgResponseTime / 200;
563
+ let newDelay = Math.round(targetDelay * loadFactor);
564
+ newDelay = Math.max(this.config.autoThrottleMinDelay, newDelay);
565
+ newDelay = Math.min(this.config.autoThrottleMaxDelay, newDelay);
566
+ this.domainCurrentDelay.set(domain, newDelay);
567
+ if (this.config.debug) {
568
+ console.log(`[AutoThrottle] ${domain}: avgRT=${avgResponseTime.toFixed(0)}ms, delay=${newDelay}ms`);
569
+ }
570
+ return newDelay;
571
+ }
572
+ getAutoThrottleDelay(domain) {
573
+ if (!this.config.autoThrottle)
574
+ return 0;
575
+ return this.domainCurrentDelay.get(domain) || this.config.autoThrottleMinDelay;
576
+ }
577
+ async handle429Response(url, response) {
578
+ let retryAfter = 0;
579
+ const retryAfterHeader = response?.headers?.["retry-after"] || response?.headers?.get?.("retry-after");
580
+ if (retryAfterHeader) {
581
+ const parsed = parseInt(retryAfterHeader, 10);
582
+ if (!isNaN(parsed)) {
583
+ retryAfter = parsed * 1000;
584
+ } else {
585
+ const date = new Date(retryAfterHeader);
586
+ if (!isNaN(date.getTime())) {
587
+ retryAfter = date.getTime() - Date.now();
588
+ }
589
+ }
590
+ }
591
+ if (retryAfter <= 0) {
592
+ retryAfter = 60000;
593
+ }
594
+ const maxWait = this.config.maxWaitOn429;
595
+ const alwaysWait = this.config.alwaysWaitOn429;
596
+ if (retryAfter > maxWait && !alwaysWait) {
597
+ const waitMinutes = Math.round(retryAfter / 60000);
598
+ const error = new Error(`Rate limited: Server requested wait time of ${waitMinutes} minutes, which exceeds maxWaitOn429 (${Math.round(maxWait / 60000)} minutes). Set alwaysWaitOn429: true to wait regardless.`);
599
+ error.code = "REZ_RATE_LIMIT_EXCEEDED";
600
+ error.url = url;
601
+ error.status = 429;
602
+ throw error;
603
+ }
604
+ if (retryAfter > maxWait && alwaysWait) {
605
+ const waitMinutes = Math.round(retryAfter / 60000);
606
+ console.warn(`[Crawler] WARNING: Rate limited on ${url}. Server requested ${waitMinutes} minute wait. Waiting because alwaysWaitOn429 is enabled.`);
607
+ }
608
+ if (this.config.debug) {
609
+ console.log(`[Crawler] 429 Rate Limited: waiting ${Math.round(retryAfter / 1000)}s before retry`);
610
+ }
611
+ return { shouldRetry: true, waitTime: retryAfter };
612
+ }
613
+ async checkCrawlLimits(url, parentUrl) {
614
+ if (this.config.maxUrls > 0 && this.crawlStats.urlsVisited >= this.config.maxUrls) {
615
+ return { allowed: false, reason: `maxUrls limit reached (${this.config.maxUrls})` };
616
+ }
617
+ if (this.config.maxDepth > 0) {
618
+ const parentDepth = parentUrl ? this.urlDepthMap.get(parentUrl) ?? 0 : 0;
619
+ const urlDepth = this.urlDepthMap.get(url) ?? parentDepth + 1;
620
+ if (urlDepth > this.config.maxDepth) {
621
+ return { allowed: false, reason: `maxDepth limit reached (depth ${urlDepth} > ${this.config.maxDepth})` };
622
+ }
623
+ if (!this.urlDepthMap.has(url)) {
624
+ this.urlDepthMap.set(url, urlDepth);
625
+ this.crawlStats.currentDepth = Math.max(this.crawlStats.currentDepth, urlDepth);
626
+ }
627
+ }
628
+ if (this.config.respectRobotsTxt) {
629
+ try {
630
+ if (!this.robotsTxt.isCached(url)) {
631
+ await this.robotsTxt.fetch(url, async (robotsUrl) => {
632
+ const response = await this.http.get(robotsUrl, { timeout: 1e4 });
633
+ return { status: response.status, data: response.data };
634
+ });
635
+ }
636
+ const allowed = this.robotsTxt.isAllowed(url);
637
+ if (!allowed) {
638
+ return { allowed: false, reason: "Blocked by robots.txt" };
639
+ }
640
+ } catch (error) {
641
+ if (this.config.debug) {
642
+ console.warn(`[Crawler] Failed to check robots.txt for ${url}:`, error);
643
+ }
644
+ }
645
+ }
646
+ return { allowed: true };
647
+ }
648
+ shouldFollowLink(element) {
649
+ if (this.config.followNofollow) {
650
+ return true;
651
+ }
652
+ const rel = element.getAttribute("rel");
653
+ if (rel && rel.toLowerCase().includes("nofollow")) {
654
+ return false;
655
+ }
656
+ return true;
657
+ }
658
+ checkResponseSize(contentLength) {
659
+ if (this.config.maxResponseSize > 0 && contentLength > this.config.maxResponseSize) {
660
+ return {
661
+ allowed: false,
662
+ reason: `Response size (${contentLength} bytes) exceeds maxResponseSize (${this.config.maxResponseSize} bytes)`
663
+ };
664
+ }
665
+ return { allowed: true };
666
+ }
667
+ collect(data) {
668
+ this.collectedData.push(data);
669
+ return this;
670
+ }
671
+ getCollectedData() {
672
+ return [...this.collectedData];
673
+ }
674
+ clearCollectedData() {
675
+ this.collectedData = [];
676
+ return this;
677
+ }
678
+ async exportData(filePath, format = "json") {
679
+ const data = this.collectedData;
680
+ if (data.length === 0) {
681
+ if (this.config.debug) {
682
+ console.warn("[Crawler] No data to export");
683
+ }
684
+ return;
685
+ }
686
+ let content;
687
+ switch (format) {
688
+ case "json":
689
+ content = JSON.stringify(data, null, 2);
690
+ break;
691
+ case "jsonl":
692
+ content = data.map((item) => JSON.stringify(item)).join(`
693
+ `);
694
+ break;
695
+ case "csv":
696
+ const keys = new Set;
697
+ data.forEach((item) => {
698
+ if (typeof item === "object" && item !== null) {
699
+ Object.keys(item).forEach((key) => keys.add(key));
700
+ }
701
+ });
702
+ const headers = Array.from(keys);
703
+ const escapeCSV = (val) => {
704
+ if (val === null || val === undefined)
705
+ return "";
706
+ const str = String(val);
707
+ if (str.includes(",") || str.includes('"') || str.includes(`
708
+ `)) {
709
+ return `"${str.replace(/"/g, '""')}"`;
710
+ }
711
+ return str;
712
+ };
713
+ const rows = [
714
+ headers.join(","),
715
+ ...data.map((item) => {
716
+ if (typeof item !== "object" || item === null) {
717
+ return escapeCSV(item);
718
+ }
719
+ return headers.map((key) => escapeCSV(item[key])).join(",");
720
+ })
721
+ ];
722
+ content = rows.join(`
723
+ `);
724
+ break;
725
+ default:
726
+ throw new Error(`Unsupported export format: ${format}`);
727
+ }
728
+ const dir = path.dirname(filePath);
729
+ if (!fs.existsSync(dir)) {
730
+ fs.mkdirSync(dir, { recursive: true });
731
+ }
732
+ fs.writeFileSync(filePath, content, "utf-8");
733
+ if (this.config.debug) {
734
+ console.log(`[Crawler] Exported ${data.length} items to ${filePath} (${format})`);
735
+ }
736
+ }
737
+ getStats() {
738
+ return { ...this.crawlStats };
739
+ }
740
+ async triggerStartHandlers() {
741
+ if (this.crawlStarted)
742
+ return;
743
+ this.crawlStarted = true;
744
+ this.crawlStats.startTime = Date.now();
745
+ for (const handler of this.startHandlers) {
746
+ try {
747
+ await handler();
748
+ } catch (error) {
749
+ if (this.config.debug) {
750
+ console.error("[Crawler] onStart handler error:", error);
751
+ }
752
+ }
753
+ }
754
+ }
755
+ async triggerFinishHandlers() {
756
+ this.crawlStats.endTime = Date.now();
757
+ for (const handler of this.finishHandlers) {
758
+ try {
759
+ await handler(this.crawlStats);
760
+ } catch (error) {
761
+ if (this.config.debug) {
762
+ console.error("[Crawler] onFinish handler error:", error);
763
+ }
764
+ }
765
+ }
766
+ }
767
+ async triggerRedirectHandlers(event) {
768
+ for (const handler of this.redirectHandlers) {
769
+ try {
770
+ await handler(event);
771
+ } catch (error) {
772
+ if (this.config.debug) {
773
+ console.error("[Crawler] onRedirect handler error:", error);
774
+ }
775
+ }
776
+ }
777
+ }
778
+ buildUrl(url, params) {
779
+ if (params) {
780
+ const u = new URL(url, this.config.baseUrl);
781
+ for (const [key, value] of Object.entries(params)) {
782
+ u.searchParams.set(key, value.toString());
783
+ }
784
+ url = u.href;
785
+ }
786
+ return url;
787
+ }
788
+ visit(url, options) {
789
+ if (this.config.baseUrl)
790
+ url = new URL(url, this.config.baseUrl).href;
791
+ if (options?.params && (options.useOxylabsScraperAi || this.config.hasDomain(url, "oxylabs"))) {
792
+ url = this.buildUrl(url, options.params);
793
+ }
794
+ const {
795
+ method = "GET",
796
+ headers = new Headers,
797
+ forceRevisit = this.config.forceRevisit,
798
+ body = "",
799
+ timeout = this.config.timeout,
800
+ maxRedirects = this.config.maxRedirects,
801
+ useProxy = this.config.hasDomain(url, "proxies", options?.useProxy),
802
+ extractLeads = false,
803
+ params,
804
+ rejectUnauthorized,
805
+ useQueue = false,
806
+ deepEmailFinder = false,
807
+ useOxylabsScraperAi = false,
808
+ useOxylabsRotation = true,
809
+ useDecodo = false
810
+ } = options || {};
811
+ const _options = {
812
+ headers: this.config.pickHeaders(url, true, headers, true),
813
+ timeout,
814
+ maxRedirects,
815
+ params,
816
+ proxy: useProxy ? this.config.getAdapter(url, "proxies", true, true) || undefined : undefined,
817
+ rejectUnauthorized: typeof rejectUnauthorized === "boolean" ? rejectUnauthorized : this.config.rejectUnauthorized,
818
+ pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined
819
+ };
820
+ let oxylabsOptions = {};
821
+ let oxylabsInstanse = undefined;
822
+ if (useOxylabsScraperAi && this.config.hasDomain(url, "oxylabs")) {
823
+ oxylabsOptions = {
824
+ method: method === "POST" ? "post" : "get",
825
+ headers: this.config.pickHeaders(url, true, headers, true),
826
+ pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined,
827
+ base64Body: typeof body === "string" ? Buffer.from(body).toString("base64") : undefined
828
+ };
829
+ oxylabsInstanse = this.config.getAdapter(url, "oxylabs", false, useOxylabsRotation) || undefined;
830
+ }
831
+ let decodoOptions = {};
832
+ let decodoInstanse = undefined;
833
+ if (useDecodo && this.config.hasDomain(url, "decodo")) {
834
+ decodoOptions = {
835
+ method: method === "POST" ? "post" : "get",
836
+ headers: this.config.pickHeaders(url, true, headers, true),
837
+ pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined,
838
+ base64Body: typeof body === "string" ? Buffer.from(body).toString("base64") : undefined
839
+ };
840
+ decodoInstanse = this.config.getAdapter(url, "decodo", false, useOxylabsRotation) || undefined;
841
+ }
842
+ if (this.config.enableNavigationHistory) {
843
+ const headersObj = headers instanceof Headers ? Object.fromEntries(headers.entries()) : headers;
844
+ this.addToNavigationQueue(url, method, body, headersObj);
845
+ }
846
+ if (deepEmailFinder) {
847
+ const p = this.execute2(method, url, body, _options, forceRevisit);
848
+ this.pendingExecutions.add(p);
849
+ p.finally(() => this.pendingExecutions.delete(p));
850
+ return this;
851
+ }
852
+ const p = this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
853
+ this.pendingExecutions.add(p);
854
+ p.finally(() => this.pendingExecutions.delete(p));
855
+ return this;
856
+ }
857
+ async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
858
+ await this.waitForStorage();
859
+ if (this.isCacheEnabled) {
860
+ await this.waitForCache();
861
+ }
862
+ if (this.config.enableNavigationHistory) {
863
+ await this.waitForNavigationHistory();
864
+ }
865
+ const task = this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
866
+ task.finally(() => this.pendingExecutions.delete(task));
867
+ }
868
+ async execute2(method, url, body, options = {}, forceRevisit) {
869
+ await this.waitForStorage();
870
+ if (this.isCacheEnabled) {
871
+ await this.waitForCache();
872
+ }
873
+ if (this.config.enableNavigationHistory) {
874
+ await this.waitForNavigationHistory();
875
+ }
876
+ this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
877
+ httpConfig: options,
878
+ saveCache: this.saveCache.bind(this),
879
+ saveUrl: this.saveUrl.bind(this),
880
+ getCache: this.getCache.bind(this),
881
+ hasUrlInCache: this.hasUrlInCache.bind(this),
882
+ onEmailDiscovered: this.emailDiscoveredEvents,
883
+ onEmails: this.emailLeadsEvents,
884
+ queue: this.queue,
885
+ depth: 1,
886
+ allowCrossDomainTravel: true
887
+ }, forceRevisit, true)).then();
888
+ }
889
+ async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0, parentUrl) {
890
+ try {
891
+ await this.triggerStartHandlers();
892
+ const limitCheck = await this.checkCrawlLimits(url, parentUrl);
893
+ if (!limitCheck.allowed) {
894
+ if (this.config.debug) {
895
+ console.log(`[Crawler] Skipping ${url}: ${limitCheck.reason}`);
896
+ }
897
+ return;
898
+ }
899
+ this.crawlStats.urlsQueued++;
900
+ const domain = new URL(url).hostname;
901
+ const delay = this.getAutoThrottleDelay(domain);
902
+ if (delay > 0) {
903
+ await new Promise((resolve) => setTimeout(resolve, delay));
904
+ }
905
+ const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
906
+ const cache = await this.getCache(url);
907
+ if (isVisited && !cache)
908
+ return;
909
+ if (isVisited && method !== "GET")
910
+ return;
911
+ const requestStartTime = Date.now();
912
+ const response = cache && method === "GET" ? cache : oxylabsInstanse && oxylabsOptions ? await oxylabsInstanse.scrape(url) : decodoInstanse && decodoOptions ? await decodoInstanse.scrape(url) : await (method === "GET" ? this.http.get(url, options) : method === "PATCH" ? this.http.patch(url, body, options) : method === "POST" ? this.http.post(url, body, options) : this.http.put(url, body, options));
913
+ if (!cache) {
914
+ const responseTime = Date.now() - requestStartTime;
915
+ this.calculateAutoThrottleDelay(domain, responseTime);
916
+ }
917
+ const res = {
918
+ data: response.data || response.content || "",
919
+ contentType: response.contentType || "",
920
+ finalUrl: response.finalUrl || response.url || url,
921
+ url: response?.urls?.[0] || response.url || this.buildUrl(url, options.params),
922
+ headers: response.headers || {},
923
+ status: response.status || response.statusCode || 200,
924
+ statusText: response.statusText || "",
925
+ cookies: response?.cookies?.serialized || response?.cookies,
926
+ contentLength: response.contentLength || 0
927
+ };
928
+ if (res.contentLength && res.contentLength > 0) {
929
+ const sizeCheck = this.checkResponseSize(res.contentLength);
930
+ if (!sizeCheck.allowed) {
931
+ if (this.config.debug) {
932
+ console.log(`[Crawler] Skipping ${url}: ${sizeCheck.reason}`);
933
+ }
934
+ return;
935
+ }
936
+ }
937
+ this.crawlStats.urlsVisited++;
938
+ if (res.finalUrl && res.finalUrl !== url && this.redirectHandlers.length > 0) {
939
+ await this.triggerRedirectHandlers({
940
+ originalUrl: url,
941
+ finalUrl: res.finalUrl,
942
+ redirectCount: response.redirectCount || 1,
943
+ statusCode: res.status
944
+ });
945
+ }
946
+ if (!cache)
947
+ await this.saveCache(url, res);
948
+ if (!isVisited)
949
+ await this.saveUrl(url);
950
+ await this.markUrlVisited(url, {
951
+ status: res.status,
952
+ finalUrl: res.finalUrl,
953
+ contentType: res.contentType
954
+ });
955
+ if (res.contentType && res.contentType.includes("/json")) {
956
+ if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
957
+ this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
958
+ }
959
+ for (let i = 0;i < this.jsonEvents.length; i++) {
960
+ const event = this.jsonEvents[i];
961
+ this[event.handler](...event.attr, res.data);
962
+ }
963
+ }
964
+ for (let i = 0;i < this.responseEvents.length; i++) {
965
+ const event = this.responseEvents[i];
966
+ this[event.handler](...event.attr, res);
967
+ }
968
+ this.rawResponseHandler(res.data);
969
+ if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
970
+ return;
971
+ if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
972
+ this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
973
+ }
974
+ const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
975
+ document.URL = res.finalUrl;
976
+ for (let i = 0;i < this.events.length; i++) {
977
+ const event = this.events[i];
978
+ this[event.handler](...event.attr, document);
979
+ }
980
+ } catch (e) {
981
+ const error = e;
982
+ if (error?.response?.status === 429 || error?.status === 429) {
983
+ try {
984
+ const { shouldRetry, waitTime } = await this.handle429Response(url, error.response || error);
985
+ if (shouldRetry) {
986
+ await this.sleep(waitTime);
987
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
988
+ }
989
+ } catch (rateLimitError) {
990
+ this.crawlStats.urlsFailed++;
991
+ if (this.config.throwFatalError)
992
+ throw rateLimitError;
993
+ for (let i = 0;i < this.errorEvents.length; i++) {
994
+ const event = this.errorEvents[i];
995
+ this[event.handler](...event.attr, rateLimitError);
996
+ }
997
+ return;
998
+ }
999
+ }
1000
+ if (error && error.response) {
1001
+ const status = error.response.status;
1002
+ const retryDelay = this.config.retryDelay || 1000;
1003
+ const maxRetryAttempts = this.config.maxRetryAttempts || 3;
1004
+ const maxRetryOnProxyError = this.config.maxRetryOnProxyError || 3;
1005
+ const retryWithoutProxyOnStatusCode = this.config.retryWithoutProxyOnStatusCode || undefined;
1006
+ const retryOnStatusCode = this.config.retryOnStatusCode || undefined;
1007
+ const retryOnProxyError = this.config.retryOnProxyError || undefined;
1008
+ if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
1009
+ await this.sleep(retryDelay);
1010
+ delete options.proxy;
1011
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
1012
+ } else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
1013
+ await this.sleep(retryDelay);
1014
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
1015
+ } else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
1016
+ await this.sleep(retryDelay);
1017
+ return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1, parentUrl);
1018
+ }
1019
+ }
1020
+ this.crawlStats.urlsFailed++;
1021
+ await this.markUrlVisited(url, {
1022
+ status: error?.response?.status || 0,
1023
+ errorMessage: e.message || "Unknown error"
1024
+ });
1025
+ if (this.config.throwFatalError)
1026
+ throw e;
1027
+ if (this.config.debug) {
1028
+ console.log(`Error visiting ${url}: ${e.message}`);
1029
+ }
1030
+ for (let i = 0;i < this.errorEvents.length; i++) {
1031
+ const event = this.errorEvents[i];
1032
+ this[event.handler](...event.attr, e);
1033
+ }
1034
+ }
1035
+ }
1036
+ async waitForAll() {
1037
+ if (this.pendingExecutions.size > 0) {
1038
+ await Promise.allSettled([...this.pendingExecutions]);
1039
+ }
1040
+ await this.queue.onIdle();
1041
+ await this.triggerFinishHandlers();
1042
+ }
1043
+ async done() {
1044
+ return this.waitForAll();
1045
+ }
1046
+ async close() {
1047
+ try {
1048
+ await this.cacher?.close();
1049
+ } catch {}
1050
+ try {
1051
+ await this.urlStorage?.close();
1052
+ } catch {}
1053
+ try {
1054
+ await this.navigationHistory?.close();
1055
+ } catch {}
1056
+ }
1057
+ async destroy() {
1058
+ this.queue.clear();
1059
+ this.events.length = 0;
1060
+ this.jsonEvents.length = 0;
1061
+ this.errorEvents.length = 0;
1062
+ this.responseEvents.length = 0;
1063
+ this.rawResponseEvents.length = 0;
1064
+ this.emailDiscoveredEvents.length = 0;
1065
+ this.emailLeadsEvents.length = 0;
1066
+ await this.close();
1067
+ }
1068
+ }
1069
+
1070
+ exports.Crawler = Crawler;