rezo 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135):
  1. package/LICENSE +202 -0
  2. package/README.md +1507 -0
  3. package/assets/icon.svg +37 -0
  4. package/assets/logo-dark.svg +47 -0
  5. package/assets/logo.svg +58 -0
  6. package/dist/adapters/curl.cjs +1034 -0
  7. package/dist/adapters/curl.js +1031 -0
  8. package/dist/adapters/entries/curl.cjs +4 -0
  9. package/dist/adapters/entries/curl.d.ts +2136 -0
  10. package/dist/adapters/entries/curl.js +2 -0
  11. package/dist/adapters/entries/fetch.cjs +2 -0
  12. package/dist/adapters/entries/fetch.d.ts +2127 -0
  13. package/dist/adapters/entries/fetch.js +1 -0
  14. package/dist/adapters/entries/http.cjs +2 -0
  15. package/dist/adapters/entries/http.d.ts +2126 -0
  16. package/dist/adapters/entries/http.js +1 -0
  17. package/dist/adapters/entries/http2.cjs +4 -0
  18. package/dist/adapters/entries/http2.d.ts +2136 -0
  19. package/dist/adapters/entries/http2.js +2 -0
  20. package/dist/adapters/entries/react-native.cjs +2 -0
  21. package/dist/adapters/entries/react-native.d.ts +2126 -0
  22. package/dist/adapters/entries/react-native.js +1 -0
  23. package/dist/adapters/entries/xhr.cjs +2 -0
  24. package/dist/adapters/entries/xhr.d.ts +2127 -0
  25. package/dist/adapters/entries/xhr.js +1 -0
  26. package/dist/adapters/fetch.cjs +740 -0
  27. package/dist/adapters/fetch.js +739 -0
  28. package/dist/adapters/http.cjs +1153 -0
  29. package/dist/adapters/http.js +1151 -0
  30. package/dist/adapters/http2.cjs +957 -0
  31. package/dist/adapters/http2.js +956 -0
  32. package/dist/adapters/index.cjs +6 -0
  33. package/dist/adapters/index.js +7 -0
  34. package/dist/adapters/picker.cjs +342 -0
  35. package/dist/adapters/picker.js +331 -0
  36. package/dist/adapters/react-native.cjs +545 -0
  37. package/dist/adapters/react-native.js +544 -0
  38. package/dist/adapters/xhr.cjs +622 -0
  39. package/dist/adapters/xhr.js +621 -0
  40. package/dist/cache/dns-cache.cjs +118 -0
  41. package/dist/cache/dns-cache.js +113 -0
  42. package/dist/cache/file-cacher.cjs +264 -0
  43. package/dist/cache/file-cacher.js +261 -0
  44. package/dist/cache/index.cjs +13 -0
  45. package/dist/cache/index.js +5 -0
  46. package/dist/cache/lru-cache.cjs +96 -0
  47. package/dist/cache/lru-cache.js +93 -0
  48. package/dist/cache/response-cache.cjs +314 -0
  49. package/dist/cache/response-cache.js +310 -0
  50. package/dist/cache/url-store.cjs +288 -0
  51. package/dist/cache/url-store.js +285 -0
  52. package/dist/core/hooks.cjs +133 -0
  53. package/dist/core/hooks.js +120 -0
  54. package/dist/core/rezo.cjs +464 -0
  55. package/dist/core/rezo.js +458 -0
  56. package/dist/crawler.d.ts +6255 -0
  57. package/dist/dom/index.cjs +1 -0
  58. package/dist/dom/index.d.ts +23 -0
  59. package/dist/dom/index.js +1 -0
  60. package/dist/entries/crawler.cjs +5 -0
  61. package/dist/entries/crawler.js +2 -0
  62. package/dist/errors/rezo-error.cjs +722 -0
  63. package/dist/errors/rezo-error.js +716 -0
  64. package/dist/index.cjs +34 -0
  65. package/dist/index.d.ts +3335 -0
  66. package/dist/index.js +26 -0
  67. package/dist/platform/browser.cjs +9 -0
  68. package/dist/platform/browser.d.ts +3203 -0
  69. package/dist/platform/browser.js +7 -0
  70. package/dist/platform/bun.cjs +9 -0
  71. package/dist/platform/bun.d.ts +3203 -0
  72. package/dist/platform/bun.js +7 -0
  73. package/dist/platform/deno.cjs +9 -0
  74. package/dist/platform/deno.d.ts +3203 -0
  75. package/dist/platform/deno.js +7 -0
  76. package/dist/platform/node.cjs +9 -0
  77. package/dist/platform/node.d.ts +3203 -0
  78. package/dist/platform/node.js +7 -0
  79. package/dist/platform/react-native.cjs +9 -0
  80. package/dist/platform/react-native.d.ts +3203 -0
  81. package/dist/platform/react-native.js +7 -0
  82. package/dist/platform/worker.cjs +9 -0
  83. package/dist/platform/worker.d.ts +3203 -0
  84. package/dist/platform/worker.js +7 -0
  85. package/dist/plugin/addon/decodo/index.cjs +1 -0
  86. package/dist/plugin/addon/decodo/index.js +1 -0
  87. package/dist/plugin/addon/decodo/options.cjs +1 -0
  88. package/dist/plugin/addon/decodo/options.js +1 -0
  89. package/dist/plugin/addon/oxylabs/index.cjs +1 -0
  90. package/dist/plugin/addon/oxylabs/index.js +1 -0
  91. package/dist/plugin/addon/oxylabs/options.cjs +1 -0
  92. package/dist/plugin/addon/oxylabs/options.js +1 -0
  93. package/dist/plugin/crawler-options.cjs +1 -0
  94. package/dist/plugin/crawler-options.js +1 -0
  95. package/dist/plugin/crawler.cjs +519 -0
  96. package/dist/plugin/crawler.js +517 -0
  97. package/dist/plugin/index.cjs +36 -0
  98. package/dist/plugin/index.js +32 -0
  99. package/dist/proxy/index.cjs +142 -0
  100. package/dist/proxy/index.js +139 -0
  101. package/dist/responses/buildError.cjs +452 -0
  102. package/dist/responses/buildError.js +441 -0
  103. package/dist/responses/buildResponse.cjs +365 -0
  104. package/dist/responses/buildResponse.js +361 -0
  105. package/dist/responses/download.cjs +54 -0
  106. package/dist/responses/download.js +52 -0
  107. package/dist/responses/stream.cjs +60 -0
  108. package/dist/responses/stream.js +58 -0
  109. package/dist/responses/upload.cjs +54 -0
  110. package/dist/responses/upload.js +52 -0
  111. package/dist/types/cookies.cjs +394 -0
  112. package/dist/types/cookies.js +391 -0
  113. package/dist/types/download.cjs +10 -0
  114. package/dist/types/download.js +10 -0
  115. package/dist/types/rezo-request.cjs +131 -0
  116. package/dist/types/rezo-request.js +131 -0
  117. package/dist/utils/agent-merger.cjs +111 -0
  118. package/dist/utils/agent-merger.js +108 -0
  119. package/dist/utils/compression.cjs +84 -0
  120. package/dist/utils/compression.js +82 -0
  121. package/dist/utils/cookies.cjs +514 -0
  122. package/dist/utils/cookies.js +511 -0
  123. package/dist/utils/data-operations.cjs +75 -0
  124. package/dist/utils/data-operations.js +73 -0
  125. package/dist/utils/form-data.cjs +164 -0
  126. package/dist/utils/form-data.js +161 -0
  127. package/dist/utils/headers.cjs +162 -0
  128. package/dist/utils/headers.js +161 -0
  129. package/dist/utils/http-config.cjs +723 -0
  130. package/dist/utils/http-config.js +718 -0
  131. package/dist/utils/index.cjs +8 -0
  132. package/dist/utils/index.js +8 -0
  133. package/dist/utils/tools.cjs +18 -0
  134. package/dist/utils/tools.js +15 -0
  135. package/package.json +172 -0
@@ -0,0 +1,519 @@
1
+ const fs = require("node:fs");
2
+ const { FileCacher } = require('../cache/file-cacher.cjs');
3
+ const { UrlStore } = require('../cache/url-store.cjs');
4
+ const { parseHTML } = require("linkedom");
5
+ const path = require("node:path");
6
+ const PQueue = require("p-queue");
7
+ const { Scraper } = require('./scraper.cjs');
8
+ const { CrawlerOptions } = require('./crawler-options.cjs');
9
/**
 * Returns this HTML string with every existing `<base>` tag removed and a
 * single `<base href="...">` pointing at `url` inserted.
 *
 * Insertion point, in order of preference:
 *   1. right after the first `<head ...>` opening tag;
 *   2. in a new `<head>` placed immediately before the first `<body ...>` tag;
 *   3. in a new `<head>` placed immediately after the first `<html ...>` tag.
 * When none of these tags is present the original text is returned unchanged
 * (existing `<base>` tags included).
 *
 * NOTE(review): this extends a native prototype, which affects every string in
 * the process. It is kept because callers use `html.addBaseUrl(url)`.
 *
 * @param {string|URL} url - Base URL to inject; a `URL` is reduced to its `href`.
 * @returns {string} The rewritten HTML as a primitive string.
 */
String.prototype.addBaseUrl = function(url) {
  const href = url instanceof URL ? url.href : url;
  // A raw double quote would terminate the href attribute early; "%22" is the
  // equivalent percent-encoded form and needs no HTML entity decoding.
  const safeHref = String(href).replace(/"/g, "%22");
  const html = this.replace(/<base\b[^>]*?>/gi, "");
  const baseElement = `<base href="${safeHref}">`;

  // Function replacers are used throughout so that "$&", "$1", "$'" etc.
  // occurring inside the URL are inserted literally instead of being treated
  // as replacement patterns.
  if (/<head[^>]*>/i.test(html)) {
    return html.replace(/<head[^>]*>/i, (match) => `${match}\n${baseElement}`);
  }

  const headBlock = `<head>\n${baseElement}\n</head>\n`;
  if (/<body[^>]*>/i.test(html)) {
    return html.replace(/<body[^>]*>/i, (match) => headBlock + match);
  }
  if (/<html[^>]*>/i.test(html)) {
    return html.replace(/<html[^>]*>/i, (match) => `${match}\n${headBlock}`);
  }

  // String(this) guarantees a primitive even when `this` is a String wrapper
  // object (non-strict CommonJS code).
  return String(this);
};
29
+
30
/**
 * Queue-driven crawler built on top of an injected HTTP client.
 *
 * Callers register handlers with the chainable `on*` methods, schedule pages
 * with `visit()`, then `await waitForAll()` and finally `close()`.
 * Each fetched response is normalised, optionally cached, and dispatched to the
 * registered handlers; HTML responses are parsed with `parseHTML` and handed to
 * the DOM handlers.
 */
class Crawler {
  http;
  // DOM-level handlers ({ handler: methodName, attr: [...userArgs] }).
  events = [];
  jsonEvents = [];
  errorEvents = [];
  responseEvents = [];
  rawResponseEvents = [];
  // Plain handler functions, passed through to the leads finder.
  emailDiscoveredEvents = [];
  emailLeadsEvents = [];
  cacher = null;
  queue;
  isCacheEnabled;
  config;
  urlStorage;
  isStorageReady = false;
  isCacheReady = false;
  leadsFinder;
  // Promises that settle once the response cache / URL store has been created.
  cacheInit;
  storageInit;

  /**
   * @param {object} crawlerOptions - Raw options, wrapped in `CrawlerOptions`.
   * @param {object} http - HTTP client exposing `get`, `post`, `put` and `patch`.
   */
  constructor(crawlerOptions, http) {
    this.http = http;
    this.queue = new PQueue({
      concurrency: 1000
    });
    this.config = new CrawlerOptions(crawlerOptions);
    const enableCache = this.config.enableCache;
    this.isCacheEnabled = enableCache;
    const cacheDir = this.config.cacheDir;

    let urlStoreDir;
    if (enableCache) {
      const cacheTTL = this.config.cacheTTL;
      // Normalise to a relative-or-absolute directory path with a trailing slash.
      const dbUrl = cacheDir && (cacheDir.startsWith("./") || cacheDir.startsWith("/")) ? `${cacheDir}${cacheDir.endsWith("/") ? "" : "/"}` : cacheDir ? `./${cacheDir}${cacheDir.endsWith("/") ? "" : "/"}` : `./cache/`;
      if (!fs.existsSync(path.dirname(dbUrl)))
        fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
      this.cacheInit = FileCacher.create({
        cacheDir: dbUrl,
        softDelete: false,
        ttl: cacheTTL,
        encryptNamespace: true
      }).then((storage) => {
        this.cacher = storage;
        this.isCacheReady = true;
      });
      // Mark the rejection as handled here; waitForCache() re-surfaces it.
      this.cacheInit.catch(() => {});
      // path.resolve throws on null/undefined, so fall back to the defaulted dir.
      urlStoreDir = path.resolve(cacheDir ?? dbUrl, "urls");
    } else {
      urlStoreDir = path.resolve(cacheDir ?? ".", "./cache/urls");
    }

    if (!fs.existsSync(urlStoreDir))
      fs.mkdirSync(urlStoreDir, { recursive: true });
    this.storageInit = UrlStore.create({
      storeDir: urlStoreDir,
      dbFileName: ".url_cache.db",
      ttl: 1000 * 60 * 60 * 24 * 7 // 7 days in milliseconds
    }).then((storage) => {
      this.urlStorage = storage;
      this.isStorageReady = true;
    });
    // Mark the rejection as handled here; waitForStorage() re-surfaces it.
    this.storageInit.catch(() => {});

    this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
  }

  /**
   * Converts response data to a `Buffer` and passes it to every `onRawData`
   * handler. Does nothing when no raw handler is registered.
   *
   * @param {Buffer|ArrayBuffer|Uint8Array|string|object} data - Response payload.
   */
  rawResponseHandler(data) {
    if (this.rawResponseEvents.length === 0)
      return;
    if (!(data instanceof Buffer)) {
      if (data instanceof ArrayBuffer) {
        data = Buffer.from(new Uint8Array(data));
      } else if (data instanceof Uint8Array) {
        data = Buffer.from(data);
      } else if (typeof data === "string") {
        data = Buffer.from(data, "utf8");
      } else if (typeof data === "object") {
        data = Buffer.from(JSON.stringify(data), "utf8");
      }
    }
    this.rawResponseEvents.forEach((e) => {
      const handler = e.attr[0];
      handler(data);
    });
  }

  /**
   * Resolves once the response cache is ready; rejects if its creation failed.
   */
  async waitForCache() {
    if (this.isCacheReady)
      return;
    await this.cacheInit;
  }

  /**
   * Resolves once the URL store is ready; rejects if its creation failed.
   */
  async waitForStorage() {
    if (this.isStorageReady)
      return;
    await this.storageInit;
  }

  /** Records `url` in the URL store. */
  async saveUrl(url) {
    await this.waitForStorage();
    await this.urlStorage.set(url);
  }

  /** @returns {Promise<*>} Result of the URL store's `has(url)`. */
  async hasUrlInCache(url) {
    await this.waitForStorage();
    return await this.urlStorage.has(url);
  }

  /** Stores `value` for `url` in the response cache; no-op when caching is disabled. */
  async saveCache(url, value) {
    if (!this.isCacheEnabled)
      return;
    await this.waitForCache();
    return this.cacher.set(url, value, this.config.cacheTTL, this.getNamespace(url));
  }

  /**
   * @param {string} url
   * @returns {string|undefined} Hostname of `url`, or `undefined` when it cannot be parsed.
   */
  getNamespace(url) {
    try {
      return new URL(url).hostname;
    } catch {
      return;
    }
  }

  /** @returns {Promise<*>} `false` when caching is disabled, else the cache's `has` result. */
  async hasCache(url) {
    if (!this.isCacheEnabled)
      return false;
    await this.waitForCache();
    return this.cacher.has(url, this.getNamespace(url));
  }

  /** @returns {Promise<*>} `null` when caching is disabled, else the cache's `get` result. */
  async getCache(url) {
    if (!this.isCacheEnabled)
      return null;
    await this.waitForCache();
    return this.cacher.get(url, this.getNamespace(url));
  }

  /** @returns {Promise<void>} Resolves after `ms` milliseconds. */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** @returns {number} Random integer between `min` and `max`, inclusive. */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /** Registers a handler called with the error of a failed request. */
  onError(handler) {
    this.errorEvents.push({
      handler: "_onError",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with the data of responses whose content type contains "/json". */
  onJson(handler) {
    this.jsonEvents.push({
      handler: "_onJson",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler that is forwarded to the leads finder for discovered emails. */
  onEmailDiscovered(handler) {
    this.emailDiscoveredEvents.push(handler);
    return this;
  }

  /** Registers a handler that is forwarded to the leads finder for email lead lists. */
  onEmailLeads(handler) {
    this.emailLeadsEvents.push(handler);
    return this;
  }

  /** Registers a handler called with each response body as a `Buffer`. */
  onRawData(handler) {
    this.rawResponseEvents.push({
      handler: "_onRawResponse",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with the parsed document of each HTML response. */
  onDocument(handler) {
    this.events.push({
      handler: "_onDocument",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with `document.body` of each HTML response. */
  onBody(handler) {
    this.events.push({
      handler: "_onBody",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called once per element of each HTML response. */
  onElement(handler) {
    this.events.push({
      handler: "_onElement",
      attr: [handler]
    });
    return this;
  }

  /**
   * Registers a handler called with matching anchor elements.
   * May be called as `onAnchor(handler)`, in which case the selector is "a".
   */
  onAnchor(selection, handler) {
    this.events.push({
      handler: "_onAnchor",
      attr: [selection, handler]
    });
    return this;
  }

  /** Registers a handler called with the absolute `href` of every `a` and `link` element. */
  onHref(handler) {
    this.events.push({
      handler: "_onHref",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with every element matching `selection`. */
  onSelection(selection, handler) {
    this.events.push({
      handler: "_onSelection",
      attr: [selection, handler]
    });
    return this;
  }

  /** Registers a handler called with the normalised response object of every request. */
  onResponse(handler) {
    this.responseEvents.push({
      handler: "_onResponse",
      attr: [handler]
    });
    return this;
  }

  /**
   * Registers a handler called with attribute values.
   * May be called as `onAttribute(selection, attribute, handler)` or as
   * `onAttribute(attribute, handler)`.
   */
  onAttribute(selection, attribute, handler) {
    this.events.push({
      handler: "_onAttribute",
      attr: [selection, attribute, handler]
    });
    return this;
  }

  /** Registers a handler called with the `textContent` of every element matching `selection`. */
  onText(selection, handler) {
    this.events.push({
      handler: "_onText",
      attr: [selection, handler]
    });
    return this;
  }

  _onBody(handler, document) {
    this.queue.add(() => handler(document.body));
  }

  /**
   * Queues `handler(value)` for each matching element that has `attribute`.
   *
   * Dispatch always appends the document as the fourth argument, so the
   * two-argument registration arrives as `(attribute, handler, undefined, document)`.
   * In that form the arguments are shifted and the selector defaults to
   * `[attribute]`.
   */
  _onAttribute(selection, attribute, handler, document) {
    if (typeof attribute === "function") {
      handler = attribute;
      attribute = selection;
      selection = null;
    }
    selection = selection || `[${attribute}]`;
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      if (elements[i].hasAttribute(attribute))
        this.queue.add(() => handler(elements[i].getAttribute(attribute)));
    }
  }

  _onText(selection, handler, document) {
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      this.queue.add(() => handler(elements[i].textContent));
    }
  }

  _onSelection(selection, handler, document) {
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      this.queue.add(() => handler(elements[i]));
    }
  }

  _onElement(handler, document) {
    const elements = document.querySelectorAll("*");
    for (let i = 0; i < elements.length; i++) {
      this.queue.add(() => handler(elements[i]));
    }
  }

  _onHref(handler, document) {
    const elements = document.querySelectorAll("a, link");
    for (let i = 0; i < elements.length; i++) {
      if (elements[i].hasAttribute("href"))
        this.queue.add(() => handler(new URL(elements[i].getAttribute("href"), document.URL).href));
    }
  }

  /**
   * Queues `handler(element)` for each matching element, first rewriting its
   * `href` to an absolute URL against `document.baseURI` when both are present.
   */
  _onAnchor(selection, handler, document) {
    // Single-argument registration: the handler arrives in the selector slot.
    handler = typeof selection === "function" ? selection : handler;
    selection = typeof selection === "function" ? "a" : selection;
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      if (elements[i]?.href && document.baseURI)
        elements[i].href = new URL(elements[i].getAttribute("href"), document.baseURI).href;
      this.queue.add(() => handler(elements[i]));
    }
  }

  _onDocument(handler, document) {
    this.queue.add(() => handler(document));
  }

  _onJson(handler, json) {
    this.queue.add(() => handler(json));
  }

  _onError(handler, error) {
    this.queue.add(() => handler(error));
  }

  async _onEmailDiscovered(handler, email) {
    await handler(email);
  }

  async _onEmailLeads(handler, emails) {
    await handler(emails);
  }

  _onRawResponse(handler, rawResponse) {
    this.queue.add(() => handler(rawResponse));
  }

  _onResponse(handler, response) {
    this.queue.add(() => handler(response));
  }

  /**
   * Resolves `url` against `config.baseUrl` and sets each entry of `params` as
   * a query parameter. Entries whose value is `null` or `undefined` are skipped.
   *
   * @param {string} url
   * @param {object} [params]
   * @returns {string} The resulting absolute URL, or `url` untouched when `params` is falsy.
   */
  buildUrl(url, params) {
    if (params) {
      const u = new URL(url, this.config.baseUrl);
      for (const [key, value] of Object.entries(params)) {
        if (value == null)
          continue;
        u.searchParams.set(key, value.toString());
      }
      url = u.href;
    }
    return url;
  }

  /**
   * Schedules a request for `url` on the queue and returns immediately.
   *
   * Per-call `options` fall back to the crawler config for `forceRevisit`,
   * `timeout`, `maxRedirects` and `rejectUnauthorized`. With
   * `deepEmailFinder: true` the URL is handed to the leads finder instead of
   * the regular request pipeline.
   *
   * @param {string} url - Absolute URL, or one relative to `config.baseUrl`.
   * @param {object} [options]
   * @returns {this}
   */
  visit(url, options) {
    if (this.config.baseUrl)
      url = new URL(url, this.config.baseUrl).href;
    if (options?.params && (options.useOxylabsScraperAi || this.config.hasDomain(url, "oxylabs"))) {
      url = this.buildUrl(url, options.params);
    }
    const {
      method = "GET",
      headers = new Headers,
      forceRevisit = this.config.forceRevisit,
      body = "",
      timeout = this.config.timeout,
      maxRedirects = this.config.maxRedirects,
      useProxy = this.config.hasDomain(url, "proxies", options?.useProxy),
      extractLeads = false,
      params,
      rejectUnauthorized,
      useQueue = false,
      deepEmailFinder = false,
      useOxylabsScraperAi = false,
      useOxylabsRotation = true,
      useDecodo = false
    } = options || {};
    const _options = {
      headers: this.config.pickHeaders(url, true, headers, true),
      timeout,
      maxRedirects,
      params,
      proxy: useProxy ? this.config.getAdapter(url, "proxies", true, true) || undefined : undefined,
      rejectUnauthorized: typeof rejectUnauthorized === "boolean" ? rejectUnauthorized : this.config.rejectUnauthorized,
      pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined
    };
    let oxylabsOptions = {};
    let oxylabsInstanse = undefined;
    if (useOxylabsScraperAi && this.config.hasDomain(url, "oxylabs")) {
      oxylabsOptions = {
        method: method === "POST" ? "post" : "get",
        headers: this.config.pickHeaders(url, true, headers, true),
        pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined,
        base64Body: typeof body === "string" ? Buffer.from(body).toString("base64") : undefined
      };
      oxylabsInstanse = this.config.getAdapter(url, "oxylabs", false, useOxylabsRotation) || undefined;
    }
    let decodoOptions = {};
    let decodoInstanse = undefined;
    if (useDecodo && this.config.hasDomain(url, "decodo")) {
      decodoOptions = {
        method: method === "POST" ? "post" : "get",
        headers: this.config.pickHeaders(url, true, headers, true),
        pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined,
        base64Body: typeof body === "string" ? Buffer.from(body).toString("base64") : undefined
      };
      // NOTE(review): reuses useOxylabsRotation for the Decodo adapter — confirm this is intended.
      decodoInstanse = this.config.getAdapter(url, "decodo", false, useOxylabsRotation) || undefined;
    }
    // Both paths are fire-and-forget; completion is observed through waitForAll().
    if (deepEmailFinder) {
      this.execute2(method, url, body, _options, forceRevisit);
      return this;
    }
    this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
    return this;
  }

  /** Enqueues `executeHttp` with the given arguments without awaiting its completion. */
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
    this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
  }

  /** Enqueues the leads finder's `parseExternalWebsite` for `url` without awaiting its completion. */
  async execute2(method, url, body, options = {}, forceRevisit) {
    this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
      httpConfig: options,
      saveCache: this.saveCache.bind(this),
      saveUrl: this.saveUrl.bind(this),
      getCache: this.getCache.bind(this),
      hasUrlInCache: this.hasUrlInCache.bind(this),
      onEmailDiscovered: this.emailDiscoveredEvents,
      onEmails: this.emailLeadsEvents,
      queue: this.queue,
      depth: 1,
      allowCrossDomainTravel: true
    }, forceRevisit, true));
  }

  /**
   * Performs one request (or serves it from cache), normalises the response and
   * dispatches it to the registered handlers.
   *
   * A URL already recorded as visited is skipped unless a cached GET response
   * exists for it. On failure with an `error.response`, the request may be
   * retried according to the `retry*` config values; otherwise the error is
   * rethrown when `config.throwFatalError` is set, or passed to the `onError`
   * handlers.
   *
   * @param {number} [retryCount=0] - Number of retries already performed.
   */
  async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0) {
    try {
      const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
      const cache = await this.getCache(url);
      if (isVisited && !cache)
        return;
      if (isVisited && method !== "GET")
        return;

      let response;
      if (cache && method === "GET") {
        response = cache;
      } else if (oxylabsInstanse && oxylabsOptions) {
        response = await oxylabsInstanse.scrape(url);
      } else if (decodoInstanse && decodoOptions) {
        response = await decodoInstanse.scrape(url);
      } else if (method === "GET") {
        response = await this.http.get(url, options);
      } else if (method === "PATCH") {
        response = await this.http.patch(url, body, options);
      } else if (method === "POST") {
        response = await this.http.post(url, body, options);
      } else {
        // Any other method is sent as PUT.
        response = await this.http.put(url, body, options);
      }

      // Responses from the different sources use different field names; pick whichever is present.
      const res = {
        data: response.data || response.content || "",
        contentType: response.contentType || "",
        finalUrl: response.finalUrl || response.url || url,
        url: response?.urls?.[0] || response.url || this.buildUrl(url, options.params),
        headers: response.headers || {},
        status: response.status || response.statusCode || 200,
        statusText: response.statusText || "",
        cookies: response?.cookies?.serialized || response?.cookies,
        contentLength: response.contentLength || 0
      };
      if (!cache)
        await this.saveCache(url, res);
      if (!isVisited)
        await this.saveUrl(url);

      if (res.contentType && res.contentType.includes("/json")) {
        if (this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) {
          this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
        }
        for (let i = 0; i < this.jsonEvents.length; i++) {
          const event = this.jsonEvents[i];
          this[event.handler](...event.attr, res.data);
        }
      }
      for (let i = 0; i < this.responseEvents.length; i++) {
        const event = this.responseEvents[i];
        this[event.handler](...event.attr, res);
      }
      this.rawResponseHandler(res.data);

      // DOM handlers only apply to textual HTML responses.
      if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
        return;
      if ((this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0) && isEmail) {
        this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
      }
      const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
      document.URL = res.finalUrl;
      for (let i = 0; i < this.events.length; i++) {
        const event = this.events[i];
        this[event.handler](...event.attr, document);
      }
    } catch (e) {
      const error = e;
      if (error && error.response) {
        const status = error.response.status;
        const retryDelay = this.config.retryDelay || 1000;
        const maxRetryAttempts = this.config.maxRetryAttempts || 3;
        const maxRetryOnProxyError = this.config.maxRetryOnProxyError || 3;
        const retryWithoutProxyOnStatusCode = this.config.retryWithoutProxyOnStatusCode || undefined;
        const retryOnStatusCode = this.config.retryOnStatusCode || undefined;
        const retryOnProxyError = this.config.retryOnProxyError || undefined;
        // NOTE(review): every retry branch requires options.proxy, so requests
        // made without a proxy are never retried — confirm this is intended.
        if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
          await this.sleep(retryDelay);
          delete options.proxy;
          return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
        } else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
          await this.sleep(retryDelay);
          return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
        } else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
          await this.sleep(retryDelay);
          return await this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
        }
      }
      if (this.config.throwFatalError)
        throw e;
      if (this.config.debug) {
        console.log(`Error visiting ${url}: ${e.message}`);
        console.log(error);
      }
      for (let i = 0; i < this.errorEvents.length; i++) {
        const event = this.errorEvents[i];
        this[event.handler](...event.attr, e);
      }
    }
  }

  /** Resolves when the queue has no pending or running tasks. */
  async waitForAll() {
    await this.queue.onIdle();
  }

  /**
   * Closes the response cache and the URL store. Best effort: failures
   * (including stores that were never created) are deliberately ignored.
   */
  async close() {
    try {
      await this.cacher.close();
    } catch {}
    try {
      await this.urlStorage.close();
    } catch {}
  }
}
518
+
519
+ exports.Crawler = Crawler;