rezo 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. package/LICENSE +202 -0
  2. package/README.md +1507 -0
  3. package/assets/icon.svg +37 -0
  4. package/assets/logo-dark.svg +47 -0
  5. package/assets/logo.svg +58 -0
  6. package/dist/adapters/curl.cjs +1034 -0
  7. package/dist/adapters/curl.js +1031 -0
  8. package/dist/adapters/entries/curl.cjs +4 -0
  9. package/dist/adapters/entries/curl.d.ts +2136 -0
  10. package/dist/adapters/entries/curl.js +2 -0
  11. package/dist/adapters/entries/fetch.cjs +2 -0
  12. package/dist/adapters/entries/fetch.d.ts +2127 -0
  13. package/dist/adapters/entries/fetch.js +1 -0
  14. package/dist/adapters/entries/http.cjs +2 -0
  15. package/dist/adapters/entries/http.d.ts +2126 -0
  16. package/dist/adapters/entries/http.js +1 -0
  17. package/dist/adapters/entries/http2.cjs +4 -0
  18. package/dist/adapters/entries/http2.d.ts +2136 -0
  19. package/dist/adapters/entries/http2.js +2 -0
  20. package/dist/adapters/entries/react-native.cjs +2 -0
  21. package/dist/adapters/entries/react-native.d.ts +2126 -0
  22. package/dist/adapters/entries/react-native.js +1 -0
  23. package/dist/adapters/entries/xhr.cjs +2 -0
  24. package/dist/adapters/entries/xhr.d.ts +2127 -0
  25. package/dist/adapters/entries/xhr.js +1 -0
  26. package/dist/adapters/fetch.cjs +740 -0
  27. package/dist/adapters/fetch.js +739 -0
  28. package/dist/adapters/http.cjs +1153 -0
  29. package/dist/adapters/http.js +1151 -0
  30. package/dist/adapters/http2.cjs +957 -0
  31. package/dist/adapters/http2.js +956 -0
  32. package/dist/adapters/index.cjs +6 -0
  33. package/dist/adapters/index.js +7 -0
  34. package/dist/adapters/picker.cjs +342 -0
  35. package/dist/adapters/picker.js +331 -0
  36. package/dist/adapters/react-native.cjs +545 -0
  37. package/dist/adapters/react-native.js +544 -0
  38. package/dist/adapters/xhr.cjs +622 -0
  39. package/dist/adapters/xhr.js +621 -0
  40. package/dist/cache/dns-cache.cjs +118 -0
  41. package/dist/cache/dns-cache.js +113 -0
  42. package/dist/cache/file-cacher.cjs +264 -0
  43. package/dist/cache/file-cacher.js +261 -0
  44. package/dist/cache/index.cjs +13 -0
  45. package/dist/cache/index.js +5 -0
  46. package/dist/cache/lru-cache.cjs +96 -0
  47. package/dist/cache/lru-cache.js +93 -0
  48. package/dist/cache/response-cache.cjs +314 -0
  49. package/dist/cache/response-cache.js +310 -0
  50. package/dist/cache/url-store.cjs +288 -0
  51. package/dist/cache/url-store.js +285 -0
  52. package/dist/core/hooks.cjs +133 -0
  53. package/dist/core/hooks.js +120 -0
  54. package/dist/core/rezo.cjs +464 -0
  55. package/dist/core/rezo.js +458 -0
  56. package/dist/crawler.d.ts +6255 -0
  57. package/dist/dom/index.cjs +1 -0
  58. package/dist/dom/index.d.ts +23 -0
  59. package/dist/dom/index.js +1 -0
  60. package/dist/entries/crawler.cjs +5 -0
  61. package/dist/entries/crawler.js +2 -0
  62. package/dist/errors/rezo-error.cjs +722 -0
  63. package/dist/errors/rezo-error.js +716 -0
  64. package/dist/index.cjs +34 -0
  65. package/dist/index.d.ts +3335 -0
  66. package/dist/index.js +26 -0
  67. package/dist/platform/browser.cjs +9 -0
  68. package/dist/platform/browser.d.ts +3203 -0
  69. package/dist/platform/browser.js +7 -0
  70. package/dist/platform/bun.cjs +9 -0
  71. package/dist/platform/bun.d.ts +3203 -0
  72. package/dist/platform/bun.js +7 -0
  73. package/dist/platform/deno.cjs +9 -0
  74. package/dist/platform/deno.d.ts +3203 -0
  75. package/dist/platform/deno.js +7 -0
  76. package/dist/platform/node.cjs +9 -0
  77. package/dist/platform/node.d.ts +3203 -0
  78. package/dist/platform/node.js +7 -0
  79. package/dist/platform/react-native.cjs +9 -0
  80. package/dist/platform/react-native.d.ts +3203 -0
  81. package/dist/platform/react-native.js +7 -0
  82. package/dist/platform/worker.cjs +9 -0
  83. package/dist/platform/worker.d.ts +3203 -0
  84. package/dist/platform/worker.js +7 -0
  85. package/dist/plugin/addon/decodo/index.cjs +1 -0
  86. package/dist/plugin/addon/decodo/index.js +1 -0
  87. package/dist/plugin/addon/decodo/options.cjs +1 -0
  88. package/dist/plugin/addon/decodo/options.js +1 -0
  89. package/dist/plugin/addon/oxylabs/index.cjs +1 -0
  90. package/dist/plugin/addon/oxylabs/index.js +1 -0
  91. package/dist/plugin/addon/oxylabs/options.cjs +1 -0
  92. package/dist/plugin/addon/oxylabs/options.js +1 -0
  93. package/dist/plugin/crawler-options.cjs +1 -0
  94. package/dist/plugin/crawler-options.js +1 -0
  95. package/dist/plugin/crawler.cjs +519 -0
  96. package/dist/plugin/crawler.js +517 -0
  97. package/dist/plugin/index.cjs +36 -0
  98. package/dist/plugin/index.js +32 -0
  99. package/dist/proxy/index.cjs +142 -0
  100. package/dist/proxy/index.js +139 -0
  101. package/dist/responses/buildError.cjs +452 -0
  102. package/dist/responses/buildError.js +441 -0
  103. package/dist/responses/buildResponse.cjs +365 -0
  104. package/dist/responses/buildResponse.js +361 -0
  105. package/dist/responses/download.cjs +54 -0
  106. package/dist/responses/download.js +52 -0
  107. package/dist/responses/stream.cjs +60 -0
  108. package/dist/responses/stream.js +58 -0
  109. package/dist/responses/upload.cjs +54 -0
  110. package/dist/responses/upload.js +52 -0
  111. package/dist/types/cookies.cjs +394 -0
  112. package/dist/types/cookies.js +391 -0
  113. package/dist/types/download.cjs +10 -0
  114. package/dist/types/download.js +10 -0
  115. package/dist/types/rezo-request.cjs +131 -0
  116. package/dist/types/rezo-request.js +131 -0
  117. package/dist/utils/agent-merger.cjs +111 -0
  118. package/dist/utils/agent-merger.js +108 -0
  119. package/dist/utils/compression.cjs +84 -0
  120. package/dist/utils/compression.js +82 -0
  121. package/dist/utils/cookies.cjs +514 -0
  122. package/dist/utils/cookies.js +511 -0
  123. package/dist/utils/data-operations.cjs +75 -0
  124. package/dist/utils/data-operations.js +73 -0
  125. package/dist/utils/form-data.cjs +164 -0
  126. package/dist/utils/form-data.js +161 -0
  127. package/dist/utils/headers.cjs +162 -0
  128. package/dist/utils/headers.js +161 -0
  129. package/dist/utils/http-config.cjs +723 -0
  130. package/dist/utils/http-config.js +718 -0
  131. package/dist/utils/index.cjs +8 -0
  132. package/dist/utils/index.js +8 -0
  133. package/dist/utils/tools.cjs +18 -0
  134. package/dist/utils/tools.js +15 -0
  135. package/package.json +172 -0
@@ -0,0 +1,517 @@
1
+ import fs from "node:fs";
2
+ import { FileCacher } from '../cache/file-cacher.js';
3
+ import { UrlStore } from '../cache/url-store.js';
4
+ import { parseHTML } from "linkedom";
5
+ import path from "node:path";
6
+ import PQueue from "p-queue";
7
+ import { Scraper } from './scraper.js';
8
+ import { CrawlerOptions } from './crawler-options.js';
9
+ String.prototype.addBaseUrl = function(url) {
10
+ url = url instanceof URL ? url.href : url;
11
+ const html = this.replace(/<base\b[^>]*?>/gi, "");
12
+ if (/<head[^>]*>/i.test(html)) {
13
+ return html.replace(/<head[^>]*>/i, (match) => `${match}
14
+ <base href="${url}">`);
15
+ }
16
+ const baseTag = `<head>
17
+ <base href="${url}">
18
+ </head>
19
+ `;
20
+ if (/<body[^>]*>/i.test(html)) {
21
+ return html.replace(/<body[^>]*>/i, baseTag + "$&");
22
+ }
23
+ if (/<html[^>]*>/i.test(html)) {
24
+ return html.replace(/<html[^>]*>/i, `$&
25
+ ` + baseTag);
26
+ }
27
+ return this;
28
+ };
29
+
30
/**
 * Event-driven crawler that fetches pages through an HTTP client (or an
 * Oxylabs / Decodo addon instance), optionally caches responses on disk,
 * remembers visited URLs, and replays registered handlers against each
 * response.
 *
 * Handlers are registered with the chainable `on*` methods and are executed
 * through the internal `PQueue`. Call `visit()` to schedule a URL,
 * `waitForAll()` to wait for the queue to drain and `close()` to release the
 * cache and URL stores.
 */
export class Crawler {
  /** HTTP client used for direct requests (`get`/`post`/`put`/`patch`). */
  http;
  /** Handlers replayed against every parsed HTML document: `{ handler, attr }`. */
  events = [];
  /** Handlers replayed against responses whose content type contains "/json". */
  jsonEvents = [];
  /** Handlers invoked when a visit fails. */
  errorEvents = [];
  /** Handlers invoked with the normalised response object. */
  responseEvents = [];
  /** Handlers invoked with the response payload converted to a Buffer. */
  rawResponseEvents = [];
  /** Plain callbacks forwarded to the leads finder. */
  emailDiscoveredEvents = [];
  /** Plain callbacks forwarded to the leads finder. */
  emailLeadsEvents = [];
  /** Response cache; stays `null` until `FileCacher.create` resolves. */
  cacher = null;
  /** Task queue that runs requests and handler invocations. */
  queue;
  /** Copy of `config.enableCache`. */
  isCacheEnabled;
  /** `CrawlerOptions` instance built from the constructor argument. */
  config;
  /** Visited-URL store; set once `UrlStore.create` resolves. */
  urlStorage;
  isStorageReady = false;
  isCacheReady = false;
  /** `Scraper` instance used for e-mail extraction. */
  leadsFinder;
  /** Error raised while opening the response cache, if any. */
  cacheInitError = null;
  /** Error raised while opening the URL store, if any. */
  storageInitError = null;

  /**
   * @param {object} crawlerOptions - Raw options handed to `CrawlerOptions`.
   * @param {object} http - HTTP client exposing `get`, `post`, `put`, `patch`.
   */
  constructor(crawlerOptions, http) {
    this.http = http;
    this.queue = new PQueue({
      concurrency: 1000
    });
    this.config = new CrawlerOptions(crawlerOptions);
    const enableCache = this.config.enableCache;
    this.isCacheEnabled = enableCache;
    const cacheDir = this.config.cacheDir;
    let urlStoreDir;
    if (enableCache) {
      const cacheTTL = this.config.cacheTTL;
      const withTrailingSlash = (dir) => dir.endsWith("/") ? dir : `${dir}/`;
      // Relative names that do not start with "./" or "/" are anchored at "./".
      let dbUrl = "./cache/";
      if (cacheDir) {
        const isAnchored = cacheDir.startsWith("./") || cacheDir.startsWith("/");
        dbUrl = isAnchored ? withTrailingSlash(cacheDir) : `./${withTrailingSlash(cacheDir)}`;
      }
      if (!fs.existsSync(path.dirname(dbUrl)))
        fs.mkdirSync(path.dirname(dbUrl), { recursive: true });
      FileCacher.create({
        cacheDir: dbUrl,
        softDelete: false,
        ttl: cacheTTL,
        encryptNamespace: true
      }).then((storage) => {
        this.cacher = storage;
        this.isCacheReady = true;
      }, (err) => {
        // Remember the failure so waitForCache() rejects instead of polling forever.
        this.cacheInitError = err;
      });
      // Fall back to the same default directory dbUrl uses when cacheDir is unset;
      // path.resolve() throws on an undefined segment.
      urlStoreDir = path.resolve(cacheDir || "./cache", "urls");
    } else {
      urlStoreDir = path.resolve(cacheDir || ".", "./cache/urls");
    }
    if (!fs.existsSync(urlStoreDir))
      fs.mkdirSync(urlStoreDir, { recursive: true });
    UrlStore.create({
      storeDir: urlStoreDir,
      dbFileName: ".url_cache.db",
      ttl: 1000 * 60 * 60 * 24 * 7 // one week, in milliseconds
    }).then((storage) => {
      this.urlStorage = storage;
      this.isStorageReady = true;
    }, (err) => {
      // Remember the failure so waitForStorage() rejects instead of polling forever.
      this.storageInitError = err;
    });
    this.leadsFinder = new Scraper(this.http, this.config, this._onEmailLeads.bind(this), this._onEmailDiscovered.bind(this), this.config.debug);
  }

  /**
   * Converts a response payload to a `Buffer` and passes it synchronously to
   * every handler registered through `onRawData`. Does nothing when no such
   * handler exists. Values that are neither binary, string nor object are
   * forwarded unchanged.
   *
   * @param {*} data - Response payload.
   */
  rawResponseHandler(data) {
    if (this.rawResponseEvents.length === 0)
      return;
    let payload = data;
    if (!(payload instanceof Buffer)) {
      if (payload instanceof ArrayBuffer) {
        payload = Buffer.from(new Uint8Array(payload));
      } else if (payload instanceof Uint8Array) {
        payload = Buffer.from(payload);
      } else if (typeof payload === "string") {
        payload = Buffer.from(payload, "utf8");
      } else if (typeof payload === "object") {
        payload = Buffer.from(JSON.stringify(payload), "utf8");
      }
    }
    this.rawResponseEvents.forEach((event) => {
      const handler = event.attr[0];
      handler(payload);
    });
  }

  /**
   * Resolves once the response cache is open.
   * @throws The error raised by `FileCacher.create`, if opening failed.
   */
  async waitForCache() {
    while (!this.isCacheReady) {
      if (this.cacheInitError)
        throw this.cacheInitError;
      await this.sleep(this.rnd(50, 200));
    }
  }

  /**
   * Resolves once the visited-URL store is open.
   * @throws The error raised by `UrlStore.create`, if opening failed.
   */
  async waitForStorage() {
    while (!this.isStorageReady) {
      if (this.storageInitError)
        throw this.storageInitError;
      await this.sleep(this.rnd(50, 200));
    }
  }

  /** Records `url` in the visited-URL store. */
  async saveUrl(url) {
    await this.waitForStorage();
    await this.urlStorage.set(url);
  }

  /** @returns {Promise<boolean>} Result of `urlStorage.has(url)`. */
  async hasUrlInCache(url) {
    await this.waitForStorage();
    return await this.urlStorage.has(url);
  }

  /**
   * Stores `value` under `url` in the response cache, namespaced by hostname.
   * Resolves to `undefined` without storing when caching is disabled.
   */
  async saveCache(url, value) {
    if (!this.isCacheEnabled)
      return;
    await this.waitForCache();
    return this.cacher.set(url, value, this.config.cacheTTL, this.getNamespace(url));
  }

  /**
   * @param {string} url
   * @returns {string|undefined} The URL's hostname, or `undefined` when `url`
   *   cannot be parsed.
   */
  getNamespace(url) {
    try {
      return new URL(url).hostname;
    } catch {
      return;
    }
  }

  /** @returns {Promise<boolean>} `false` when caching is disabled. */
  async hasCache(url) {
    if (!this.isCacheEnabled)
      return false;
    await this.waitForCache();
    return this.cacher.has(url, this.getNamespace(url));
  }

  /** @returns {Promise<*>} The cached entry, or `null` when caching is disabled. */
  async getCache(url) {
    if (!this.isCacheEnabled)
      return null;
    await this.waitForCache();
    return this.cacher.get(url, this.getNamespace(url));
  }

  /** @returns {Promise<void>} Resolves after `ms` milliseconds. */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** @returns {number} Random integer in the inclusive range `[min, max]`. */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /** Registers a handler called with the error of a failed visit. */
  onError(handler) {
    this.errorEvents.push({
      handler: "_onError",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with the data of JSON responses. */
  onJson(handler) {
    this.jsonEvents.push({
      handler: "_onJson",
      attr: [handler]
    });
    return this;
  }

  /** Registers a callback that is handed to the leads finder. */
  onEmailDiscovered(handler) {
    this.emailDiscoveredEvents.push(handler);
    return this;
  }

  /** Registers a callback that is handed to the leads finder. */
  onEmailLeads(handler) {
    this.emailLeadsEvents.push(handler);
    return this;
  }

  /** Registers a handler called with each response payload as a Buffer. */
  onRawData(handler) {
    this.rawResponseEvents.push({
      handler: "_onRawResponse",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with each parsed HTML document. */
  onDocument(handler) {
    this.events.push({
      handler: "_onDocument",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with `document.body` of each HTML page. */
  onBody(handler) {
    this.events.push({
      handler: "_onBody",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with every element of each HTML page. */
  onElement(handler) {
    this.events.push({
      handler: "_onElement",
      attr: [handler]
    });
    return this;
  }

  /**
   * Registers a handler called with anchor elements. `selection` may be
   * omitted (`onAnchor(handler)`), in which case the selector "a" is used.
   */
  onAnchor(selection, handler) {
    this.events.push({
      handler: "_onAnchor",
      attr: [selection, handler]
    });
    return this;
  }

  /** Registers a handler called with the absolute URL of each `a`/`link` href. */
  onHref(handler) {
    this.events.push({
      handler: "_onHref",
      attr: [handler]
    });
    return this;
  }

  /** Registers a handler called with every element matching `selection`. */
  onSelection(selection, handler) {
    this.events.push({
      handler: "_onSelection",
      attr: [selection, handler]
    });
    return this;
  }

  /** Registers a handler called with the normalised response object. */
  onResponse(handler) {
    this.responseEvents.push({
      handler: "_onResponse",
      attr: [handler]
    });
    return this;
  }

  /**
   * Registers a handler called with attribute values. Accepts either
   * `(selection, attribute, handler)` or `(attribute, handler)`.
   */
  onAttribute(selection, attribute, handler) {
    this.events.push({
      handler: "_onAttribute",
      attr: [selection, attribute, handler]
    });
    return this;
  }

  /** Registers a handler called with the text content of matching elements. */
  onText(selection, handler) {
    this.events.push({
      handler: "_onText",
      attr: [selection, handler]
    });
    return this;
  }

  _onBody(handler, document) {
    this.queue.add(() => handler(document.body));
  }

  /**
   * Queues `handler` with the value of `attribute` for every matching element
   * that carries it. In the two-argument registration form the arguments
   * arrive as `(attribute, handler, undefined, document)`; they are shifted
   * here and the selector defaults to `[attribute]`.
   */
  _onAttribute(selection, attribute, handler, document) {
    if (typeof attribute === "function") {
      handler = attribute;
      attribute = selection;
      selection = null;
    }
    selection = selection || `[${attribute}]`;
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      if (element.hasAttribute(attribute))
        this.queue.add(() => handler(element.getAttribute(attribute)));
    }
  }

  _onText(selection, handler, document) {
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      this.queue.add(() => handler(element.textContent));
    }
  }

  _onSelection(selection, handler, document) {
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      this.queue.add(() => handler(element));
    }
  }

  _onElement(handler, document) {
    const elements = document.querySelectorAll("*");
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      this.queue.add(() => handler(element));
    }
  }

  /**
   * Queues `handler` with the absolute form of every `href` found on `a` and
   * `link` elements, resolved against `document.URL`. Hrefs that cannot be
   * parsed as a URL are skipped.
   */
  _onHref(handler, document) {
    const elements = document.querySelectorAll("a, link");
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      if (!element.hasAttribute("href"))
        continue;
      let resolved;
      try {
        resolved = new URL(element.getAttribute("href"), document.URL).href;
      } catch {
        continue;
      }
      this.queue.add(() => handler(resolved));
    }
  }

  /**
   * Queues `handler` with every element matching `selection` (default "a").
   * When the element has an href and the document has a base URI, the href is
   * first rewritten to its absolute form; an href that cannot be parsed is
   * left untouched so one malformed link does not abort the whole page.
   */
  _onAnchor(selection, handler, document) {
    if (typeof selection === "function") {
      handler = selection;
      selection = "a";
    }
    const elements = document.querySelectorAll(selection);
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i];
      if (element?.href && document.baseURI) {
        try {
          element.href = new URL(element.getAttribute("href"), document.baseURI).href;
        } catch {
          // Keep the original href.
        }
      }
      this.queue.add(() => handler(element));
    }
  }

  _onDocument(handler, document) {
    this.queue.add(() => handler(document));
  }

  _onJson(handler, json) {
    this.queue.add(() => handler(json));
  }

  _onError(handler, error) {
    this.queue.add(() => handler(error));
  }

  async _onEmailDiscovered(handler, email) {
    await handler(email);
  }

  async _onEmailLeads(handler, emails) {
    await handler(emails);
  }

  _onRawResponse(handler, rawResponse) {
    this.queue.add(() => handler(rawResponse));
  }

  _onResponse(handler, response) {
    this.queue.add(() => handler(response));
  }

  /**
   * Appends `params` to `url` as query parameters (resolving `url` against
   * `config.baseUrl`). Entries whose value is `null` or `undefined` are
   * skipped. Without `params` the URL is returned unchanged.
   *
   * @param {string} url
   * @param {object} [params]
   * @returns {string}
   */
  buildUrl(url, params) {
    if (!params)
      return url;
    const target = new URL(url, this.config.baseUrl);
    for (const [key, value] of Object.entries(params)) {
      if (value === null || value === undefined)
        continue;
      target.searchParams.set(key, String(value));
    }
    return target.href;
  }

  /**
   * Schedules a visit of `url` and returns immediately.
   *
   * @param {string} url - Absolute URL, or one relative to `config.baseUrl`.
   * @param {object} [options] - Per-visit overrides: `method` ("GET"),
   *   `headers`, `forceRevisit`, `body`, `timeout`, `maxRedirects`,
   *   `useProxy`, `extractLeads`, `params`, `rejectUnauthorized`, `useQueue`,
   *   `deepEmailFinder`, `useOxylabsScraperAi`, `useOxylabsRotation`,
   *   `useDecodo`. Unset values fall back to the crawler configuration or to
   *   the defaults in the destructuring below.
   * @returns {Crawler} This crawler, for chaining.
   */
  visit(url, options) {
    if (this.config.baseUrl)
      url = new URL(url, this.config.baseUrl).href;
    if (options?.params && (options.useOxylabsScraperAi || this.config.hasDomain(url, "oxylabs"))) {
      url = this.buildUrl(url, options.params);
    }
    const {
      method = "GET",
      headers = new Headers,
      forceRevisit = this.config.forceRevisit,
      body = "",
      timeout = this.config.timeout,
      maxRedirects = this.config.maxRedirects,
      useProxy = this.config.hasDomain(url, "proxies", options?.useProxy),
      extractLeads = false,
      params,
      rejectUnauthorized,
      useQueue = false,
      deepEmailFinder = false,
      useOxylabsScraperAi = false,
      useOxylabsRotation = true,
      useDecodo = false
    } = options || {};
    const _options = {
      headers: this.config.pickHeaders(url, true, headers, true),
      timeout,
      maxRedirects,
      params,
      proxy: useProxy ? this.config.getAdapter(url, "proxies", true, true) || undefined : undefined,
      rejectUnauthorized: typeof rejectUnauthorized === "boolean" ? rejectUnauthorized : this.config.rejectUnauthorized,
      pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined
    };
    // Both scraping addons receive the same option shape.
    const buildAddonOptions = () => ({
      method: method === "POST" ? "post" : "get",
      headers: this.config.pickHeaders(url, true, headers, true),
      pqueue: this.config.getAdapter(url, "limiters", useQueue, useQueue) || undefined,
      base64Body: typeof body === "string" ? Buffer.from(body).toString("base64") : undefined
    });
    let oxylabsOptions = {};
    let oxylabsInstanse = undefined;
    if (useOxylabsScraperAi && this.config.hasDomain(url, "oxylabs")) {
      oxylabsOptions = buildAddonOptions();
      oxylabsInstanse = this.config.getAdapter(url, "oxylabs", false, useOxylabsRotation) || undefined;
    }
    let decodoOptions = {};
    let decodoInstanse = undefined;
    if (useDecodo && this.config.hasDomain(url, "decodo")) {
      decodoOptions = buildAddonOptions();
      // NOTE(review): the Decodo adapter is selected with `useOxylabsRotation`;
      // there is no separate Decodo rotation option — confirm this is intended.
      decodoInstanse = this.config.getAdapter(url, "decodo", false, useOxylabsRotation) || undefined;
    }
    if (deepEmailFinder) {
      this.execute2(method, url, body, _options, forceRevisit);
      return this;
    }
    this.execute(method, url, body, _options, extractLeads, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions);
    return this;
  }

  /** Queues `executeHttp` with the given arguments; does not wait for it. */
  async execute(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions) {
    this.queue.add(() => this.executeHttp(method, url, body, options, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions));
  }

  /** Queues `leadsFinder.parseExternalWebsite` for `url`; does not wait for it. */
  async execute2(method, url, body, options = {}, forceRevisit) {
    this.queue.add(() => this.leadsFinder.parseExternalWebsite(url, method, body, {
      httpConfig: options,
      saveCache: this.saveCache.bind(this),
      saveUrl: this.saveUrl.bind(this),
      getCache: this.getCache.bind(this),
      hasUrlInCache: this.hasUrlInCache.bind(this),
      onEmailDiscovered: this.emailDiscoveredEvents,
      onEmails: this.emailLeadsEvents,
      queue: this.queue,
      depth: 1,
      allowCrossDomainTravel: true
    }, forceRevisit, true));
  }

  /**
   * Performs one visit: obtains the response (cache, Oxylabs, Decodo or the
   * HTTP client, in that order), stores it, and dispatches the registered
   * JSON, response, raw-data and document handlers.
   *
   * A URL already recorded as visited is skipped unless a cached response
   * exists and the method is "GET". On failure the request may be retried
   * according to the `retry*` configuration values; otherwise the error is
   * rethrown when `config.throwFatalError` is set, or passed to the error
   * handlers.
   *
   * @param {number} [retryCount=0] - Number of retries already performed.
   */
  async executeHttp(method, url, body, options = {}, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount = 0) {
    try {
      if (this.config.debug) {
        console.log({
          oxylabsOptions: typeof oxylabsOptions,
          oxylabsInstanse: typeof oxylabsInstanse,
          decodoInstanse: typeof decodoInstanse,
          decodoOptions: typeof decodoOptions
        });
      }
      const isVisited = forceRevisit ? false : await this.hasUrlInCache(url);
      const cache = await this.getCache(url);
      if (isVisited && !cache)
        return;
      if (isVisited && method !== "GET")
        return;
      let response;
      if (cache && method === "GET") {
        response = cache;
      } else if (oxylabsInstanse && oxylabsOptions) {
        response = await oxylabsInstanse.scrape(url);
      } else if (decodoInstanse && decodoOptions) {
        response = await decodoInstanse.scrape(url);
      } else if (method === "GET") {
        response = await this.http.get(url, options);
      } else if (method === "PATCH") {
        response = await this.http.patch(url, body, options);
      } else if (method === "POST") {
        response = await this.http.post(url, body, options);
      } else {
        response = await this.http.put(url, body, options);
      }
      // Normalise the differing field names of the possible response sources.
      const res = {
        data: response.data || response.content || "",
        contentType: response.contentType || "",
        finalUrl: response.finalUrl || response.url || url,
        url: response?.urls?.[0] || response.url || this.buildUrl(url, options.params),
        headers: response.headers || {},
        status: response.status || response.statusCode || 200,
        statusText: response.statusText || "",
        cookies: response?.cookies?.serialized || response?.cookies,
        contentLength: response.contentLength || 0
      };
      if (!cache)
        await this.saveCache(url, res);
      if (!isVisited)
        await this.saveUrl(url);
      const hasEmailHandlers = this.emailDiscoveredEvents.length > 0 || this.emailLeadsEvents.length > 0;
      if (res.contentType && res.contentType.includes("/json")) {
        if (hasEmailHandlers) {
          this.leadsFinder.extractEmails(JSON.stringify(res.data), res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
        }
        for (const event of this.jsonEvents) {
          this[event.handler](...event.attr, res.data);
        }
      }
      for (const event of this.responseEvents) {
        this[event.handler](...event.attr, res);
      }
      this.rawResponseHandler(res.data);
      // Document handlers only apply to textual HTML responses.
      if (!res.contentType || !res.contentType.includes("/html") || typeof res.data !== "string")
        return;
      if (hasEmailHandlers && isEmail) {
        this.leadsFinder.extractEmails(res.data, res.finalUrl, this.emailDiscoveredEvents, this.emailLeadsEvents, this.queue);
      }
      const { document } = parseHTML(res.data.addBaseUrl(res.finalUrl));
      document.URL = res.finalUrl;
      for (const event of this.events) {
        this[event.handler](...event.attr, document);
      }
    } catch (e) {
      const error = e;
      // NOTE(review): retries are only considered for errors that carry a
      // `response`, and every branch below also requires `options.proxy` —
      // confirm that status-code retries are meant to be proxy-only.
      if (error && error.response) {
        const status = error.response.status;
        // `??` keeps an explicit 0 (no delay / no retries) instead of replacing it.
        const retryDelay = this.config.retryDelay ?? 1000;
        const maxRetryAttempts = this.config.maxRetryAttempts ?? 3;
        const maxRetryOnProxyError = this.config.maxRetryOnProxyError ?? 3;
        const retryWithoutProxyOnStatusCode = this.config.retryWithoutProxyOnStatusCode || undefined;
        const retryOnStatusCode = this.config.retryOnStatusCode || undefined;
        const retryOnProxyError = this.config.retryOnProxyError || undefined;
        const retry = async (nextOptions) => {
          await this.sleep(retryDelay);
          return this.executeHttp(method, url, body, nextOptions, isEmail, forceRevisit, oxylabsOptions, oxylabsInstanse, decodoInstanse, decodoOptions, retryCount + 1);
        };
        if (retryWithoutProxyOnStatusCode && options.proxy && retryWithoutProxyOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
          // Retry on a copy without the proxy so the caller's object is not mutated.
          const { proxy: _droppedProxy, ...withoutProxy } = options;
          return await retry(withoutProxy);
        } else if (retryOnStatusCode && options.proxy && retryOnStatusCode.includes(status) && retryCount < maxRetryAttempts) {
          return await retry(options);
        } else if (retryOnProxyError && options.proxy && retryCount < maxRetryOnProxyError) {
          return await retry(options);
        }
      }
      if (this.config.throwFatalError)
        throw e;
      if (this.config.debug) {
        console.log(`Error visiting ${url}: ${e.message}`);
      }
      // Print the full error in debug mode, or when no error handler would
      // otherwise observe it.
      if (this.config.debug || this.errorEvents.length === 0) {
        console.log(error);
      }
      for (const event of this.errorEvents) {
        this[event.handler](...event.attr, e);
      }
    }
  }

  /** Resolves when the task queue is empty and all tasks have finished. */
  async waitForAll() {
    await this.queue.onIdle();
  }

  /**
   * Closes the response cache and the URL store, when they were opened.
   * Failures while closing are not rethrown; they are logged in debug mode.
   */
  async close() {
    for (const store of [this.cacher, this.urlStorage]) {
      if (!store || typeof store.close !== "function")
        continue;
      try {
        await store.close();
      } catch (err) {
        if (this.config?.debug) {
          console.log(`Error closing crawler storage: ${err?.message}`);
        }
      }
    }
  }
}
@@ -0,0 +1,36 @@
1
// CommonJS entry point of the plugin package: re-exports the crawler, its
// options class, the cache stores and the Oxylabs / Decodo addons.

const crawlerModule = require('./crawler.cjs');
exports.Crawler = crawlerModule.Crawler;

const crawlerOptionsModule = require('./crawler-options.cjs');
exports.CrawlerOptions = crawlerOptionsModule.CrawlerOptions;

const fileCacherModule = require('../cache/file-cacher.cjs');
exports.FileCacher = fileCacherModule.FileCacher;

const urlStoreModule = require('../cache/url-store.cjs');
exports.UrlStore = urlStoreModule.UrlStore;

// Oxylabs addon. The generic helper names are re-exported with an "Oxylabs"
// infix so they do not collide with the Decodo helpers of the same name.
const oxylabsModule = require('./addon/oxylabs/index.cjs');
exports.Oxylabs = oxylabsModule.Oxylabs;

const oxylabsOptionsModule = require('./addon/oxylabs/options.cjs');
exports.OXYLABS_BROWSER_TYPES = oxylabsOptionsModule.OXYLABS_BROWSER_TYPES;
exports.OXYLABS_COMMON_LOCALES = oxylabsOptionsModule.OXYLABS_COMMON_LOCALES;
exports.OXYLABS_COMMON_GEO_LOCATIONS = oxylabsOptionsModule.OXYLABS_COMMON_GEO_LOCATIONS;
exports.OXYLABS_US_STATES = oxylabsOptionsModule.OXYLABS_US_STATES;
exports.OXYLABS_EUROPEAN_COUNTRIES = oxylabsOptionsModule.OXYLABS_EUROPEAN_COUNTRIES;
exports.OXYLABS_ASIAN_COUNTRIES = oxylabsOptionsModule.OXYLABS_ASIAN_COUNTRIES;
exports.getRandomOxylabsBrowserType = oxylabsOptionsModule.getRandomBrowserType;
exports.getRandomOxylabsLocale = oxylabsOptionsModule.getRandomLocale;
exports.getRandomOxylabsGeoLocation = oxylabsOptionsModule.getRandomGeoLocation;

// Decodo addon, re-exported with a "Decodo" infix on the helper names.
const decodoModule = require('./addon/decodo/index.cjs');
exports.Decodo = decodoModule.Decodo;

const decodoOptionsModule = require('./addon/decodo/options.cjs');
exports.DECODO_DEVICE_TYPES = decodoOptionsModule.DECODO_DEVICE_TYPES;
exports.DECODO_HEADLESS_MODES = decodoOptionsModule.DECODO_HEADLESS_MODES;
exports.DECODO_COMMON_LOCALES = decodoOptionsModule.DECODO_COMMON_LOCALES;
exports.DECODO_COMMON_COUNTRIES = decodoOptionsModule.DECODO_COMMON_COUNTRIES;
exports.DECODO_EUROPEAN_COUNTRIES = decodoOptionsModule.DECODO_EUROPEAN_COUNTRIES;
exports.DECODO_ASIAN_COUNTRIES = decodoOptionsModule.DECODO_ASIAN_COUNTRIES;
exports.DECODO_US_STATES = decodoOptionsModule.DECODO_US_STATES;
exports.DECODO_COMMON_CITIES = decodoOptionsModule.DECODO_COMMON_CITIES;
exports.getRandomDecodoDeviceType = decodoOptionsModule.getRandomDeviceType;
exports.getRandomDecodoLocale = decodoOptionsModule.getRandomLocale;
exports.getRandomDecodoCountry = decodoOptionsModule.getRandomCountry;
exports.getRandomDecodoCity = decodoOptionsModule.getRandomCity;
exports.generateDecodoSessionId = decodoOptionsModule.generateSessionId;
@@ -0,0 +1,32 @@
1
// ES module entry point of the plugin package.

// Crawler core and the stores it relies on.
export { Crawler } from './crawler.js';
export { CrawlerOptions } from './crawler-options.js';
export { FileCacher } from '../cache/file-cacher.js';
export { UrlStore } from '../cache/url-store.js';

// Oxylabs addon: constants keep their names, the generic helpers are aliased
// with an "Oxylabs" infix.
export { Oxylabs } from './addon/oxylabs/index.js';
export {
  OXYLABS_BROWSER_TYPES,
  OXYLABS_COMMON_LOCALES,
  OXYLABS_COMMON_GEO_LOCATIONS,
  OXYLABS_US_STATES,
  OXYLABS_EUROPEAN_COUNTRIES,
  OXYLABS_ASIAN_COUNTRIES
} from './addon/oxylabs/options.js';
export {
  getRandomBrowserType as getRandomOxylabsBrowserType,
  getRandomLocale as getRandomOxylabsLocale,
  getRandomGeoLocation as getRandomOxylabsGeoLocation
} from './addon/oxylabs/options.js';

// Decodo addon: constants keep their names, the generic helpers are aliased
// with a "Decodo" infix.
export { Decodo } from './addon/decodo/index.js';
export {
  DECODO_DEVICE_TYPES,
  DECODO_HEADLESS_MODES,
  DECODO_COMMON_LOCALES,
  DECODO_COMMON_COUNTRIES,
  DECODO_EUROPEAN_COUNTRIES,
  DECODO_ASIAN_COUNTRIES,
  DECODO_US_STATES,
  DECODO_COMMON_CITIES
} from './addon/decodo/options.js';
export {
  getRandomDeviceType as getRandomDecodoDeviceType,
  getRandomLocale as getRandomDecodoLocale,
  getRandomCountry as getRandomDecodoCountry,
  getRandomCity as getRandomDecodoCity,
  generateSessionId as generateDecodoSessionId
} from './addon/decodo/options.js';