rezo 1.0.74 → 1.0.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +4 -1
- package/dist/adapters/entries/fetch.d.ts +4 -1
- package/dist/adapters/entries/http.d.ts +4 -1
- package/dist/adapters/entries/http2.d.ts +4 -1
- package/dist/adapters/entries/react-native.d.ts +4 -1
- package/dist/adapters/entries/xhr.d.ts +4 -1
- package/dist/adapters/http.cjs +2 -1
- package/dist/adapters/http.js +2 -1
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +72 -1
- package/dist/crawler/crawler.js +72 -1
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler.d.ts +101 -0
- package/dist/entries/crawler.cjs +4 -4
- package/dist/index.cjs +30 -30
- package/dist/index.d.ts +4 -1
- package/dist/internal/agents/bun-socks-http.cjs +573 -0
- package/dist/internal/agents/bun-socks-http.js +570 -0
- package/dist/internal/agents/index.cjs +14 -10
- package/dist/internal/agents/index.js +1 -0
- package/dist/platform/browser.d.ts +4 -1
- package/dist/platform/bun.d.ts +4 -1
- package/dist/platform/deno.d.ts +4 -1
- package/dist/platform/node.d.ts +4 -1
- package/dist/platform/react-native.d.ts +4 -1
- package/dist/platform/worker.d.ts +4 -1
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +3 -0
- package/dist/queue/queue.js +3 -0
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/index.cjs +49 -49
- package/dist/wget/index.d.ts +3 -0
- package/package.json +1 -1
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
/**
|
|
4584
4587
|
* cURL Options Configuration
|
|
4585
4588
|
*
|
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
4584
4587
|
export declare const Cancel: typeof RezoError;
|
|
4585
4588
|
export declare const CancelToken: {
|
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
/**
|
|
4584
4587
|
* Type guard to check if an error is a RezoError instance.
|
|
4585
4588
|
*/
|
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
4584
4587
|
export declare const Cancel: typeof RezoError;
|
|
4585
4588
|
export declare const CancelToken: {
|
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
4584
4587
|
export declare const Cancel: typeof RezoError;
|
|
4585
4588
|
export declare const CancelToken: {
|
|
@@ -1997,6 +1997,8 @@ export declare class RezoError<T = any> extends Error {
|
|
|
1997
1997
|
* Queue configuration options
|
|
1998
1998
|
*/
|
|
1999
1999
|
export interface QueueConfig {
|
|
2000
|
+
/** Name of the queue - useful for debugging and logging */
|
|
2001
|
+
name?: string;
|
|
2000
2002
|
/** Maximum concurrent tasks (default: Infinity) */
|
|
2001
2003
|
concurrency?: number;
|
|
2002
2004
|
/** Auto-start processing when tasks are added (default: true) */
|
|
@@ -2127,6 +2129,7 @@ declare class RezoQueue<T = any> {
|
|
|
2127
2129
|
private isPausedFlag;
|
|
2128
2130
|
private intervalId?;
|
|
2129
2131
|
private intervalCount;
|
|
2132
|
+
readonly name: string;
|
|
2130
2133
|
private intervalStart;
|
|
2131
2134
|
private eventHandlers;
|
|
2132
2135
|
private statsData;
|
|
@@ -4579,7 +4582,7 @@ export interface RezoInstance extends Rezo {
|
|
|
4579
4582
|
*
|
|
4580
4583
|
* IMPORTANT: Update these values when bumping package version.
|
|
4581
4584
|
*/
|
|
4582
|
-
export declare const VERSION = "1.0.74";
|
|
4585
|
+
export declare const VERSION = "1.0.76";
|
|
4583
4586
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
4584
4587
|
export declare const Cancel: typeof RezoError;
|
|
4585
4588
|
export declare const CancelToken: {
|
package/dist/adapters/http.cjs
CHANGED
|
@@ -24,6 +24,7 @@ const { StagedTimeoutManager, parseStagedTimeouts } = require('../utils/staged-t
|
|
|
24
24
|
const { handleRateLimitWait, shouldWaitOnStatus } = require('../utils/rate-limit-wait.cjs');
|
|
25
25
|
const { getSocketTelemetry, beginRequestContext } = require('../utils/socket-telemetry.cjs');
|
|
26
26
|
const dns = require("node:dns");
|
|
27
|
+
const { bunHttp, isBunRuntime, isBunSocksRequest } = require('../internal/agents/bun-socks-http.cjs');
|
|
27
28
|
const debugLog = {
|
|
28
29
|
requestStart: (config, url, method) => {
|
|
29
30
|
if (config.debug) {
|
|
@@ -661,7 +662,7 @@ async function request(config, fetchOptions, requestCount, timing, _stats, respo
|
|
|
661
662
|
const { fullUrl, body, fileName: filename } = fetchOptions;
|
|
662
663
|
const url = new URL(fullUrl || fetchOptions.url);
|
|
663
664
|
const isSecure = url.protocol === "https:";
|
|
664
|
-
const httpModule = isSecure ? config.isSecure && config.adapter ? config.adapter : https : !config.isSecure && config.adapter ? config.adapter : http;
|
|
665
|
+
const httpModule = isBunRuntime() && isBunSocksRequest(fetchOptions.proxy) ? bunHttp : isSecure ? config.isSecure && config.adapter ? config.adapter : https : !config.isSecure && config.adapter ? config.adapter : http;
|
|
665
666
|
await setInitialConfig(config, fetchOptions, isSecure, url, httpModule, requestCount, timing.startTime, timing.startTimestamp);
|
|
666
667
|
const eventEmitter = streamResult || downloadResult || uploadResult;
|
|
667
668
|
if (eventEmitter && requestCount === 0) {
|
package/dist/adapters/http.js
CHANGED
|
@@ -24,6 +24,7 @@ import { StagedTimeoutManager, parseStagedTimeouts } from '../utils/staged-timeo
|
|
|
24
24
|
import { handleRateLimitWait, shouldWaitOnStatus } from '../utils/rate-limit-wait.js';
|
|
25
25
|
import { getSocketTelemetry, beginRequestContext } from '../utils/socket-telemetry.js';
|
|
26
26
|
import dns from "node:dns";
|
|
27
|
+
import { bunHttp, isBunRuntime, isBunSocksRequest } from '../internal/agents/bun-socks-http.js';
|
|
27
28
|
const debugLog = {
|
|
28
29
|
requestStart: (config, url, method) => {
|
|
29
30
|
if (config.debug) {
|
|
@@ -661,7 +662,7 @@ async function request(config, fetchOptions, requestCount, timing, _stats, respo
|
|
|
661
662
|
const { fullUrl, body, fileName: filename } = fetchOptions;
|
|
662
663
|
const url = new URL(fullUrl || fetchOptions.url);
|
|
663
664
|
const isSecure = url.protocol === "https:";
|
|
664
|
-
const httpModule = isSecure ? config.isSecure && config.adapter ? config.adapter : https : !config.isSecure && config.adapter ? config.adapter : http;
|
|
665
|
+
const httpModule = isBunRuntime() && isBunSocksRequest(fetchOptions.proxy) ? bunHttp : isSecure ? config.isSecure && config.adapter ? config.adapter : https : !config.isSecure && config.adapter ? config.adapter : http;
|
|
665
666
|
await setInitialConfig(config, fetchOptions, isSecure, url, httpModule, requestCount, timing.startTime, timing.startTimestamp);
|
|
666
667
|
const eventEmitter = streamResult || downloadResult || uploadResult;
|
|
667
668
|
if (eventEmitter && requestCount === 0) {
|
package/dist/adapters/index.cjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.detectRuntime =
|
|
3
|
-
exports.getAdapterCapabilities =
|
|
4
|
-
exports.buildAdapterContext =
|
|
5
|
-
exports.getAvailableAdapters =
|
|
6
|
-
exports.selectAdapter =
|
|
1
|
+
const _mod_p2iotn = require('./picker.cjs');
|
|
2
|
+
exports.detectRuntime = _mod_p2iotn.detectRuntime;
|
|
3
|
+
exports.getAdapterCapabilities = _mod_p2iotn.getAdapterCapabilities;
|
|
4
|
+
exports.buildAdapterContext = _mod_p2iotn.buildAdapterContext;
|
|
5
|
+
exports.getAvailableAdapters = _mod_p2iotn.getAvailableAdapters;
|
|
6
|
+
exports.selectAdapter = _mod_p2iotn.selectAdapter;;
|
package/dist/cache/index.cjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.LRUCache =
|
|
3
|
-
const
|
|
4
|
-
exports.DNSCache =
|
|
5
|
-
exports.getGlobalDNSCache =
|
|
6
|
-
exports.resetGlobalDNSCache =
|
|
7
|
-
const
|
|
8
|
-
exports.ResponseCache =
|
|
9
|
-
exports.normalizeResponseCacheConfig =
|
|
1
|
+
const _mod_zvxqrg = require('./lru-cache.cjs');
|
|
2
|
+
exports.LRUCache = _mod_zvxqrg.LRUCache;;
|
|
3
|
+
const _mod_u7bdfq = require('./dns-cache.cjs');
|
|
4
|
+
exports.DNSCache = _mod_u7bdfq.DNSCache;
|
|
5
|
+
exports.getGlobalDNSCache = _mod_u7bdfq.getGlobalDNSCache;
|
|
6
|
+
exports.resetGlobalDNSCache = _mod_u7bdfq.resetGlobalDNSCache;;
|
|
7
|
+
const _mod_cjcq8q = require('./response-cache.cjs');
|
|
8
|
+
exports.ResponseCache = _mod_cjcq8q.ResponseCache;
|
|
9
|
+
exports.normalizeResponseCacheConfig = _mod_cjcq8q.normalizeResponseCacheConfig;;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var{RezoQueue:m}=require("../queue/queue.cjs"),{Oxylabs:x}=require("./addon/oxylabs/index.cjs"),b=require("node:path"),g=require("node:os"),{Decodo:y}=require("./addon/decodo/index.cjs");class f{baseUrl;adapter;enableNavigationHistory;sessionId;rejectUnauthorized;userAgent;useRndUserAgent;timeout;maxRedirects;maxRetryAttempts;retryDelay;retryOnStatusCode;forceRevisit;retryWithoutProxyOnStatusCode;retryOnProxyError;maxRetryOnProxyError;allowRevisiting;enableCache;cacheTTL;cacheDir;throwFatalError;debug;enableSignalHandlers;concurrency;scraperConcurrency;maxDepth;maxUrls;maxResponseSize;respectRobotsTxt;followNofollow;autoThrottle;autoThrottleTargetDelay;autoThrottleMinDelay;autoThrottleMaxDelay;maxWaitOn429;alwaysWaitOn429;oxylabs=[];decodo=[];proxies=[];limiters=[];requestHeaders=[];userAgents=A();constructor(e={}){this.baseUrl=e.baseUrl||"",this.adapter=e.adapter??"http",this.enableNavigationHistory=e.enableNavigationHistory??!1,this.sessionId=e.sessionId??`session_${Date.now()}_${Math.random().toString(36).slice(2,8)}`,this.rejectUnauthorized=e.rejectUnauthorized??!0,this.userAgent=e.userAgent,this.useRndUserAgent=e.useRndUserAgent??!1,this.timeout=e.timeout??30000,this.maxRedirects=e.maxRedirects??10,this.maxRetryAttempts=e.maxRetryAttempts??3,this.retryDelay=e.retryDelay??0,this.retryOnStatusCode=e.retryOnStatusCode??[408,429,500,502,503,504],this.forceRevisit=e.forceRevisit??!1,this.retryWithoutProxyOnStatusCode=e.retryWithoutProxyOnStatusCode??[407,403],this.retryOnProxyError=e.retryOnProxyError??!0,this.maxRetryOnProxyError=e.maxRetryOnProxyError??3,this.allowRevisiting=e.allowRevisiting??!1,this.enableCache=e.enableCache??!0,this.cacheTTL=e.cacheTTL??604800000,this.cacheDir=e.cacheDir??b.join(g.tmpdir(),"uiniqhtt_cache"),this.throwFatalError=e.throwFatalError??!1,this.debug=e.debug??!1,this.enableSignalHandlers=e.enableSignalHandlers??!1,this.concurrency=e.concurrency??100,this.scraperConcurrency=e.scraperConcurrency??this.concurrency,this.maxDepth=e.maxDept
h??0,this.maxUrls=e.maxUrls??0,this.maxResponseSize=e.maxResponseSize??0,this.respectRobotsTxt=e.respectRobotsTxt??!1,this.followNofollow=e.followNofollow??!1,this.autoThrottle=e.autoThrottle??!0,this.autoThrottleTargetDelay=e.autoThrottleTargetDelay??1000,this.autoThrottleMinDelay=e.autoThrottleMinDelay??100,this.autoThrottleMaxDelay=e.autoThrottleMaxDelay??60000,this.maxWaitOn429=e.maxWaitOn429??1800000,this.alwaysWaitOn429=e.alwaysWaitOn429??!1,this._addHeaders(e.headers),this._addOxylabs(e.oxylabs),this._addDecodo(e.decodo),this._addProxies(e.proxy),this._addLimiters(e.limiter)}getConfiguredDomains(e){return(e==="headers"?this.requestHeaders:e==="limiters"?this.limiters:e==="oxylabs"?this.oxylabs:this.proxies).filter((i)=>i.domain).map((i)=>i.domain).filter((i,a,r)=>r.indexOf(i)===a)}removeDomain(e){return this.requestHeaders=this.requestHeaders.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.proxies=this.proxies.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.limiters=this.limiters.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.oxylabs=this.oxylabs.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this}_domainsEqual(e,t){if(Array.isArray(e)&&Array.isArray(t))return e.length===t.length&&e.every((i,a)=>i===t[a]);return e===t}getConfigurationSummary(){let e=(t)=>({total:t.length,global:t.filter((i)=>i.isGlobal).length,domainSpecific:t.filter((i)=>!i.isGlobal&&i.domain).length});return{headers:e(this.requestHeaders),proxies:e(this.proxies),limiters:e(this.limiters),oxylabs:e(this.oxylabs)}}_addHeaders(e){if(!e||!e.enable)return;for(let t of e.httpHeaders){let{domain:i,isGlobal:a,headers:r}=t;if(!i&&!a)continue;if(r instanceof Headers){let s=Object.fromEntries(r.entries());if(Object.keys(s).length<1)continue;r=s}else if(!r||Object.keys(r).length<1)continue;this.requestHeaders.push({domain:i,isGlobal:a,headers:r})}}_addProxies(e){if(!e||!e.enable)return;for(let t of 
e.proxies){let{domain:i,isGlobal:a,proxy:r}=t;if(!i&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.proxies.push({domain:i,isGlobal:a,proxy:r})}}_addLimiters(e){if(!e||!e.enable)return;for(let t of e.limiters){let{domain:i,isGlobal:a,options:r}=t;if(!i&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.limiters.push({domain:i,isGlobal:a,pqueue:new m(r)})}}_addOxylabs(e){if(!e||!e.enable)return;for(let t of e.labs){let{domain:i,isGlobal:a,options:r,queueOptions:s}=t;if(!i&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.oxylabs.push({domain:i,isGlobal:a,adaptar:new x(r)})}}_addDecodo(e){if(!e||!e.enable)return;for(let t of e.labs){let{domain:i,isGlobal:a,options:r,queueOptions:s}=t;if(!i&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.decodo.push({domain:i,isGlobal:a,adaptar:new y(r)})}}addHeaders(e){return this._addHeaders({enable:!0,httpHeaders:[e]}),this}addProxy(e){return this._addProxies({enable:!0,proxies:[e]}),this}addLimiter(e){return this._addLimiters({enable:!0,limiters:[e]}),this}addOxylabs(e){return this._addOxylabs({enable:!0,labs:[e]}),this}addDecodo(e){return this._addDecodo({enable:!0,labs:[e]}),this}destroyLimiters(){for(let e of this.limiters)if(e.pqueue&&typeof e.pqueue.destroy==="function")e.pqueue.destroy();this.limiters=[]}clearGlobalConfigs(){if(Array.isArray(this.requestHeaders))this.requestHeaders=this.requestHeaders.filter((e)=>!e.isGlobal);if(Array.isArray(this.oxylabs))this.oxylabs=this.oxylabs.filter((e)=>!e.isGlobal);if(Array.isArray(this.limiters))this.limiters=this.limiters.filter((e)=>!e.isGlobal);if(Array.isArray(this.proxies))this.proxies=this.proxies.filter((e)=>!e.isGlobal);return this}getAdapter(e,t,i,a){if(!this.getDomainName(e))return null;let s=[],o=t==="headers"?this.requestHeaders:t==="limiters"?this.limiters:t==="oxylabs"?this.oxylabs:t==="decodo"?this.decodo:this.proxies;for(let n=0;n<o.length;n++)if(this._hasDomain(e,o[n].domain))s.push(n);if(s.length){let 
n=a?s[this.rnd(0,s.length-1)]:s[0];return t==="headers"?this.requestHeaders[n].headers:t==="limiters"?this.limiters[n].pqueue:t==="oxylabs"?this.oxylabs[n].adaptar:t==="decodo"?this.decodo[n].adaptar:this.proxies[n].proxy}s.length=0;for(let n=0;n<o.length;n++)s.push(n);if(s.length){let n=a?s[this.rnd(0,s.length-1)]:s[0];if(o[n].isGlobal&&i)return t==="headers"?this.requestHeaders[n].headers:t==="limiters"?this.limiters[n].pqueue:t==="oxylabs"?this.oxylabs[n].adaptar:t==="decodo"?this.decodo[n].adaptar:this.proxies[n].proxy}return null}rnd(e=0,t=Number.MAX_VALUE){return Math.floor(Math.random()*(t-e+1))+e}hasDomain(e,t,i){if(!this.getDomainName(e))return!1;let r=t==="headers"?this.requestHeaders:t==="limiters"?this.limiters:t==="oxylabs"?this.oxylabs:t==="decodo"?this.decodo:this.proxies;for(let s=0;s<r.length;s++)if(this._hasDomain(e,r[s].domain))return!0;if(i){for(let s=0;s<r.length;s++)if(r[s].isGlobal)return!0}return!1}pickHeaders(e,t,i,a){let r=this.getAdapter(e,"headers",t),s=new Headers(r??{}),o=s.count;if(i&&i instanceof Headers)for(let[n,l]of Object.entries(i.entries()))s.set(n,l);else if(i&&typeof i==="object"){for(let[n,l]of Object.entries(i))if(typeof l==="string")s.set(n,l)}if(a&&o===0)s.set("user-agent",this.getRandomUserAgent());return Object.fromEntries(s.entries())}_hasDomain(e,t){if(!t)return!1;let i=this.getDomainName(e);if(!i)return!1;let a=(s)=>{return/[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(s)||s.startsWith("/")||s.includes(".*")||s.includes(".+")},r=(s)=>{if(s instanceof RegExp)return s.test(i)||s.test(e);let o=s.toString().trim();if(i.toLowerCase()===o.toLowerCase())return!0;if(o.includes("*")){let h=o.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\\*/g,".*"),c=new RegExp(`^${h}$`,"i");return c.test(i)||c.test(e)}if(a(o))try{let h=o,c="i",u=o.match(/^\/(.*)\/(\w*)$/);if(u)h=u[1],c=u[2]||"i";let d=new RegExp(h,c);return d.test(i)||d.test(e)}catch(h){return i.toLowerCase().includes(o.toLowerCase())}let n=i.toLowerCase(),l=o.toLowerCase();return 
n===l||n.endsWith("."+l)||l.endsWith("."+n)};if(Array.isArray(t)){for(let s of t)if(r(s))return!0;return!1}return r(t)}getDomainName(e){if(this.isValidUrl(e))return new URL(e).hostname;else if(this.isHostName(e))return e;return null}isHostName(e){if(!e)return!1;if(e.length>255)return!1;let t=/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+ [a-zA-Z]{2,})$/;return e=e.trim().toLowerCase(),t.test(e)&&!e.startsWith("-")&&!e.endsWith("-")}isValidUrl(e){if(!e)return!1;e=e.trim();try{let t=new URL(e);if(!t.protocol||!["http:","https:"].includes(t.protocol.toLowerCase()))return!1;if(!t.hostname)return!1;if(!/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/.test(t.hostname))return!1;return!0}catch{return!1}}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function A(){let e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],t=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],i=[];for(let a=0;a<200;a++){let r=e[Math.floor(Math.random()*e.length)],s=t[Math.floor(Math.random()*t.length)],o="";switch(r.name){case"Chrome":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36`;break;case"Firefox":o=`Mozilla/5.0 (${s}; rv:${r.version}) ${r.engine} 
Firefox/${r.version}`;break;case"Safari":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Version/${r.version} Safari/605.1.15`;break;case"Edge":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Edg/${r.version}`;break;case"Opera":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 OPR/${r.version}`;break;case"Vivaldi":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Vivaldi/${r.version}`;break;case"Brave":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Brave/${r.version}`;break;case"Chromium":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chromium/${r.version} Chrome/${r.version} Safari/537.36`;break;case"Yandex":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} YaBrowser/${r.version} Safari/537.36`;break;case"Maxthon":o=`Mozilla/5.0 (${s}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Maxthon/${r.version}`;break}i.push(o)}return i}exports.CrawlerOptions=f;
|
|
1
|
+
var{RezoQueue:f}=require("../queue/queue.cjs"),{Oxylabs:x}=require("./addon/oxylabs/index.cjs"),b=require("node:path"),y=require("node:os"),{Decodo:g}=require("./addon/decodo/index.cjs");class m{baseUrl;adapter;enableNavigationHistory;sessionId;rejectUnauthorized;userAgent;useRndUserAgent;timeout;maxRedirects;maxRetryAttempts;retryDelay;retryOnStatusCode;forceRevisit;retryWithoutProxyOnStatusCode;retryOnProxyError;maxRetryOnProxyError;allowRevisiting;enableCache;cacheTTL;cacheDir;throwFatalError;debug;enableSignalHandlers;concurrency;scraperConcurrency;maxDepth;maxUrls;maxResponseSize;respectRobotsTxt;followNofollow;autoThrottle;autoThrottleTargetDelay;autoThrottleMinDelay;autoThrottleMaxDelay;maxWaitOn429;alwaysWaitOn429;oxylabs=[];decodo=[];proxies=[];limiters=[];onLimiterAdded;requestHeaders=[];userAgents=A();constructor(e={}){this.baseUrl=e.baseUrl||"",this.adapter=e.adapter??"http",this.enableNavigationHistory=e.enableNavigationHistory??!1,this.sessionId=e.sessionId??`session_${Date.now()}_${Math.random().toString(36).slice(2,8)}`,this.rejectUnauthorized=e.rejectUnauthorized??!0,this.userAgent=e.userAgent,this.useRndUserAgent=e.useRndUserAgent??!1,this.timeout=e.timeout??30000,this.maxRedirects=e.maxRedirects??10,this.maxRetryAttempts=e.maxRetryAttempts??3,this.retryDelay=e.retryDelay??0,this.retryOnStatusCode=e.retryOnStatusCode??[408,429,500,502,503,504],this.forceRevisit=e.forceRevisit??!1,this.retryWithoutProxyOnStatusCode=e.retryWithoutProxyOnStatusCode??[407,403],this.retryOnProxyError=e.retryOnProxyError??!0,this.maxRetryOnProxyError=e.maxRetryOnProxyError??3,this.allowRevisiting=e.allowRevisiting??!1,this.enableCache=e.enableCache??!0,this.cacheTTL=e.cacheTTL??604800000,this.cacheDir=e.cacheDir??b.join(y.tmpdir(),"uiniqhtt_cache"),this.throwFatalError=e.throwFatalError??!1,this.debug=e.debug??!1,this.enableSignalHandlers=e.enableSignalHandlers??!1,this.concurrency=e.concurrency??100,this.scraperConcurrency=e.scraperConcurrency??this.concurrency,this.max
Depth=e.maxDepth??0,this.maxUrls=e.maxUrls??0,this.maxResponseSize=e.maxResponseSize??0,this.respectRobotsTxt=e.respectRobotsTxt??!1,this.followNofollow=e.followNofollow??!1,this.autoThrottle=e.autoThrottle??!0,this.autoThrottleTargetDelay=e.autoThrottleTargetDelay??1000,this.autoThrottleMinDelay=e.autoThrottleMinDelay??100,this.autoThrottleMaxDelay=e.autoThrottleMaxDelay??60000,this.maxWaitOn429=e.maxWaitOn429??1800000,this.alwaysWaitOn429=e.alwaysWaitOn429??!1,this._addHeaders(e.headers),this._addOxylabs(e.oxylabs),this._addDecodo(e.decodo),this._addProxies(e.proxy),this._addLimiters(e.limiter)}getConfiguredDomains(e){return(e==="headers"?this.requestHeaders:e==="limiters"?this.limiters:e==="oxylabs"?this.oxylabs:this.proxies).filter((s)=>s.domain).map((s)=>s.domain).filter((s,a,r)=>r.indexOf(s)===a)}removeDomain(e){return this.requestHeaders=this.requestHeaders.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.proxies=this.proxies.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.limiters=this.limiters.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.oxylabs=this.oxylabs.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this}_domainsEqual(e,t){if(Array.isArray(e)&&Array.isArray(t))return e.length===t.length&&e.every((s,a)=>s===t[a]);return e===t}getConfigurationSummary(){let e=(t)=>({total:t.length,global:t.filter((s)=>s.isGlobal).length,domainSpecific:t.filter((s)=>!s.isGlobal&&s.domain).length});return{headers:e(this.requestHeaders),proxies:e(this.proxies),limiters:e(this.limiters),oxylabs:e(this.oxylabs)}}_addHeaders(e){if(!e||!e.enable)return;for(let t of e.httpHeaders){let{domain:s,isGlobal:a,headers:r}=t;if(!s&&!a)continue;if(r instanceof Headers){let i=Object.fromEntries(r.entries());if(Object.keys(i).length<1)continue;r=i}else if(!r||Object.keys(r).length<1)continue;this.requestHeaders.push({domain:s,isGlobal:a,headers:r})}}_addProxies(e){if(!e||!e.enable)return;for(let t of 
e.proxies){let{domain:s,isGlobal:a,proxy:r}=t;if(!s&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.proxies.push({domain:s,isGlobal:a,proxy:r})}}_addLimiters(e){if(!e||!e.enable)return;for(let t of e.limiters){let{domain:s,isGlobal:a,options:r}=t;if(!s&&!a)continue;if(!r||Object.keys(r).length<1)continue;let i=new f(r);if(this.limiters.push({domain:s,isGlobal:a,pqueue:i,randomDelay:r.randomDelay}),this.onLimiterAdded)this.onLimiterAdded(i)}}_addOxylabs(e){if(!e||!e.enable)return;for(let t of e.labs){let{domain:s,isGlobal:a,options:r,queueOptions:i}=t;if(!s&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.oxylabs.push({domain:s,isGlobal:a,adaptar:new x(r)})}}_addDecodo(e){if(!e||!e.enable)return;for(let t of e.labs){let{domain:s,isGlobal:a,options:r,queueOptions:i}=t;if(!s&&!a)continue;if(!r||Object.keys(r).length<1)continue;this.decodo.push({domain:s,isGlobal:a,adaptar:new g(r)})}}addHeaders(e){return this._addHeaders({enable:!0,httpHeaders:[e]}),this}addProxy(e){return this._addProxies({enable:!0,proxies:[e]}),this}addLimiter(e){return this._addLimiters({enable:!0,limiters:[e]}),this}addOxylabs(e){return this._addOxylabs({enable:!0,labs:[e]}),this}addDecodo(e){return this._addDecodo({enable:!0,labs:[e]}),this}destroyLimiters(){for(let e of this.limiters)if(e.pqueue&&typeof e.pqueue.destroy==="function")e.pqueue.destroy();this.limiters=[]}getLimiters(){return this.limiters}getRandomDelay(e,t){if(!this.getDomainName(e))return;for(let a of this.limiters)if(this._hasDomain(e,a.domain)&&a.randomDelay!==void 0)return a.randomDelay;if(t){for(let a of this.limiters)if(a.isGlobal&&a.randomDelay!==void 0)return 
a.randomDelay}return}clearGlobalConfigs(){if(Array.isArray(this.requestHeaders))this.requestHeaders=this.requestHeaders.filter((e)=>!e.isGlobal);if(Array.isArray(this.oxylabs))this.oxylabs=this.oxylabs.filter((e)=>!e.isGlobal);if(Array.isArray(this.limiters))this.limiters=this.limiters.filter((e)=>!e.isGlobal);if(Array.isArray(this.proxies))this.proxies=this.proxies.filter((e)=>!e.isGlobal);return this}getAdapter(e,t,s,a){if(!this.getDomainName(e))return null;let i=[],o=t==="headers"?this.requestHeaders:t==="limiters"?this.limiters:t==="oxylabs"?this.oxylabs:t==="decodo"?this.decodo:this.proxies;for(let n=0;n<o.length;n++)if(this._hasDomain(e,o[n].domain))i.push(n);if(i.length){let n=a?i[this.rnd(0,i.length-1)]:i[0];return t==="headers"?this.requestHeaders[n].headers:t==="limiters"?this.limiters[n].pqueue:t==="oxylabs"?this.oxylabs[n].adaptar:t==="decodo"?this.decodo[n].adaptar:this.proxies[n].proxy}i.length=0;for(let n=0;n<o.length;n++)i.push(n);if(i.length){let n=a?i[this.rnd(0,i.length-1)]:i[0];if(o[n].isGlobal&&s)return t==="headers"?this.requestHeaders[n].headers:t==="limiters"?this.limiters[n].pqueue:t==="oxylabs"?this.oxylabs[n].adaptar:t==="decodo"?this.decodo[n].adaptar:this.proxies[n].proxy}return null}rnd(e=0,t=Number.MAX_VALUE){return Math.floor(Math.random()*(t-e+1))+e}hasDomain(e,t,s){if(!this.getDomainName(e))return!1;let r=t==="headers"?this.requestHeaders:t==="limiters"?this.limiters:t==="oxylabs"?this.oxylabs:t==="decodo"?this.decodo:this.proxies;for(let i=0;i<r.length;i++)if(this._hasDomain(e,r[i].domain))return!0;if(s){for(let i=0;i<r.length;i++)if(r[i].isGlobal)return!0}return!1}pickHeaders(e,t,s,a){let r=this.getAdapter(e,"headers",t),i=new Headers(r??{}),o=i.count;if(s&&s instanceof Headers)for(let[n,l]of Object.entries(s.entries()))i.set(n,l);else if(s&&typeof s==="object"){for(let[n,l]of Object.entries(s))if(typeof l==="string")i.set(n,l)}if(a&&o===0)i.set("user-agent",this.getRandomUserAgent());return 
Object.fromEntries(i.entries())}_hasDomain(e,t){if(!t)return!1;let s=this.getDomainName(e);if(!s)return!1;let a=(i)=>{return/[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(i)||i.startsWith("/")||i.includes(".*")||i.includes(".+")},r=(i)=>{if(i instanceof RegExp)return i.test(s)||i.test(e);let o=i.toString().trim();if(s.toLowerCase()===o.toLowerCase())return!0;if(o.includes("*")){let h=o.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\\*/g,".*"),d=new RegExp(`^${h}$`,"i");return d.test(s)||d.test(e)}if(a(o))try{let h=o,d="i",c=o.match(/^\/(.*)\/(\w*)$/);if(c)h=c[1],d=c[2]||"i";let u=new RegExp(h,d);return u.test(s)||u.test(e)}catch(h){return s.toLowerCase().includes(o.toLowerCase())}let n=s.toLowerCase(),l=o.toLowerCase();return n===l||n.endsWith("."+l)||l.endsWith("."+n)};if(Array.isArray(t)){for(let i of t)if(r(i))return!0;return!1}return r(t)}getDomainName(e){if(this.isValidUrl(e))return new URL(e).hostname;else if(this.isHostName(e))return e;return null}isHostName(e){if(!e)return!1;if(e.length>255)return!1;let t=/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+ [a-zA-Z]{2,})$/;return e=e.trim().toLowerCase(),t.test(e)&&!e.startsWith("-")&&!e.endsWith("-")}isValidUrl(e){if(!e)return!1;e=e.trim();try{let t=new URL(e);if(!t.protocol||!["http:","https:"].includes(t.protocol.toLowerCase()))return!1;if(!t.hostname)return!1;if(!/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/.test(t.hostname))return!1;return!0}catch{return!1}}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function A(){let 
e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],t=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],s=[];for(let a=0;a<200;a++){let r=e[Math.floor(Math.random()*e.length)],i=t[Math.floor(Math.random()*t.length)],o="";switch(r.name){case"Chrome":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36`;break;case"Firefox":o=`Mozilla/5.0 (${i}; rv:${r.version}) ${r.engine} Firefox/${r.version}`;break;case"Safari":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Version/${r.version} Safari/605.1.15`;break;case"Edge":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Edg/${r.version}`;break;case"Opera":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 OPR/${r.version}`;break;case"Vivaldi":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Vivaldi/${r.version}`;break;case"Brave":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Brave/${r.version}`;break;case"Chromium":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chromium/${r.version} Chrome/${r.version} Safari/537.36`;break;case"Yandex":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} YaBrowser/${r.version} 
Safari/537.36`;break;case"Maxthon":o=`Mozilla/5.0 (${i}) ${r.engine} (KHTML, like Gecko) Chrome/${r.version} Safari/537.36 Maxthon/${r.version}`;break}s.push(o)}return s}exports.CrawlerOptions=m;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
import { RezoQueue } from "../queue/queue.js";
import { Oxylabs } from "./addon/oxylabs/index.js";
import path from "node:path";
import os from "node:os";
import { Decodo } from "./addon/decodo/index.js";

/**
 * Shape of an accepted hostname: one or more dot-terminated labels followed by
 * an alphabetic TLD of at least two letters. IP literals and single-label
 * names such as "localhost" do not match.
 */
const HOSTNAME_PATTERN = /^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/;

/**
 * Crawler configuration: scalar settings with defaults plus per-domain / global
 * lists of request headers, proxies, rate-limiter queues and Oxylabs / Decodo
 * adapters.
 */
class CrawlerOptions {
  baseUrl;
  adapter;
  enableNavigationHistory;
  sessionId;
  rejectUnauthorized;
  userAgent;
  useRndUserAgent;
  timeout;
  maxRedirects;
  maxRetryAttempts;
  retryDelay;
  retryOnStatusCode;
  forceRevisit;
  retryWithoutProxyOnStatusCode;
  retryOnProxyError;
  maxRetryOnProxyError;
  allowRevisiting;
  enableCache;
  cacheTTL;
  cacheDir;
  throwFatalError;
  debug;
  enableSignalHandlers;
  concurrency;
  scraperConcurrency;
  maxDepth;
  maxUrls;
  maxResponseSize;
  respectRobotsTxt;
  followNofollow;
  autoThrottle;
  autoThrottleTargetDelay;
  autoThrottleMinDelay;
  autoThrottleMaxDelay;
  maxWaitOn429;
  alwaysWaitOn429;
  oxylabs = [];
  decodo = [];
  proxies = [];
  limiters = [];
  requestHeaders = [];
  userAgents = generateUserAgents();

  /**
   * @param options Plain options object; every field is optional and falls
   *   back to the default written next to it below.
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || "";
    this.adapter = options.adapter ?? "http";
    this.enableNavigationHistory = options.enableNavigationHistory ?? false;
    this.sessionId = options.sessionId ?? `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    this.rejectUnauthorized = options.rejectUnauthorized ?? true;
    this.userAgent = options.userAgent;
    this.useRndUserAgent = options.useRndUserAgent ?? false;
    this.timeout = options.timeout ?? 30000;
    this.maxRedirects = options.maxRedirects ?? 10;
    this.maxRetryAttempts = options.maxRetryAttempts ?? 3;
    this.retryDelay = options.retryDelay ?? 0;
    this.retryOnStatusCode = options.retryOnStatusCode ?? [408, 429, 500, 502, 503, 504];
    this.forceRevisit = options.forceRevisit ?? false;
    this.retryWithoutProxyOnStatusCode = options.retryWithoutProxyOnStatusCode ?? [407, 403];
    this.retryOnProxyError = options.retryOnProxyError ?? true;
    this.maxRetryOnProxyError = options.maxRetryOnProxyError ?? 3;
    this.allowRevisiting = options.allowRevisiting ?? false;
    this.enableCache = options.enableCache ?? true;
    this.cacheTTL = options.cacheTTL ?? 604800000;
    this.cacheDir = options.cacheDir ?? path.join(os.tmpdir(), "uiniqhtt_cache");
    this.throwFatalError = options.throwFatalError ?? false;
    this.debug = options.debug ?? false;
    this.enableSignalHandlers = options.enableSignalHandlers ?? false;
    this.concurrency = options.concurrency ?? 100;
    // Must come after `concurrency`, which is its fallback.
    this.scraperConcurrency = options.scraperConcurrency ?? this.concurrency;
    this.maxDepth = options.maxDepth ?? 0;
    this.maxUrls = options.maxUrls ?? 0;
    this.maxResponseSize = options.maxResponseSize ?? 0;
    this.respectRobotsTxt = options.respectRobotsTxt ?? false;
    this.followNofollow = options.followNofollow ?? false;
    this.autoThrottle = options.autoThrottle ?? true;
    this.autoThrottleTargetDelay = options.autoThrottleTargetDelay ?? 1000;
    this.autoThrottleMinDelay = options.autoThrottleMinDelay ?? 100;
    this.autoThrottleMaxDelay = options.autoThrottleMaxDelay ?? 60000;
    this.maxWaitOn429 = options.maxWaitOn429 ?? 1800000;
    this.alwaysWaitOn429 = options.alwaysWaitOn429 ?? false;
    this._addHeaders(options.headers);
    this._addOxylabs(options.oxylabs);
    this._addDecodo(options.decodo);
    this._addProxies(options.proxy);
    this._addLimiters(options.limiter);
  }

  /** Maps a config type name to its backing list; unknown names fall back to proxies. */
  _listFor(type) {
    switch (type) {
      case "headers":
        return this.requestHeaders;
      case "limiters":
        return this.limiters;
      case "oxylabs":
        return this.oxylabs;
      case "decodo":
        return this.decodo;
      default:
        return this.proxies;
    }
  }

  /** Extracts the value `getAdapter` hands out for an entry of the given type. */
  _payloadOf(type, entry) {
    switch (type) {
      case "headers":
        return entry.headers;
      case "limiters":
        return entry.pqueue;
      case "oxylabs":
      case "decodo":
        return entry.adaptar;
      default:
        return entry.proxy;
    }
  }

  /**
   * Lists the distinct `domain` values configured for a type.
   * "decodo" is now resolved to the decodo list instead of silently falling
   * through to proxies.
   */
  getConfiguredDomains(type) {
    const domains = this._listFor(type)
      .filter((entry) => entry.domain)
      .map((entry) => entry.domain);
    return domains.filter((domain, index, all) => all.indexOf(domain) === index);
  }

  /**
   * Drops every entry whose `domain` equals the given one (see `_domainsEqual`)
   * from all lists, including decodo. Entries without a domain are kept.
   * @returns this, for chaining.
   */
  removeDomain(domain) {
    const keep = (entry) => !entry.domain || !this._domainsEqual(entry.domain, domain);
    this.requestHeaders = this.requestHeaders.filter(keep);
    this.proxies = this.proxies.filter(keep);
    this.limiters = this.limiters.filter(keep);
    this.oxylabs = this.oxylabs.filter(keep);
    this.decodo = this.decodo.filter(keep);
    return this;
  }

  /** Element-wise equality for two arrays, strict equality otherwise. */
  _domainsEqual(left, right) {
    if (Array.isArray(left) && Array.isArray(right)) {
      return left.length === right.length && left.every((item, index) => item === right[index]);
    }
    return left === right;
  }

  /**
   * Counts total / global / domain-specific entries per list.
   * The `decodo` key is an addition; the existing keys are unchanged.
   */
  getConfigurationSummary() {
    const summarize = (entries) => ({
      total: entries.length,
      global: entries.filter((entry) => entry.isGlobal).length,
      domainSpecific: entries.filter((entry) => !entry.isGlobal && entry.domain).length
    });
    return {
      headers: summarize(this.requestHeaders),
      proxies: summarize(this.proxies),
      limiters: summarize(this.limiters),
      oxylabs: summarize(this.oxylabs),
      decodo: summarize(this.decodo)
    };
  }

  /** Registers header sets; entries need a domain or the global flag, and at least one header. */
  _addHeaders(config) {
    if (!config || !config.enable) return;
    for (const item of config.httpHeaders ?? []) {
      const { domain, isGlobal } = item;
      let { headers } = item;
      if (!domain && !isGlobal) continue;
      // Normalise Headers instances to a plain object before storing.
      if (headers instanceof Headers) headers = Object.fromEntries(headers.entries());
      if (!headers || Object.keys(headers).length < 1) continue;
      this.requestHeaders.push({ domain, isGlobal, headers });
    }
  }

  /** Registers proxies; entries need a domain or the global flag, and a non-empty proxy object. */
  _addProxies(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, proxy } of config.proxies ?? []) {
      if (!domain && !isGlobal) continue;
      if (!proxy || Object.keys(proxy).length < 1) continue;
      this.proxies.push({ domain, isGlobal, proxy });
    }
  }

  /** Creates one RezoQueue per valid limiter entry and stores it as `pqueue`. */
  _addLimiters(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.limiters ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      this.limiters.push({ domain, isGlobal, pqueue: new RezoQueue(options) });
    }
  }

  /** Creates one Oxylabs adapter per valid entry (stored under the existing `adaptar` key). */
  _addOxylabs(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.labs ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      this.oxylabs.push({ domain, isGlobal, adaptar: new Oxylabs(options) });
    }
  }

  /** Creates one Decodo adapter per valid entry (stored under the existing `adaptar` key). */
  _addDecodo(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.labs ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      this.decodo.push({ domain, isGlobal, adaptar: new Decodo(options) });
    }
  }

  addHeaders(entry) {
    this._addHeaders({ enable: true, httpHeaders: [entry] });
    return this;
  }

  addProxy(entry) {
    this._addProxies({ enable: true, proxies: [entry] });
    return this;
  }

  addLimiter(entry) {
    this._addLimiters({ enable: true, limiters: [entry] });
    return this;
  }

  addOxylabs(entry) {
    this._addOxylabs({ enable: true, labs: [entry] });
    return this;
  }

  addDecodo(entry) {
    this._addDecodo({ enable: true, labs: [entry] });
    return this;
  }

  /** Calls `destroy()` on every limiter queue that has one, then empties the list. */
  destroyLimiters() {
    for (const limiter of this.limiters) {
      if (limiter.pqueue && typeof limiter.pqueue.destroy === "function") limiter.pqueue.destroy();
    }
    this.limiters = [];
  }

  /**
   * Removes every entry flagged `isGlobal` from all lists, including decodo.
   * @returns this, for chaining.
   */
  clearGlobalConfigs() {
    const notGlobal = (entry) => !entry.isGlobal;
    if (Array.isArray(this.requestHeaders)) this.requestHeaders = this.requestHeaders.filter(notGlobal);
    if (Array.isArray(this.oxylabs)) this.oxylabs = this.oxylabs.filter(notGlobal);
    if (Array.isArray(this.decodo)) this.decodo = this.decodo.filter(notGlobal);
    if (Array.isArray(this.limiters)) this.limiters = this.limiters.filter(notGlobal);
    if (Array.isArray(this.proxies)) this.proxies = this.proxies.filter(notGlobal);
    return this;
  }

  /**
   * Resolves the headers / queue / adapter / proxy to use for a URL.
   * Domain-specific entries win; when none match and `useGlobal` is truthy the
   * choice is made among entries flagged `isGlobal` (previously the fallback
   * drew from *all* entries and returned null whenever the drawn one was not
   * global, contradicting `hasDomain`).
   * @param url URL or hostname.
   * @param type "headers" | "limiters" | "oxylabs" | "decodo"; anything else means proxies.
   * @param useGlobal Allow falling back to global entries.
   * @param random Pick a random candidate instead of the first one.
   * @returns The entry payload, or null when nothing applies.
   */
  getAdapter(url, type, useGlobal, random) {
    if (!this.getDomainName(url)) return null;
    const entries = this._listFor(type);
    let candidates = entries.filter((entry) => this._hasDomain(url, entry.domain));
    if (candidates.length === 0 && useGlobal) {
      candidates = entries.filter((entry) => entry.isGlobal);
    }
    if (candidates.length === 0) return null;
    const chosen = random ? candidates[this.rnd(0, candidates.length - 1)] : candidates[0];
    return this._payloadOf(type, chosen);
  }

  /** Random integer in the inclusive range [min, max]. */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /** True when a domain-specific entry matches, or (with `useGlobal`) any global entry exists. */
  hasDomain(url, type, useGlobal) {
    if (!this.getDomainName(url)) return false;
    const entries = this._listFor(type);
    if (entries.some((entry) => this._hasDomain(url, entry.domain))) return true;
    return Boolean(useGlobal) && entries.some((entry) => entry.isGlobal);
  }

  /**
   * Builds the header object for a request.
   * @param url URL or hostname used to look up configured headers.
   * @param useGlobal Forwarded to `getAdapter` as its global-fallback flag.
   * @param customHeaders Optional `Headers` instance or plain object layered on
   *   top; for plain objects only string values are applied.
   * @param useRandomUserAgent When truthy and no configured headers were found,
   *   a random User-Agent is added unless the custom headers already carry one.
   * @returns Plain object of header name → value.
   */
  pickHeaders(url, useGlobal, customHeaders, useRandomUserAgent) {
    const configured = this.getAdapter(url, "headers", useGlobal);
    const merged = new Headers(configured ?? {});
    // `Headers` has no `count` property; count the entries explicitly.
    const configuredCount = [...merged.keys()].length;
    if (customHeaders instanceof Headers) {
      // Iterate the Headers directly: Object.entries() of its iterator is always empty.
      for (const [name, value] of customHeaders.entries()) merged.set(name, value);
    } else if (customHeaders && typeof customHeaders === "object") {
      for (const [name, value] of Object.entries(customHeaders)) {
        if (typeof value === "string") merged.set(name, value);
      }
    }
    if (useRandomUserAgent && configuredCount === 0 && !merged.has("user-agent")) {
      merged.set("user-agent", this.getRandomUserAgent());
    }
    return Object.fromEntries(merged.entries());
  }

  /**
   * Tests a URL/hostname against a configured domain (string, RegExp, or array
   * of either). Strings match when equal (case-insensitive), as `*` globs, as
   * regex-looking text or `/source/flags` literals, or as parent/sub-domain of
   * the hostname.
   */
  _hasDomain(url, domain) {
    if (!domain) return false;
    const hostname = this.getDomainName(url);
    if (!hostname) return false;
    const host = hostname.toLowerCase();
    const looksLikeRegex = (text) =>
      /[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(text) ||
      text.startsWith("/") ||
      text.includes(".*") ||
      text.includes(".+");
    const matches = (candidate) => {
      if (candidate instanceof RegExp) return candidate.test(hostname) || candidate.test(url);
      const pattern = candidate.toString().trim();
      const lowered = pattern.toLowerCase();
      if (host === lowered) return true;
      if (pattern.includes("*")) {
        // Escape everything, then turn the escaped `*` back into a wildcard.
        const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
        const glob = new RegExp(`^${source}$`, "i");
        return glob.test(hostname) || glob.test(url);
      }
      if (looksLikeRegex(pattern)) {
        try {
          let source = pattern;
          let flags = "i";
          const literal = pattern.match(/^\/(.*)\/(\w*)$/);
          if (literal) {
            source = literal[1];
            flags = literal[2] || "i";
          }
          const regex = new RegExp(source, flags);
          return regex.test(hostname) || regex.test(url);
        } catch {
          // Not a valid regex after all: fall back to a substring check.
          return host.includes(lowered);
        }
      }
      return host === lowered || host.endsWith("." + lowered) || lowered.endsWith("." + host);
    };
    if (Array.isArray(domain)) {
      for (const candidate of domain) {
        if (matches(candidate)) return true;
      }
      return false;
    }
    return matches(domain);
  }

  /** Hostname of a valid http(s) URL, the input itself if it is a bare hostname, else null. */
  getDomainName(value) {
    if (this.isValidUrl(value)) return new URL(value).hostname;
    if (this.isHostName(value)) return value;
    return null;
  }

  /**
   * True for bare hostnames such as "example.com".
   * The pattern previously contained a stray space before the TLD group, so no
   * real hostname could ever match.
   */
  isHostName(value) {
    if (!value) return false;
    if (value.length > 255) return false;
    const normalized = value.trim().toLowerCase();
    return HOSTNAME_PATTERN.test(normalized) && !normalized.startsWith("-") && !normalized.endsWith("-");
  }

  /** True for http/https URLs whose hostname matches HOSTNAME_PATTERN. */
  isValidUrl(value) {
    if (!value) return false;
    const trimmed = value.trim();
    try {
      const parsed = new URL(trimmed);
      if (!parsed.protocol || !["http:", "https:"].includes(parsed.protocol.toLowerCase())) return false;
      if (!parsed.hostname) return false;
      return HOSTNAME_PATTERN.test(parsed.hostname);
    } catch {
      return false;
    }
  }

  /** Uniformly random entry from the pre-generated User-Agent pool. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

/**
 * Builds a pool of 200 User-Agent strings by pairing a random browser profile
 * with a random platform token.
 */
function generateUserAgents() {
  const browsers = [
    { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
    { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
    { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
    { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
    { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
    { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
    { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
    { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
    { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
    { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" }
  ];
  const platforms = [
    "Windows NT 10.0",
    "Windows NT 6.1",
    "Macintosh; Intel Mac OS X 10_15_7",
    "Macintosh; Intel Mac OS X 11_4_0",
    "X11; Linux x86_64",
    "X11; Ubuntu; Linux x86_64"
  ];
  // One formatter per browser name: (platform, engine, version) -> UA string.
  const formatters = {
    Chrome: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36`,
    Firefox: (p, e, v) => `Mozilla/5.0 (${p}; rv:${v}) ${e} Firefox/${v}`,
    Safari: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Version/${v} Safari/605.1.15`,
    Edge: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Edg/${v}`,
    Opera: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 OPR/${v}`,
    Vivaldi: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Vivaldi/${v}`,
    Brave: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Brave/${v}`,
    Chromium: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chromium/${v} Chrome/${v} Safari/537.36`,
    Yandex: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} YaBrowser/${v} Safari/537.36`,
    Maxthon: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Maxthon/${v}`
  };
  const pool = [];
  for (let n = 0; n < 200; n++) {
    const browser = browsers[Math.floor(Math.random() * browsers.length)];
    const platform = platforms[Math.floor(Math.random() * platforms.length)];
    pool.push(formatters[browser.name](platform, browser.engine, browser.version));
  }
  return pool;
}

export { CrawlerOptions };
|
|
1
|
+
import { RezoQueue } from "../queue/queue.js";
import { Oxylabs } from "./addon/oxylabs/index.js";
import path from "node:path";
import os from "node:os";
import { Decodo } from "./addon/decodo/index.js";

/**
 * Shape of an accepted hostname: one or more dot-terminated labels followed by
 * an alphabetic TLD of at least two letters. IP literals and single-label
 * names such as "localhost" do not match.
 */
const HOSTNAME_PATTERN = /^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/;

/**
 * Crawler configuration: scalar settings with defaults plus per-domain / global
 * lists of request headers, proxies, rate-limiter queues and Oxylabs / Decodo
 * adapters.
 */
class CrawlerOptions {
  baseUrl;
  adapter;
  enableNavigationHistory;
  sessionId;
  rejectUnauthorized;
  userAgent;
  useRndUserAgent;
  timeout;
  maxRedirects;
  maxRetryAttempts;
  retryDelay;
  retryOnStatusCode;
  forceRevisit;
  retryWithoutProxyOnStatusCode;
  retryOnProxyError;
  maxRetryOnProxyError;
  allowRevisiting;
  enableCache;
  cacheTTL;
  cacheDir;
  throwFatalError;
  debug;
  enableSignalHandlers;
  concurrency;
  scraperConcurrency;
  maxDepth;
  maxUrls;
  maxResponseSize;
  respectRobotsTxt;
  followNofollow;
  autoThrottle;
  autoThrottleTargetDelay;
  autoThrottleMinDelay;
  autoThrottleMaxDelay;
  maxWaitOn429;
  alwaysWaitOn429;
  oxylabs = [];
  decodo = [];
  proxies = [];
  limiters = [];
  /** Optional callback invoked with each newly created limiter queue. */
  onLimiterAdded;
  requestHeaders = [];
  userAgents = generateUserAgents();

  /**
   * @param options Plain options object; every field is optional and falls
   *   back to the default written next to it below.
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || "";
    this.adapter = options.adapter ?? "http";
    this.enableNavigationHistory = options.enableNavigationHistory ?? false;
    this.sessionId = options.sessionId ?? `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    this.rejectUnauthorized = options.rejectUnauthorized ?? true;
    this.userAgent = options.userAgent;
    this.useRndUserAgent = options.useRndUserAgent ?? false;
    this.timeout = options.timeout ?? 30000;
    this.maxRedirects = options.maxRedirects ?? 10;
    this.maxRetryAttempts = options.maxRetryAttempts ?? 3;
    this.retryDelay = options.retryDelay ?? 0;
    this.retryOnStatusCode = options.retryOnStatusCode ?? [408, 429, 500, 502, 503, 504];
    this.forceRevisit = options.forceRevisit ?? false;
    this.retryWithoutProxyOnStatusCode = options.retryWithoutProxyOnStatusCode ?? [407, 403];
    this.retryOnProxyError = options.retryOnProxyError ?? true;
    this.maxRetryOnProxyError = options.maxRetryOnProxyError ?? 3;
    this.allowRevisiting = options.allowRevisiting ?? false;
    this.enableCache = options.enableCache ?? true;
    this.cacheTTL = options.cacheTTL ?? 604800000;
    this.cacheDir = options.cacheDir ?? path.join(os.tmpdir(), "uiniqhtt_cache");
    this.throwFatalError = options.throwFatalError ?? false;
    this.debug = options.debug ?? false;
    this.enableSignalHandlers = options.enableSignalHandlers ?? false;
    this.concurrency = options.concurrency ?? 100;
    // Must come after `concurrency`, which is its fallback.
    this.scraperConcurrency = options.scraperConcurrency ?? this.concurrency;
    this.maxDepth = options.maxDepth ?? 0;
    this.maxUrls = options.maxUrls ?? 0;
    this.maxResponseSize = options.maxResponseSize ?? 0;
    this.respectRobotsTxt = options.respectRobotsTxt ?? false;
    this.followNofollow = options.followNofollow ?? false;
    this.autoThrottle = options.autoThrottle ?? true;
    this.autoThrottleTargetDelay = options.autoThrottleTargetDelay ?? 1000;
    this.autoThrottleMinDelay = options.autoThrottleMinDelay ?? 100;
    this.autoThrottleMaxDelay = options.autoThrottleMaxDelay ?? 60000;
    this.maxWaitOn429 = options.maxWaitOn429 ?? 1800000;
    this.alwaysWaitOn429 = options.alwaysWaitOn429 ?? false;
    this._addHeaders(options.headers);
    this._addOxylabs(options.oxylabs);
    this._addDecodo(options.decodo);
    this._addProxies(options.proxy);
    this._addLimiters(options.limiter);
  }

  /** Maps a config type name to its backing list; unknown names fall back to proxies. */
  _listFor(type) {
    switch (type) {
      case "headers":
        return this.requestHeaders;
      case "limiters":
        return this.limiters;
      case "oxylabs":
        return this.oxylabs;
      case "decodo":
        return this.decodo;
      default:
        return this.proxies;
    }
  }

  /** Extracts the value `getAdapter` hands out for an entry of the given type. */
  _payloadOf(type, entry) {
    switch (type) {
      case "headers":
        return entry.headers;
      case "limiters":
        return entry.pqueue;
      case "oxylabs":
      case "decodo":
        return entry.adaptar;
      default:
        return entry.proxy;
    }
  }

  /**
   * Lists the distinct `domain` values configured for a type.
   * "decodo" is now resolved to the decodo list instead of silently falling
   * through to proxies.
   */
  getConfiguredDomains(type) {
    const domains = this._listFor(type)
      .filter((entry) => entry.domain)
      .map((entry) => entry.domain);
    return domains.filter((domain, index, all) => all.indexOf(domain) === index);
  }

  /**
   * Drops every entry whose `domain` equals the given one (see `_domainsEqual`)
   * from all lists, including decodo. Entries without a domain are kept.
   * @returns this, for chaining.
   */
  removeDomain(domain) {
    const keep = (entry) => !entry.domain || !this._domainsEqual(entry.domain, domain);
    this.requestHeaders = this.requestHeaders.filter(keep);
    this.proxies = this.proxies.filter(keep);
    this.limiters = this.limiters.filter(keep);
    this.oxylabs = this.oxylabs.filter(keep);
    this.decodo = this.decodo.filter(keep);
    return this;
  }

  /** Element-wise equality for two arrays, strict equality otherwise. */
  _domainsEqual(left, right) {
    if (Array.isArray(left) && Array.isArray(right)) {
      return left.length === right.length && left.every((item, index) => item === right[index]);
    }
    return left === right;
  }

  /**
   * Counts total / global / domain-specific entries per list.
   * The `decodo` key is an addition; the existing keys are unchanged.
   */
  getConfigurationSummary() {
    const summarize = (entries) => ({
      total: entries.length,
      global: entries.filter((entry) => entry.isGlobal).length,
      domainSpecific: entries.filter((entry) => !entry.isGlobal && entry.domain).length
    });
    return {
      headers: summarize(this.requestHeaders),
      proxies: summarize(this.proxies),
      limiters: summarize(this.limiters),
      oxylabs: summarize(this.oxylabs),
      decodo: summarize(this.decodo)
    };
  }

  /** Registers header sets; entries need a domain or the global flag, and at least one header. */
  _addHeaders(config) {
    if (!config || !config.enable) return;
    for (const item of config.httpHeaders ?? []) {
      const { domain, isGlobal } = item;
      let { headers } = item;
      if (!domain && !isGlobal) continue;
      // Normalise Headers instances to a plain object before storing.
      if (headers instanceof Headers) headers = Object.fromEntries(headers.entries());
      if (!headers || Object.keys(headers).length < 1) continue;
      this.requestHeaders.push({ domain, isGlobal, headers });
    }
  }

  /** Registers proxies; entries need a domain or the global flag, and a non-empty proxy object. */
  _addProxies(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, proxy } of config.proxies ?? []) {
      if (!domain && !isGlobal) continue;
      if (!proxy || Object.keys(proxy).length < 1) continue;
      this.proxies.push({ domain, isGlobal, proxy });
    }
  }

  /**
   * Creates one RezoQueue per valid limiter entry, records its `randomDelay`
   * option alongside it, and reports the new queue through `onLimiterAdded`
   * when that callback is set.
   */
  _addLimiters(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.limiters ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      const queue = new RezoQueue(options);
      this.limiters.push({ domain, isGlobal, pqueue: queue, randomDelay: options.randomDelay });
      if (this.onLimiterAdded) this.onLimiterAdded(queue);
    }
  }

  /** Creates one Oxylabs adapter per valid entry (stored under the existing `adaptar` key). */
  _addOxylabs(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.labs ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      this.oxylabs.push({ domain, isGlobal, adaptar: new Oxylabs(options) });
    }
  }

  /** Creates one Decodo adapter per valid entry (stored under the existing `adaptar` key). */
  _addDecodo(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options } of config.labs ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      this.decodo.push({ domain, isGlobal, adaptar: new Decodo(options) });
    }
  }

  addHeaders(entry) {
    this._addHeaders({ enable: true, httpHeaders: [entry] });
    return this;
  }

  addProxy(entry) {
    this._addProxies({ enable: true, proxies: [entry] });
    return this;
  }

  addLimiter(entry) {
    this._addLimiters({ enable: true, limiters: [entry] });
    return this;
  }

  addOxylabs(entry) {
    this._addOxylabs({ enable: true, labs: [entry] });
    return this;
  }

  addDecodo(entry) {
    this._addDecodo({ enable: true, labs: [entry] });
    return this;
  }

  /** Calls `destroy()` on every limiter queue that has one, then empties the list. */
  destroyLimiters() {
    for (const limiter of this.limiters) {
      if (limiter.pqueue && typeof limiter.pqueue.destroy === "function") limiter.pqueue.destroy();
    }
    this.limiters = [];
  }

  /** Returns the live limiter list (not a copy). */
  getLimiters() {
    return this.limiters;
  }

  /**
   * Looks up the `randomDelay` configured for a URL: the first domain-matching
   * limiter that defines one, else (with `useGlobal`) the first global limiter
   * that defines one.
   * @returns The configured value, or undefined when none applies.
   */
  getRandomDelay(url, useGlobal) {
    if (!this.getDomainName(url)) return undefined;
    for (const limiter of this.limiters) {
      if (this._hasDomain(url, limiter.domain) && limiter.randomDelay !== undefined) {
        return limiter.randomDelay;
      }
    }
    if (useGlobal) {
      for (const limiter of this.limiters) {
        if (limiter.isGlobal && limiter.randomDelay !== undefined) return limiter.randomDelay;
      }
    }
    return undefined;
  }

  /**
   * Removes every entry flagged `isGlobal` from all lists, including decodo.
   * @returns this, for chaining.
   */
  clearGlobalConfigs() {
    const notGlobal = (entry) => !entry.isGlobal;
    if (Array.isArray(this.requestHeaders)) this.requestHeaders = this.requestHeaders.filter(notGlobal);
    if (Array.isArray(this.oxylabs)) this.oxylabs = this.oxylabs.filter(notGlobal);
    if (Array.isArray(this.decodo)) this.decodo = this.decodo.filter(notGlobal);
    if (Array.isArray(this.limiters)) this.limiters = this.limiters.filter(notGlobal);
    if (Array.isArray(this.proxies)) this.proxies = this.proxies.filter(notGlobal);
    return this;
  }

  /**
   * Resolves the headers / queue / adapter / proxy to use for a URL.
   * Domain-specific entries win; when none match and `useGlobal` is truthy the
   * choice is made among entries flagged `isGlobal` (previously the fallback
   * drew from *all* entries and returned null whenever the drawn one was not
   * global, contradicting `hasDomain`).
   * @param url URL or hostname.
   * @param type "headers" | "limiters" | "oxylabs" | "decodo"; anything else means proxies.
   * @param useGlobal Allow falling back to global entries.
   * @param random Pick a random candidate instead of the first one.
   * @returns The entry payload, or null when nothing applies.
   */
  getAdapter(url, type, useGlobal, random) {
    if (!this.getDomainName(url)) return null;
    const entries = this._listFor(type);
    let candidates = entries.filter((entry) => this._hasDomain(url, entry.domain));
    if (candidates.length === 0 && useGlobal) {
      candidates = entries.filter((entry) => entry.isGlobal);
    }
    if (candidates.length === 0) return null;
    const chosen = random ? candidates[this.rnd(0, candidates.length - 1)] : candidates[0];
    return this._payloadOf(type, chosen);
  }

  /** Random integer in the inclusive range [min, max]. */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /** True when a domain-specific entry matches, or (with `useGlobal`) any global entry exists. */
  hasDomain(url, type, useGlobal) {
    if (!this.getDomainName(url)) return false;
    const entries = this._listFor(type);
    if (entries.some((entry) => this._hasDomain(url, entry.domain))) return true;
    return Boolean(useGlobal) && entries.some((entry) => entry.isGlobal);
  }

  /**
   * Builds the header object for a request.
   * @param url URL or hostname used to look up configured headers.
   * @param useGlobal Forwarded to `getAdapter` as its global-fallback flag.
   * @param customHeaders Optional `Headers` instance or plain object layered on
   *   top; for plain objects only string values are applied.
   * @param useRandomUserAgent When truthy and no configured headers were found,
   *   a random User-Agent is added unless the custom headers already carry one.
   * @returns Plain object of header name → value.
   */
  pickHeaders(url, useGlobal, customHeaders, useRandomUserAgent) {
    const configured = this.getAdapter(url, "headers", useGlobal);
    const merged = new Headers(configured ?? {});
    // `Headers` has no `count` property; count the entries explicitly.
    const configuredCount = [...merged.keys()].length;
    if (customHeaders instanceof Headers) {
      // Iterate the Headers directly: Object.entries() of its iterator is always empty.
      for (const [name, value] of customHeaders.entries()) merged.set(name, value);
    } else if (customHeaders && typeof customHeaders === "object") {
      for (const [name, value] of Object.entries(customHeaders)) {
        if (typeof value === "string") merged.set(name, value);
      }
    }
    if (useRandomUserAgent && configuredCount === 0 && !merged.has("user-agent")) {
      merged.set("user-agent", this.getRandomUserAgent());
    }
    return Object.fromEntries(merged.entries());
  }

  /**
   * Tests a URL/hostname against a configured domain (string, RegExp, or array
   * of either). Strings match when equal (case-insensitive), as `*` globs, as
   * regex-looking text or `/source/flags` literals, or as parent/sub-domain of
   * the hostname.
   */
  _hasDomain(url, domain) {
    if (!domain) return false;
    const hostname = this.getDomainName(url);
    if (!hostname) return false;
    const host = hostname.toLowerCase();
    const looksLikeRegex = (text) =>
      /[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(text) ||
      text.startsWith("/") ||
      text.includes(".*") ||
      text.includes(".+");
    const matches = (candidate) => {
      if (candidate instanceof RegExp) return candidate.test(hostname) || candidate.test(url);
      const pattern = candidate.toString().trim();
      const lowered = pattern.toLowerCase();
      if (host === lowered) return true;
      if (pattern.includes("*")) {
        // Escape everything, then turn the escaped `*` back into a wildcard.
        const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
        const glob = new RegExp(`^${source}$`, "i");
        return glob.test(hostname) || glob.test(url);
      }
      if (looksLikeRegex(pattern)) {
        try {
          let source = pattern;
          let flags = "i";
          const literal = pattern.match(/^\/(.*)\/(\w*)$/);
          if (literal) {
            source = literal[1];
            flags = literal[2] || "i";
          }
          const regex = new RegExp(source, flags);
          return regex.test(hostname) || regex.test(url);
        } catch {
          // Not a valid regex after all: fall back to a substring check.
          return host.includes(lowered);
        }
      }
      return host === lowered || host.endsWith("." + lowered) || lowered.endsWith("." + host);
    };
    if (Array.isArray(domain)) {
      for (const candidate of domain) {
        if (matches(candidate)) return true;
      }
      return false;
    }
    return matches(domain);
  }

  /** Hostname of a valid http(s) URL, the input itself if it is a bare hostname, else null. */
  getDomainName(value) {
    if (this.isValidUrl(value)) return new URL(value).hostname;
    if (this.isHostName(value)) return value;
    return null;
  }

  /**
   * True for bare hostnames such as "example.com".
   * The pattern previously contained a stray space before the TLD group, so no
   * real hostname could ever match.
   */
  isHostName(value) {
    if (!value) return false;
    if (value.length > 255) return false;
    const normalized = value.trim().toLowerCase();
    return HOSTNAME_PATTERN.test(normalized) && !normalized.startsWith("-") && !normalized.endsWith("-");
  }

  /** True for http/https URLs whose hostname matches HOSTNAME_PATTERN. */
  isValidUrl(value) {
    if (!value) return false;
    const trimmed = value.trim();
    try {
      const parsed = new URL(trimmed);
      if (!parsed.protocol || !["http:", "https:"].includes(parsed.protocol.toLowerCase())) return false;
      if (!parsed.hostname) return false;
      return HOSTNAME_PATTERN.test(parsed.hostname);
    } catch {
      return false;
    }
  }

  /** Uniformly random entry from the pre-generated User-Agent pool. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

/**
 * Builds a pool of 200 User-Agent strings by pairing a random browser profile
 * with a random platform token.
 */
function generateUserAgents() {
  const browsers = [
    { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
    { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
    { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
    { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
    { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
    { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
    { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
    { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
    { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
    { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" }
  ];
  const platforms = [
    "Windows NT 10.0",
    "Windows NT 6.1",
    "Macintosh; Intel Mac OS X 10_15_7",
    "Macintosh; Intel Mac OS X 11_4_0",
    "X11; Linux x86_64",
    "X11; Ubuntu; Linux x86_64"
  ];
  // One formatter per browser name: (platform, engine, version) -> UA string.
  const formatters = {
    Chrome: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36`,
    Firefox: (p, e, v) => `Mozilla/5.0 (${p}; rv:${v}) ${e} Firefox/${v}`,
    Safari: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Version/${v} Safari/605.1.15`,
    Edge: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Edg/${v}`,
    Opera: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 OPR/${v}`,
    Vivaldi: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Vivaldi/${v}`,
    Brave: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Brave/${v}`,
    Chromium: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chromium/${v} Chrome/${v} Safari/537.36`,
    Yandex: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} YaBrowser/${v} Safari/537.36`,
    Maxthon: (p, e, v) => `Mozilla/5.0 (${p}) ${e} (KHTML, like Gecko) Chrome/${v} Safari/537.36 Maxthon/${v}`
  };
  const pool = [];
  for (let n = 0; n < 200; n++) {
    const browser = browsers[Math.floor(Math.random() * browsers.length)];
    const platform = platforms[Math.floor(Math.random() * platforms.length)];
    pool.push(formatters[browser.name](platform, browser.engine, browser.version));
  }
  return pool;
}

export { CrawlerOptions };
|
package/dist/crawler/crawler.cjs
CHANGED
|
@@ -85,6 +85,7 @@ class Crawler {
|
|
|
85
85
|
startHandlers = [];
|
|
86
86
|
finishHandlers = [];
|
|
87
87
|
redirectHandlers = [];
|
|
88
|
+
queueChangeHandlers = [];
|
|
88
89
|
collectedData = new CappedArray({
|
|
89
90
|
maxSize: 1e5,
|
|
90
91
|
evictionRatio: 0.1,
|
|
@@ -100,12 +101,22 @@ class Crawler {
|
|
|
100
101
|
this.adapterType = this.config.adapter;
|
|
101
102
|
const concurrency = this.config.concurrency;
|
|
102
103
|
this.queue = new RezoQueue({
|
|
104
|
+
name: "crawler",
|
|
103
105
|
concurrency
|
|
104
106
|
});
|
|
105
107
|
this.originalConcurrency = concurrency;
|
|
106
108
|
this.scraperQueue = new RezoQueue({
|
|
109
|
+
name: "scraper",
|
|
107
110
|
concurrency: this.config.scraperConcurrency
|
|
108
111
|
});
|
|
112
|
+
this._subscribeToQueueEvents(this.queue, "crawler");
|
|
113
|
+
this._subscribeToQueueEvents(this.scraperQueue, "scraper");
|
|
114
|
+
this.config.onLimiterAdded = (queue) => {
|
|
115
|
+
if (!this.subscribedLimiterQueues.has(queue)) {
|
|
116
|
+
this._subscribeToQueueEvents(queue, "limiter");
|
|
117
|
+
this.subscribedLimiterQueues.add(queue);
|
|
118
|
+
}
|
|
119
|
+
};
|
|
109
120
|
this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
|
|
110
121
|
this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
|
|
111
122
|
const enableCache = this.config.enableCache;
|
|
@@ -596,6 +607,11 @@ class Crawler {
|
|
|
596
607
|
this.redirectHandlers.push(handler);
|
|
597
608
|
return this;
|
|
598
609
|
}
|
|
610
|
+
onQueueChange(handler) {
|
|
611
|
+
this.queueChangeHandlers.push(handler);
|
|
612
|
+
this._subscribeToLimiterQueues();
|
|
613
|
+
return this;
|
|
614
|
+
}
|
|
599
615
|
onRawData(handler) {
|
|
600
616
|
this.rawResponseEvents.push({
|
|
601
617
|
handler: "_onRawResponse",
|
|
@@ -666,6 +682,52 @@ class Crawler {
|
|
|
666
682
|
});
|
|
667
683
|
return this;
|
|
668
684
|
}
|
|
685
|
+
subscribedLimiterQueues = new Set;
|
|
686
|
+
_subscribeToQueueEvents(queue, queueType) {
|
|
687
|
+
const emitEvent = (event, taskId) => {
|
|
688
|
+
if (this.queueChangeHandlers.length === 0)
|
|
689
|
+
return;
|
|
690
|
+
const state = queue.state;
|
|
691
|
+
const queueChangeEvent = {
|
|
692
|
+
queueName: queue.name,
|
|
693
|
+
queueType,
|
|
694
|
+
event,
|
|
695
|
+
pending: state.pending,
|
|
696
|
+
size: state.size,
|
|
697
|
+
total: state.total,
|
|
698
|
+
isPaused: state.isPaused,
|
|
699
|
+
isIdle: state.isIdle,
|
|
700
|
+
taskId
|
|
701
|
+
};
|
|
702
|
+
for (const handler of this.queueChangeHandlers) {
|
|
703
|
+
try {
|
|
704
|
+
handler(queueChangeEvent);
|
|
705
|
+
} catch (err) {
|
|
706
|
+
if (this.config.debug)
|
|
707
|
+
console.error("[Crawler] onQueueChange handler error:", err);
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
};
|
|
711
|
+
queue.on("add", (data) => emitEvent("add", data.id));
|
|
712
|
+
queue.on("start", (data) => emitEvent("start", data.id));
|
|
713
|
+
queue.on("completed", (data) => emitEvent("completed", data.id));
|
|
714
|
+
queue.on("error", (data) => emitEvent("error", data.id));
|
|
715
|
+
queue.on("timeout", (data) => emitEvent("timeout", data.id));
|
|
716
|
+
queue.on("cancelled", (data) => emitEvent("cancelled", data.id));
|
|
717
|
+
queue.on("idle", () => emitEvent("idle"));
|
|
718
|
+
queue.on("active", () => emitEvent("active"));
|
|
719
|
+
queue.on("paused", () => emitEvent("paused"));
|
|
720
|
+
queue.on("resumed", () => emitEvent("resumed"));
|
|
721
|
+
}
|
|
722
|
+
_subscribeToLimiterQueues() {
|
|
723
|
+
const limiters = this.config.getLimiters();
|
|
724
|
+
for (const limiter of limiters) {
|
|
725
|
+
if (!this.subscribedLimiterQueues.has(limiter.pqueue)) {
|
|
726
|
+
this._subscribeToQueueEvents(limiter.pqueue, "limiter");
|
|
727
|
+
this.subscribedLimiterQueues.add(limiter.pqueue);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
}
|
|
669
731
|
_onBody(handler, document) {
|
|
670
732
|
this.queue.add(() => handler(document.body));
|
|
671
733
|
}
|
|
@@ -1119,7 +1181,16 @@ class Crawler {
|
|
|
1119
1181
|
}
|
|
1120
1182
|
this.crawlStats.urlsQueued++;
|
|
1121
1183
|
const domain = new URL(url).hostname;
|
|
1122
|
-
const
|
|
1184
|
+
const limiterRandomDelay = this.config.getRandomDelay(url, true);
|
|
1185
|
+
let delay = 0;
|
|
1186
|
+
if (limiterRandomDelay !== undefined && limiterRandomDelay > 0) {
|
|
1187
|
+
delay = Math.floor(Math.random() * limiterRandomDelay);
|
|
1188
|
+
if (this.config.debug) {
|
|
1189
|
+
console.log(`[RandomDelay] ${domain}: ${delay}ms (max: ${limiterRandomDelay}ms)`);
|
|
1190
|
+
}
|
|
1191
|
+
} else {
|
|
1192
|
+
delay = this.getAutoThrottleDelay(domain);
|
|
1193
|
+
}
|
|
1123
1194
|
if (delay > 0) {
|
|
1124
1195
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
1125
1196
|
}
|
package/dist/crawler/crawler.js
CHANGED
|
@@ -85,6 +85,7 @@ export class Crawler {
|
|
|
85
85
|
startHandlers = [];
|
|
86
86
|
finishHandlers = [];
|
|
87
87
|
redirectHandlers = [];
|
|
88
|
+
queueChangeHandlers = [];
|
|
88
89
|
collectedData = new CappedArray({
|
|
89
90
|
maxSize: 1e5,
|
|
90
91
|
evictionRatio: 0.1,
|
|
@@ -100,12 +101,22 @@ export class Crawler {
|
|
|
100
101
|
this.adapterType = this.config.adapter;
|
|
101
102
|
const concurrency = this.config.concurrency;
|
|
102
103
|
this.queue = new RezoQueue({
|
|
104
|
+
name: "crawler",
|
|
103
105
|
concurrency
|
|
104
106
|
});
|
|
105
107
|
this.originalConcurrency = concurrency;
|
|
106
108
|
this.scraperQueue = new RezoQueue({
|
|
109
|
+
name: "scraper",
|
|
107
110
|
concurrency: this.config.scraperConcurrency
|
|
108
111
|
});
|
|
112
|
+
this._subscribeToQueueEvents(this.queue, "crawler");
|
|
113
|
+
this._subscribeToQueueEvents(this.scraperQueue, "scraper");
|
|
114
|
+
this.config.onLimiterAdded = (queue) => {
|
|
115
|
+
if (!this.subscribedLimiterQueues.has(queue)) {
|
|
116
|
+
this._subscribeToQueueEvents(queue, "limiter");
|
|
117
|
+
this.subscribedLimiterQueues.add(queue);
|
|
118
|
+
}
|
|
119
|
+
};
|
|
109
120
|
this.memoryMonitor = new MemoryMonitor({ warningRatio: 0.7, criticalRatio: 0.85 });
|
|
110
121
|
this.healthMetrics = new HealthMetrics({ windowSize: 60000 });
|
|
111
122
|
const enableCache = this.config.enableCache;
|
|
@@ -596,6 +607,11 @@ export class Crawler {
|
|
|
596
607
|
this.redirectHandlers.push(handler);
|
|
597
608
|
return this;
|
|
598
609
|
}
|
|
610
|
+
onQueueChange(handler) {
|
|
611
|
+
this.queueChangeHandlers.push(handler);
|
|
612
|
+
this._subscribeToLimiterQueues();
|
|
613
|
+
return this;
|
|
614
|
+
}
|
|
599
615
|
onRawData(handler) {
|
|
600
616
|
this.rawResponseEvents.push({
|
|
601
617
|
handler: "_onRawResponse",
|
|
@@ -666,6 +682,52 @@ export class Crawler {
|
|
|
666
682
|
});
|
|
667
683
|
return this;
|
|
668
684
|
}
|
|
685
|
+
// Limiter queues that already have queue-change listeners attached; consulted
// before subscribing so the same queue is never wired up twice.
subscribedLimiterQueues = new Set();
|
|
686
|
+
_subscribeToQueueEvents(queue, queueType) {
|
|
687
|
+
const emitEvent = (event, taskId) => {
|
|
688
|
+
if (this.queueChangeHandlers.length === 0)
|
|
689
|
+
return;
|
|
690
|
+
const state = queue.state;
|
|
691
|
+
const queueChangeEvent = {
|
|
692
|
+
queueName: queue.name,
|
|
693
|
+
queueType,
|
|
694
|
+
event,
|
|
695
|
+
pending: state.pending,
|
|
696
|
+
size: state.size,
|
|
697
|
+
total: state.total,
|
|
698
|
+
isPaused: state.isPaused,
|
|
699
|
+
isIdle: state.isIdle,
|
|
700
|
+
taskId
|
|
701
|
+
};
|
|
702
|
+
for (const handler of this.queueChangeHandlers) {
|
|
703
|
+
try {
|
|
704
|
+
handler(queueChangeEvent);
|
|
705
|
+
} catch (err) {
|
|
706
|
+
if (this.config.debug)
|
|
707
|
+
console.error("[Crawler] onQueueChange handler error:", err);
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
};
|
|
711
|
+
queue.on("add", (data) => emitEvent("add", data.id));
|
|
712
|
+
queue.on("start", (data) => emitEvent("start", data.id));
|
|
713
|
+
queue.on("completed", (data) => emitEvent("completed", data.id));
|
|
714
|
+
queue.on("error", (data) => emitEvent("error", data.id));
|
|
715
|
+
queue.on("timeout", (data) => emitEvent("timeout", data.id));
|
|
716
|
+
queue.on("cancelled", (data) => emitEvent("cancelled", data.id));
|
|
717
|
+
queue.on("idle", () => emitEvent("idle"));
|
|
718
|
+
queue.on("active", () => emitEvent("active"));
|
|
719
|
+
queue.on("paused", () => emitEvent("paused"));
|
|
720
|
+
queue.on("resumed", () => emitEvent("resumed"));
|
|
721
|
+
}
|
|
722
|
+
_subscribeToLimiterQueues() {
|
|
723
|
+
const limiters = this.config.getLimiters();
|
|
724
|
+
for (const limiter of limiters) {
|
|
725
|
+
if (!this.subscribedLimiterQueues.has(limiter.pqueue)) {
|
|
726
|
+
this._subscribeToQueueEvents(limiter.pqueue, "limiter");
|
|
727
|
+
this.subscribedLimiterQueues.add(limiter.pqueue);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
}
|
|
669
731
|
_onBody(handler, document) {
|
|
670
732
|
this.queue.add(() => handler(document.body));
|
|
671
733
|
}
|
|
@@ -1119,7 +1181,16 @@ export class Crawler {
|
|
|
1119
1181
|
}
|
|
1120
1182
|
this.crawlStats.urlsQueued++;
|
|
1121
1183
|
const domain = new URL(url).hostname;
|
|
1122
|
-
const
|
|
1184
|
+
const limiterRandomDelay = this.config.getRandomDelay(url, true);
|
|
1185
|
+
let delay = 0;
|
|
1186
|
+
if (limiterRandomDelay !== undefined && limiterRandomDelay > 0) {
|
|
1187
|
+
delay = Math.floor(Math.random() * limiterRandomDelay);
|
|
1188
|
+
if (this.config.debug) {
|
|
1189
|
+
console.log(`[RandomDelay] ${domain}: ${delay}ms (max: ${limiterRandomDelay}ms)`);
|
|
1190
|
+
}
|
|
1191
|
+
} else {
|
|
1192
|
+
delay = this.getAutoThrottleDelay(domain);
|
|
1193
|
+
}
|
|
1123
1194
|
if (delay > 0) {
|
|
1124
1195
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
1125
1196
|
}
|