rezo 1.0.131 → 1.0.133
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +1 -1
- package/dist/adapters/entries/fetch.d.ts +1 -1
- package/dist/adapters/entries/http.d.ts +1 -1
- package/dist/adapters/entries/http2.d.ts +1 -1
- package/dist/adapters/entries/react-native.cjs +6 -6
- package/dist/adapters/entries/react-native.d.ts +1 -1
- package/dist/adapters/entries/xhr.d.ts +1 -1
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/cookies/cookie-jar.cjs +4 -4
- package/dist/cookies/index.cjs +10 -10
- package/dist/crawler/addon/oxylabs/index.cjs +1 -1
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +24 -4
- package/dist/crawler/crawler.js +24 -4
- package/dist/crawler/index.cjs +42 -42
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler/plugin/sqlite-utils.cjs +1 -1
- package/dist/crawler/plugin/sqlite-utils.js +1 -1
- package/dist/crawler.d.ts +7 -0
- package/dist/dom/index.cjs +1 -23
- package/dist/dom/index.d.ts +3 -0
- package/dist/dom/index.js +1 -20
- package/dist/entries/crawler.cjs +6 -6
- package/dist/index.cjs +48 -48
- package/dist/index.d.ts +1 -1
- package/dist/internal/agents/index.cjs +14 -14
- package/dist/platform/browser.d.ts +1 -1
- package/dist/platform/bun.d.ts +1 -1
- package/dist/platform/deno.d.ts +1 -1
- package/dist/platform/node.d.ts +1 -1
- package/dist/platform/react-native.cjs +6 -6
- package/dist/platform/react-native.d.ts +1 -1
- package/dist/platform/worker.d.ts +1 -1
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/stealth/index.cjs +17 -17
- package/dist/stealth/profiles/index.cjs +10 -10
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/asset-extractor.cjs +187 -19
- package/dist/wget/asset-extractor.js +184 -19
- package/dist/wget/downloader.cjs +288 -12
- package/dist/wget/downloader.js +288 -12
- package/dist/wget/index.cjs +221 -52
- package/dist/wget/index.d.ts +219 -9
- package/dist/wget/index.js +170 -1
- package/dist/wget/progress.cjs +90 -7
- package/dist/wget/progress.js +90 -7
- package/dist/wget/session.cjs +122 -0
- package/dist/wget/session.js +119 -0
- package/dist/wget/types.cjs +6 -0
- package/dist/wget/types.js +6 -0
- package/dist/wget/url-filter.cjs +4 -1
- package/dist/wget/url-filter.js +4 -1
- package/package.json +23 -22
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
/**
|
|
6032
6032
|
* cURL Options Configuration
|
|
6033
6033
|
*
|
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
6032
6032
|
export declare const Cancel: typeof RezoError;
|
|
6033
6033
|
export declare const CancelToken: {
|
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
/**
|
|
6032
6032
|
* Type guard to check if an error is a RezoError instance.
|
|
6033
6033
|
*/
|
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
6032
6032
|
export declare const Cancel: typeof RezoError;
|
|
6033
6033
|
export declare const CancelToken: {
|
|
@@ -6,12 +6,12 @@ const { RezoFormData } = require('../../utils/form-data.cjs');
|
|
|
6
6
|
const { RezoCookieJar, Cookie } = require('../../cookies/cookie-jar.cjs');
|
|
7
7
|
const { createDefaultHooks, mergeHooks } = require('../../core/hooks.cjs');
|
|
8
8
|
const { VERSION } = require('../../version.cjs');
|
|
9
|
-
const
|
|
10
|
-
exports.createFetchStreamTransport =
|
|
11
|
-
exports.createExpoFileSystemAdapter =
|
|
12
|
-
exports.createReactNativeFsAdapter =
|
|
13
|
-
exports.createNetInfoProvider =
|
|
14
|
-
exports.createExpoBackgroundTaskProvider =
|
|
9
|
+
const _mod_6jezic = require('../../platform/react-native-providers.cjs');
|
|
10
|
+
exports.createFetchStreamTransport = _mod_6jezic.createFetchStreamTransport;
|
|
11
|
+
exports.createExpoFileSystemAdapter = _mod_6jezic.createExpoFileSystemAdapter;
|
|
12
|
+
exports.createReactNativeFsAdapter = _mod_6jezic.createReactNativeFsAdapter;
|
|
13
|
+
exports.createNetInfoProvider = _mod_6jezic.createNetInfoProvider;
|
|
14
|
+
exports.createExpoBackgroundTaskProvider = _mod_6jezic.createExpoBackgroundTaskProvider;;
|
|
15
15
|
exports.Rezo = Rezo;
|
|
16
16
|
exports.RezoError = RezoError;
|
|
17
17
|
exports.RezoErrorCode = RezoErrorCode;
|
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
export interface ExpoFileSystemFileLike {
|
|
6032
6032
|
uri?: string;
|
|
6033
6033
|
size?: number;
|
|
@@ -6027,7 +6027,7 @@ export interface RezoInstance extends Rezo, RezoCallable {
|
|
|
6027
6027
|
*
|
|
6028
6028
|
* IMPORTANT: Update these values when bumping package version.
|
|
6029
6029
|
*/
|
|
6030
|
-
export declare const VERSION = "1.0.131";
|
|
6030
|
+
export declare const VERSION = "1.0.133";
|
|
6031
6031
|
export declare const isRezoError: typeof RezoError.isRezoError;
|
|
6032
6032
|
export declare const Cancel: typeof RezoError;
|
|
6033
6033
|
export declare const CancelToken: {
|
package/dist/adapters/index.cjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.detectRuntime =
|
|
3
|
-
exports.getAdapterCapabilities =
|
|
4
|
-
exports.buildAdapterContext =
|
|
5
|
-
exports.getAvailableAdapters =
|
|
6
|
-
exports.selectAdapter =
|
|
1
|
+
const _mod_wl45y9 = require('./picker.cjs');
|
|
2
|
+
exports.detectRuntime = _mod_wl45y9.detectRuntime;
|
|
3
|
+
exports.getAdapterCapabilities = _mod_wl45y9.getAdapterCapabilities;
|
|
4
|
+
exports.buildAdapterContext = _mod_wl45y9.buildAdapterContext;
|
|
5
|
+
exports.getAvailableAdapters = _mod_wl45y9.getAvailableAdapters;
|
|
6
|
+
exports.selectAdapter = _mod_wl45y9.selectAdapter;;
|
package/dist/cache/index.cjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.LRUCache =
|
|
3
|
-
const
|
|
4
|
-
exports.DNSCache =
|
|
5
|
-
exports.getGlobalDNSCache =
|
|
6
|
-
exports.resetGlobalDNSCache =
|
|
7
|
-
const
|
|
8
|
-
exports.ResponseCache =
|
|
9
|
-
exports.normalizeResponseCacheConfig =
|
|
1
|
+
const _mod_pwv10q = require('./lru-cache.cjs');
|
|
2
|
+
exports.LRUCache = _mod_pwv10q.LRUCache;;
|
|
3
|
+
const _mod_r89cab = require('./dns-cache.cjs');
|
|
4
|
+
exports.DNSCache = _mod_r89cab.DNSCache;
|
|
5
|
+
exports.getGlobalDNSCache = _mod_r89cab.getGlobalDNSCache;
|
|
6
|
+
exports.resetGlobalDNSCache = _mod_r89cab.resetGlobalDNSCache;;
|
|
7
|
+
const _mod_vz9afj = require('./response-cache.cjs');
|
|
8
|
+
exports.ResponseCache = _mod_vz9afj.ResponseCache;
|
|
9
|
+
exports.normalizeResponseCacheConfig = _mod_vz9afj.normalizeResponseCacheConfig;;
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
const { CookieJar: TouchCookieJar } = require("tough-cookie");
|
|
2
2
|
const { Cookie } = require('./cookie.cjs');
|
|
3
3
|
const { requireNodeModule } = require('../utils/node-runtime.cjs');
|
|
4
|
-
const
|
|
5
|
-
exports.Cookie =
|
|
4
|
+
const _mod_egd39a = require('./cookie.cjs');
|
|
5
|
+
exports.Cookie = _mod_egd39a.Cookie;;
|
|
6
6
|
|
|
7
7
|
class RezoCookieJar extends TouchCookieJar {
|
|
8
8
|
constructor(store, options) {
|
|
@@ -481,7 +481,7 @@ class RezoCookieJar extends TouchCookieJar {
|
|
|
481
481
|
}
|
|
482
482
|
}
|
|
483
483
|
const CookieJar = exports.CookieJar = RezoCookieJar;
|
|
484
|
-
const
|
|
485
|
-
exports.Store =
|
|
484
|
+
const _mod_uokl3t = require("tough-cookie");
|
|
485
|
+
exports.Store = _mod_uokl3t.Store;;
|
|
486
486
|
|
|
487
487
|
exports.RezoCookieJar = RezoCookieJar;
|
package/dist/cookies/index.cjs
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Cookie =
|
|
3
|
-
exports.RezoCookie =
|
|
4
|
-
const
|
|
5
|
-
exports.RezoCookieStore =
|
|
6
|
-
const
|
|
7
|
-
exports.RezoCookieJar =
|
|
8
|
-
exports.CookieJar =
|
|
9
|
-
const
|
|
10
|
-
exports.Store =
|
|
1
|
+
const _mod_3anom3 = require('./cookie.cjs');
|
|
2
|
+
exports.Cookie = _mod_3anom3.Cookie;
|
|
3
|
+
exports.RezoCookie = _mod_3anom3.RezoCookie;;
|
|
4
|
+
const _mod_p2lai5 = require('./cookie-store.cjs');
|
|
5
|
+
exports.RezoCookieStore = _mod_p2lai5.RezoCookieStore;;
|
|
6
|
+
const _mod_dze7qd = require('./cookie-jar.cjs');
|
|
7
|
+
exports.RezoCookieJar = _mod_dze7qd.RezoCookieJar;
|
|
8
|
+
exports.CookieJar = _mod_dze7qd.CookieJar;;
|
|
9
|
+
const _mod_e0fpg1 = require("tough-cookie");
|
|
10
|
+
exports.Store = _mod_e0fpg1.Store;;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var{Rezo:
|
|
1
|
+
/**
 * Oxylabs addon entry point (CommonJS build).
 *
 * Exposes a small client for the Oxylabs realtime query endpoint and
 * re-exports the option constants/helpers from `./options.cjs`.
 */
const { Rezo } = require("../../../core/rezo.cjs");
const oxylabsOptions = require("./options.cjs");

// Re-export option constants and helpers so they are reachable from this entry.
exports.OXYLABS_BROWSER_TYPES = oxylabsOptions.OXYLABS_BROWSER_TYPES;
exports.OXYLABS_COMMON_LOCALES = oxylabsOptions.OXYLABS_COMMON_LOCALES;
exports.OXYLABS_COMMON_GEO_LOCATIONS = oxylabsOptions.OXYLABS_COMMON_GEO_LOCATIONS;
exports.OXYLABS_US_STATES = oxylabsOptions.OXYLABS_US_STATES;
exports.OXYLABS_EUROPEAN_COUNTRIES = oxylabsOptions.OXYLABS_EUROPEAN_COUNTRIES;
exports.OXYLABS_ASIAN_COUNTRIES = oxylabsOptions.OXYLABS_ASIAN_COUNTRIES;
exports.getRandomBrowserType = oxylabsOptions.getRandomBrowserType;
exports.getRandomLocale = oxylabsOptions.getRandomLocale;
exports.getRandomGeoLocation = oxylabsOptions.getRandomGeoLocation;

/** Endpoint every scrape request is POSTed to. */
const OXYLABS_API_URL = "https://realtime.oxylabs.io/v1/queries";

/**
 * Client that submits scrape jobs to Oxylabs and normalizes the first result.
 */
class Oxylabs {
  config;
  http;
  authHeader;

  /**
   * @param {object} config - Client configuration. `username` and `password`
   *   are mandatory; `browserType`, `locale`, `geoLocation`, `render`,
   *   `context` and `timeout` fall back to the defaults below.
   * @throws {Error} When the config object or either credential is missing.
   */
  constructor(config) {
    // Optional chaining turns a missing config object into the descriptive
    // error below instead of an opaque TypeError.
    if (!config?.username || !config?.password) {
      throw new Error("Oxylabs username and password are required");
    }

    this.config = {
      username: config.username,
      password: config.password,
      browserType: config.browserType ?? "desktop",
      locale: config.locale ?? "en-US",
      geoLocation: config.geoLocation ?? "",
      render: config.render ?? false,
      context: config.context ?? {},
      timeout: config.timeout ?? 120000,
    };
    this.http = new Rezo({ baseURL: OXYLABS_API_URL, timeout: this.config.timeout });

    const credentials = Buffer.from(`${config.username}:${config.password}`).toString("base64");
    this.authHeader = `Basic ${credentials}`;
  }

  /**
   * Scrapes a single URL.
   *
   * @param {string} url - Target URL.
   * @param {object} [overrides] - Per-request options merged over the client config.
   * @returns {Promise<object>} Normalized view of the first result plus the raw payload.
   * @throws {Error} When the API reports an error or returns no results.
   */
  async scrape(url, overrides) {
    const options = { ...this.config, ...overrides };
    const body = this.buildRequestBody(url, options);
    const response = await this.http.postJson(OXYLABS_API_URL, body, {
      headers: {
        Authorization: this.authHeader,
        "Content-Type": "application/json",
      },
    });
    const payload = response.data;

    if (payload.error) {
      throw new Error(`Oxylabs API error: ${payload.error}`);
    }
    if (!payload.results || payload.results.length === 0) {
      throw new Error("Oxylabs API returned no results");
    }

    const first = payload.results[0];
    return {
      statusCode: first.status_code,
      url: first.url,
      content: first.content,
      cookies: first._response?.cookies || [],
      headers: first._response?.headers || {},
      jobId: payload.job_id || first.job_id,
      rendered: options.render,
      geoLocation: options.geoLocation || undefined,
      locale: options.locale,
      browserType: options.browserType,
      raw: payload,
    };
  }

  /**
   * Scrapes several URLs one after another.
   *
   * @param {string[]} urls - URLs to scrape, in order.
   * @param {object} [overrides] - Per-request options applied to every URL.
   * @param {number} [delayMs=1000] - Pause between consecutive requests; `0` disables it.
   * @returns {Promise<object[]>} Results in the same order as `urls`.
   */
  async scrapeMany(urls, overrides, delayMs = 1000) {
    const results = [];
    for (let index = 0; index < urls.length; index++) {
      results.push(await this.scrape(urls[index], overrides));
      // No pause after the final URL.
      if (index < urls.length - 1 && delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
    }
    return results;
  }

  /**
   * Builds the JSON body for one scrape request.
   *
   * @param {string} url - Target URL.
   * @param {object} options - Effective (merged) request options.
   * @returns {object} Request body with `source`, `url`, `context` and optional fields.
   */
  buildRequestBody(url, options) {
    const context = [
      { key: "return_page_cookies", value: true },
      { key: "return_page_headers", value: true },
    ];

    if (options.context) {
      for (const [key, value] of Object.entries(options.context)) {
        context.push({ key, value });
      }
    }
    if (options.headers && Object.keys(options.headers).length > 0) {
      context.push({ key: "headers", value: options.headers });
    }
    if (options.cookies && options.cookies.length > 0) {
      context.push({ key: "cookies", value: options.cookies });
    }
    if (options.session_id) {
      context.push({ key: "session_id", value: options.session_id });
    }
    if (options.http_method) {
      context.push({ key: "http_method", value: options.http_method });
    }
    if (options.base64Body) {
      context.push({ key: "content", value: options.base64Body });
    }
    if (typeof options.follow_redirects === "boolean") {
      context.push({ key: "follow_redirects", value: options.follow_redirects });
    }
    if (options.successful_status_codes && options.successful_status_codes.length > 0) {
      context.push({ key: "successful_status_codes", value: options.successful_status_codes });
    }

    const body = { source: "universal", url, context };
    if (options.render || options.javascript_rendering) {
      body.render = "html";
    }
    if (options.browserType) {
      body.user_agent_type = options.browserType;
    }
    if (options.locale) {
      body.locale = options.locale;
    }
    if (options.geoLocation) {
      body.geo_location = options.geoLocation;
    }
    if (options.returnAsBase64) {
      body.content_encoding = "base64";
    }
    return body;
  }

  /**
   * @returns {object} Copy of the client config with the password masked.
   */
  getConfig() {
    return { ...this.config, password: "***" };
  }

  /**
   * @param {object} overrides - Config values to replace.
   * @returns {Oxylabs} New client built from the current config plus `overrides`.
   */
  withConfig(overrides) {
    return new Oxylabs({ ...this.config, ...overrides });
  }

  /**
   * Performs a probe scrape to verify that the credentials work.
   *
   * @returns {Promise<boolean>} `true` when the probe succeeds.
   * @throws {Error} Wrapping the underlying failure (available as `cause`).
   */
  async testConnection() {
    try {
      await this.scrape("https://httpbin.org/ip");
      return true;
    } catch (error) {
      // Keep the original failure attached so callers can inspect it.
      const reason = error?.message ?? String(error);
      throw new Error(`Oxylabs connection test failed: ${reason}`, { cause: error });
    }
  }
}

exports.Oxylabs = Oxylabs;
exports.default = Oxylabs;
// The class itself is the module export, with the named exports attached to it.
module.exports = Object.assign(Oxylabs, exports);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
/**
 * Crawler configuration container (CommonJS build).
 *
 * Holds scalar crawler settings plus per-domain / global lists of headers,
 * proxies, limiters, Oxylabs/Decodo providers and stealth profiles, and
 * offers lookups that resolve the entry applicable to a URL.
 */
const { RezoQueue } = require("../queue/queue.cjs");
const { Oxylabs } = require("./addon/oxylabs/index.cjs");
const path = require("node:path");
const os = require("node:os");
const { Decodo } = require("./addon/decodo/index.cjs");
const { Rezo } = require("../core/rezo.cjs");

/** Dotted host name ending in an alphabetic TLD of at least two letters. */
const HOSTNAME_PATTERN = /^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/;

/** Browser descriptors used to synthesize User-Agent strings. */
const USER_AGENT_BROWSERS = [
  { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
  { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
  { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
  { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
  { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
  { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
  { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
  { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
  { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
  { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" },
];

/** Platform tokens used to synthesize User-Agent strings. */
const USER_AGENT_PLATFORMS = [
  "Windows NT 10.0",
  "Windows NT 6.1",
  "Macintosh; Intel Mac OS X 10_15_7",
  "Macintosh; Intel Mac OS X 11_4_0",
  "X11; Linux x86_64",
  "X11; Ubuntu; Linux x86_64",
];

/** Per-browser User-Agent templates, keyed by browser name. */
const USER_AGENT_FORMATTERS = {
  Chrome: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36`,
  Firefox: (platform, b) => `Mozilla/5.0 (${platform}; rv:${b.version}) ${b.engine} Firefox/${b.version}`,
  Safari: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Version/${b.version} Safari/605.1.15`,
  Edge: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Edg/${b.version}`,
  Opera: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 OPR/${b.version}`,
  Vivaldi: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Vivaldi/${b.version}`,
  Brave: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Brave/${b.version}`,
  Chromium: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chromium/${b.version} Chrome/${b.version} Safari/537.36`,
  Yandex: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} YaBrowser/${b.version} Safari/537.36`,
  Maxthon: (platform, b) => `Mozilla/5.0 (${platform}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Maxthon/${b.version}`,
};

/**
 * Builds a pool of 200 randomly combined browser/platform User-Agent strings.
 *
 * @returns {string[]} Generated User-Agent strings (duplicates are possible).
 */
function generateUserAgents() {
  const agents = [];
  for (let i = 0; i < 200; i++) {
    const browser = USER_AGENT_BROWSERS[Math.floor(Math.random() * USER_AGENT_BROWSERS.length)];
    const platform = USER_AGENT_PLATFORMS[Math.floor(Math.random() * USER_AGENT_PLATFORMS.length)];
    const format = USER_AGENT_FORMATTERS[browser.name];
    agents.push(format ? format(platform, browser) : "");
  }
  return agents;
}

/**
 * Maps a configuration type name to the matching list on an options instance.
 * Any unrecognized type resolves to the proxy list.
 *
 * @param {CrawlerOptions} options - Instance holding the lists.
 * @param {string} type - "headers" | "limiters" | "oxylabs" | "decodo" | "stealth" | other.
 * @returns {object[]} The matching configuration list.
 */
function configsByType(options, type) {
  switch (type) {
    case "headers":
      return options.requestHeaders;
    case "limiters":
      return options.limiters;
    case "oxylabs":
      return options.oxylabs;
    case "decodo":
      return options.decodo;
    case "stealth":
      return options.stealthProfiles;
    default:
      return options.proxies;
  }
}

class CrawlerOptions {
  /** Calls `destroy()` on a queue when it exposes one. */
  static destroyQueue(queue) {
    if (queue && typeof queue.destroy === "function") {
      queue.destroy();
    }
  }

  /** Destroys the `pqueue` of every entry in `configs`. */
  static destroyConfigQueues(configs) {
    for (const config of configs) {
      CrawlerOptions.destroyQueue(config.pqueue);
    }
  }

  /**
   * Partitions `configs` by `shouldRemove`.
   *
   * @returns {{kept: object[], removed: object[]}}
   */
  static splitConfigs(configs, shouldRemove) {
    const kept = [];
    const removed = [];
    for (const config of configs) {
      (shouldRemove(config) ? removed : kept).push(config);
    }
    return { kept, removed };
  }

  /**
   * @param {string} url - URL to parse.
   * @returns {string|undefined} Host name, or `undefined` when `url` is unparsable or has none.
   */
  static getHostname(url) {
    try {
      return new URL(url).hostname || undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Builds a constructor-options object with auto-throttle disabled and a
   * global limiter plus a limiter for the base URL's domain.
   *
   * @param {object} options - `baseUrl` plus optional overrides for each default below.
   * @returns {object} Options object; `overrides` wins over the defaults, `baseUrl` always wins.
   */
  static createStableThroughputOptions(options) {
    const {
      baseUrl,
      concurrency = 40,
      scraperConcurrency = 10,
      retryDelay = 1000,
      maxRetryAttempts = 2,
      retryOnStatusCode = [408, 500, 502, 503, 504],
      maxWaitOn429 = 15000,
      alwaysWaitOn429 = false,
      globalLimiter = { concurrency: 8 },
      domain = CrawlerOptions.getHostname(baseUrl),
      domainLimiter = { concurrency: 2, interval: 1000, intervalCap: 2, randomDelay: 150 },
      domainRetry = { enable: true, max429Retries: 2, retryDelay: 1000, maxRetryAttempts: 2, backoff: true },
      extraLimiters = [],
      overrides = {},
    } = options;

    const limiters = [];
    if (globalLimiter) {
      limiters.push({ isGlobal: true, options: globalLimiter });
    }
    if (domain && domainLimiter) {
      limiters.push({ domain, options: domainLimiter, retry: domainRetry || undefined });
    }
    limiters.push(...extraLimiters);

    const defaults = {
      baseUrl,
      autoThrottle: false,
      concurrency,
      scraperConcurrency,
      retryDelay,
      maxRetryAttempts,
      retryOnStatusCode,
      maxWaitOn429,
      alwaysWaitOn429,
      limiter: limiters.length > 0 ? { enable: true, limiters } : false,
    };
    return { ...defaults, ...overrides, baseUrl };
  }

  baseUrl;
  adapter;
  enableNavigationHistory;
  sessionId;
  rejectUnauthorized;
  userAgent;
  useRndUserAgent;
  timeout;
  maxRedirects;
  maxRetryAttempts;
  retryDelay;
  retryOnStatusCode;
  forceRevisit;
  retryWithoutProxyOnStatusCode;
  retryOnProxyError;
  maxRetryOnProxyError;
  allowRevisiting;
  enableCache;
  cacheTTL;
  cacheDir;
  throwFatalError;
  debug;
  enableSignalHandlers;
  concurrency;
  scraperConcurrency;
  maxDepth;
  maxUrls;
  maxResponseSize;
  respectRobotsTxt;
  followNofollow;
  autoThrottle;
  autoThrottleTargetDelay;
  autoThrottleMinDelay;
  autoThrottleMaxDelay;
  maxWaitOn429;
  alwaysWaitOn429;
  stealth;
  oxylabs = [];
  decodo = [];
  stealthProfiles = [];
  proxies = [];
  limiters = [];
  onLimiterAdded;
  onProviderAdded;
  requestHeaders = [];
  userAgents = generateUserAgents();

  /**
   * @param {object} [options] - Crawler settings; every field is optional and
   *   falls back to the default visible in the assignments below.
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || "";
    this.adapter = options.adapter ?? "http";
    this.enableNavigationHistory = options.enableNavigationHistory ?? false;
    this.sessionId = options.sessionId ?? `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    this.rejectUnauthorized = options.rejectUnauthorized ?? true;
    this.userAgent = options.userAgent;
    this.useRndUserAgent = options.useRndUserAgent ?? false;
    this.timeout = options.timeout ?? 30000;
    this.maxRedirects = options.maxRedirects ?? 10;
    this.maxRetryAttempts = options.maxRetryAttempts ?? 3;
    this.retryDelay = options.retryDelay ?? 0;
    this.retryOnStatusCode = options.retryOnStatusCode ?? [408, 429, 500, 502, 503, 504];
    this.forceRevisit = options.forceRevisit ?? false;
    this.retryWithoutProxyOnStatusCode = options.retryWithoutProxyOnStatusCode ?? [407, 403];
    this.retryOnProxyError = options.retryOnProxyError ?? true;
    this.maxRetryOnProxyError = options.maxRetryOnProxyError ?? 3;
    this.allowRevisiting = options.allowRevisiting ?? false;
    this.enableCache = options.enableCache ?? true;
    this.cacheTTL = options.cacheTTL ?? 604800000;
    this.cacheDir = options.cacheDir ?? path.join(os.tmpdir(), "rezo_cache");
    this.throwFatalError = options.throwFatalError ?? false;
    this.debug = options.debug ?? false;
    this.enableSignalHandlers = options.enableSignalHandlers ?? false;
    this.concurrency = options.concurrency ?? 100;
    this.scraperConcurrency = options.scraperConcurrency ?? this.concurrency;
    this.maxDepth = options.maxDepth ?? 0;
    this.maxUrls = options.maxUrls ?? 0;
    this.maxResponseSize = options.maxResponseSize ?? 0;
    this.respectRobotsTxt = options.respectRobotsTxt ?? false;
    this.followNofollow = options.followNofollow ?? false;
    this.autoThrottle = options.autoThrottle ?? true;
    this.autoThrottleTargetDelay = options.autoThrottleTargetDelay ?? 1000;
    this.autoThrottleMinDelay = options.autoThrottleMinDelay ?? 100;
    this.autoThrottleMaxDelay = options.autoThrottleMaxDelay ?? 60000;
    this.maxWaitOn429 = options.maxWaitOn429 ?? 1800000;
    this.alwaysWaitOn429 = options.alwaysWaitOn429 ?? false;
    this._addHeaders(options.headers);
    this._addOxylabs(options.oxylabs);
    this._addDecodo(options.decodo);
    this._addProxies(options.proxy);
    this._addLimiters(options.limiter);
    this._addStealths(options.stealths);
    this.stealth = options.stealth;
  }

  /**
   * @param {string} type - Configuration type (see `configsByType`).
   * @returns {Array} Distinct `domain` values configured for that type.
   */
  getConfiguredDomains(type) {
    return configsByType(this, type)
      .filter((config) => config.domain)
      .map((config) => config.domain)
      .filter((domain, index, all) => all.indexOf(domain) === index);
  }

  /**
   * Removes every entry whose `domain` equals `domain` from all lists and
   * destroys the queues of removed limiter/provider entries.
   *
   * @returns {this}
   */
  removeDomain(domain) {
    const matches = (config) => !!config.domain && this._domainsEqual(config.domain, domain);
    const keep = (config) => !matches(config);

    this.requestHeaders = this.requestHeaders.filter(keep);
    this.proxies = this.proxies.filter(keep);

    for (const listName of ["limiters", "oxylabs", "decodo"]) {
      const { kept, removed } = CrawlerOptions.splitConfigs(this[listName], matches);
      CrawlerOptions.destroyConfigQueues(removed);
      this[listName] = kept;
    }

    this.stealthProfiles = this.stealthProfiles.filter(keep);
    return this;
  }

  /** Strict equality, or element-wise strict equality when both values are arrays. */
  _domainsEqual(a, b) {
    if (Array.isArray(a) && Array.isArray(b)) {
      return a.length === b.length && a.every((value, index) => value === b[index]);
    }
    return a === b;
  }

  /**
   * @returns {object} Per-type counts (`total`, `global`, `domainSpecific`) for
   *   headers, proxies, limiters, oxylabs and decodo.
   */
  getConfigurationSummary() {
    const summarize = (configs) => ({
      total: configs.length,
      global: configs.filter((config) => config.isGlobal).length,
      domainSpecific: configs.filter((config) => !config.isGlobal && config.domain).length,
    });
    return {
      headers: summarize(this.requestHeaders),
      proxies: summarize(this.proxies),
      limiters: summarize(this.limiters),
      oxylabs: summarize(this.oxylabs),
      decodo: summarize(this.decodo),
    };
  }

  /** Registers header entries; entries without a target or without headers are skipped. */
  _addHeaders(config) {
    if (!config || !config.enable) return;
    for (const entry of config.httpHeaders ?? []) {
      const { domain, isGlobal } = entry;
      let { headers } = entry;
      if (!domain && !isGlobal) continue;
      if (headers instanceof Headers) {
        headers = Object.fromEntries(headers.entries());
      }
      if (!headers || Object.keys(headers).length < 1) continue;
      this.requestHeaders.push({ domain, isGlobal, headers });
    }
  }

  /** Registers proxy entries; entries without a target or without a proxy are skipped. */
  _addProxies(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, proxy, rotating } of config.proxies ?? []) {
      if (!domain && !isGlobal) continue;
      if (!proxy || Object.keys(proxy).length < 1) continue;
      this.proxies.push({ domain, isGlobal, proxy, rotating: !!rotating });
    }
  }

  /** Registers limiter entries, creating one queue per entry and notifying `onLimiterAdded`. */
  _addLimiters(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options, retry } of config.limiters ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options && !retry) continue;
      // A retry-only entry still gets a queue, created with a fixed name.
      const pqueue = options ? new RezoQueue(options) : new RezoQueue({ name: "limiter" });
      this.limiters.push({ domain, isGlobal, pqueue, randomDelay: options?.randomDelay, retry });
      if (this.onLimiterAdded) {
        this.onLimiterAdded(pqueue);
      }
    }
  }

  /** Shared implementation for the Oxylabs/Decodo provider lists. */
  _addProviders(config, target, Provider) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options, queueOptions } of config.labs ?? []) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      const pqueue = queueOptions ? new RezoQueue(queueOptions) : undefined;
      target.push({ domain, isGlobal, adaptar: new Provider(options), pqueue });
      if (pqueue && this.onProviderAdded) {
        this.onProviderAdded(pqueue);
      }
    }
  }

  /** Registers Oxylabs provider entries. */
  _addOxylabs(config) {
    this._addProviders(config, this.oxylabs, Oxylabs);
  }

  /** Registers Decodo provider entries. */
  _addDecodo(config) {
    this._addProviders(config, this.decodo, Decodo);
  }

  /** Registers stealth profiles, each backed by its own `Rezo` instance. */
  _addStealths(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, stealth } of config.profiles ?? []) {
      if (!domain && !isGlobal) continue;
      if (!stealth) continue;
      this.stealthProfiles.push({ domain, isGlobal, adaptar: new Rezo({ stealth }) });
    }
  }

  /** @returns {this} */
  addHeaders(entry) {
    this._addHeaders({ enable: true, httpHeaders: [entry] });
    return this;
  }

  /** @returns {this} */
  addProxy(entry) {
    this._addProxies({ enable: true, proxies: [entry] });
    return this;
  }

  /** @returns {this} */
  addProxies(entries) {
    this._addProxies({ enable: true, proxies: entries });
    return this;
  }

  /**
   * @returns {{proxy: object, rotating: boolean}|null} Proxy entry applicable to `url`.
   */
  getProxyConfig(url, useGlobal, random) {
    const entry = this.selectConfigEntry(url, this.proxies, useGlobal, random);
    if (!entry) return null;
    return { proxy: entry.proxy, rotating: !!entry.rotating };
  }

  /** @returns {this} */
  addLimiter(entry) {
    this._addLimiters({ enable: true, limiters: [entry] });
    return this;
  }

  /** @returns {this} */
  addOxylabs(entry) {
    this._addOxylabs({ enable: true, labs: [entry] });
    return this;
  }

  /** @returns {this} */
  addDecodo(entry) {
    this._addDecodo({ enable: true, labs: [entry] });
    return this;
  }

  /** @returns {this} */
  addStealth(entry) {
    this._addStealths({ enable: true, profiles: [entry] });
    return this;
  }

  /** Destroys every limiter queue and empties the limiter list. */
  destroyLimiters() {
    for (const limiter of this.limiters) {
      CrawlerOptions.destroyQueue(limiter.pqueue);
    }
    this.limiters = [];
  }

  /** Destroys and detaches the queues of all Oxylabs/Decodo entries (entries stay registered). */
  destroyProviderQueues() {
    for (const provider of [...this.oxylabs, ...this.decodo]) {
      CrawlerOptions.destroyQueue(provider.pqueue);
      delete provider.pqueue;
    }
  }

  getLimiters() {
    return this.limiters;
  }

  /**
   * @returns {number|undefined} `randomDelay` of the first matching domain
   *   limiter, else (when `useGlobal`) of the first global limiter defining one.
   */
  getRandomDelay(url, useGlobal) {
    if (!this.getDomainName(url)) return undefined;
    for (const limiter of this.limiters) {
      if (this._hasDomain(url, limiter.domain) && limiter.randomDelay !== undefined) {
        return limiter.randomDelay;
      }
    }
    if (useGlobal) {
      for (const limiter of this.limiters) {
        if (limiter.isGlobal && limiter.randomDelay !== undefined) {
          return limiter.randomDelay;
        }
      }
    }
    return undefined;
  }

  /**
   * Drops every global entry from all lists, destroying removed queues.
   *
   * @returns {this}
   */
  clearGlobalConfigs() {
    const isGlobal = (config) => !!config.isGlobal;
    const notGlobal = (config) => !config.isGlobal;

    if (Array.isArray(this.requestHeaders)) {
      this.requestHeaders = this.requestHeaders.filter(notGlobal);
    }
    for (const listName of ["oxylabs", "decodo", "limiters"]) {
      if (!Array.isArray(this[listName])) continue;
      const { kept, removed } = CrawlerOptions.splitConfigs(this[listName], isGlobal);
      CrawlerOptions.destroyConfigQueues(removed);
      this[listName] = kept;
    }
    if (Array.isArray(this.proxies)) {
      this.proxies = this.proxies.filter(notGlobal);
    }
    if (Array.isArray(this.stealthProfiles)) {
      this.stealthProfiles = this.stealthProfiles.filter(notGlobal);
    }
    return this;
  }

  /**
   * Resolves the payload of the entry applicable to `url` for the given type:
   * headers object, limiter queue, provider/stealth adapter, or proxy.
   *
   * @returns {*} The payload, or `null` when nothing applies.
   */
  getAdapter(url, type, useGlobal, random) {
    if (!this.getDomainName(url)) return null;
    const entry = this.selectConfigEntry(url, configsByType(this, type), useGlobal, random);
    if (!entry) return null;
    switch (type) {
      case "headers":
        return entry.headers;
      case "limiters":
        return entry.pqueue;
      case "oxylabs":
      case "decodo":
      case "stealth":
        return entry.adaptar;
      default:
        return entry.proxy;
    }
  }

  /**
   * @returns {object|null} Retry options of the first limiter matching `url`
   *   (even if that limiter has none), else of the first global limiter with retry options.
   */
  getRetryOptions(url) {
    if (!this.getDomainName(url)) return null;
    for (const limiter of this.limiters) {
      if (this._hasDomain(url, limiter.domain)) {
        return limiter.retry || null;
      }
    }
    for (const limiter of this.limiters) {
      if (limiter.isGlobal && limiter.retry) {
        return limiter.retry;
      }
    }
    return null;
  }

  /** @returns {number} Random integer in the inclusive range [`min`, `max`]. */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /**
   * @returns {boolean} Whether a domain entry of `type` matches `url`, or
   *   (when `useGlobal`) whether any global entry of that type exists.
   */
  hasDomain(url, type, useGlobal) {
    if (!this.getDomainName(url)) return false;
    const configs = configsByType(this, type);
    if (configs.some((config) => this._hasDomain(url, config.domain))) return true;
    return useGlobal ? configs.some((config) => config.isGlobal) : false;
  }

  /**
   * Builds the header map for a request.
   *
   * @param {string} url - Request URL used to look up configured headers.
   * @param {boolean} [useGlobal] - Allow falling back to global header entries.
   * @param {Headers|object} [extraHeaders] - Headers layered over the configured ones;
   *   for plain objects only string values are applied.
   * @param {boolean} [useRandomUserAgent] - Add a random User-Agent when no headers
   *   are configured, stealth is off and no User-Agent was supplied.
   * @returns {object} Plain object of header name/value pairs.
   */
  pickHeaders(url, useGlobal, extraHeaders, useRandomUserAgent) {
    const configured = this.getAdapter(url, "headers", useGlobal);
    const headers = new Headers(configured ?? {});
    // Counted portably: `Headers#count` is a Bun extension and is undefined on
    // Node, which used to disable the random User-Agent branch there.
    const configuredCount = [...headers.keys()].length;

    if (extraHeaders instanceof Headers) {
      // Iterate the Headers object itself; Object.entries() over its iterator is always empty.
      for (const [name, value] of extraHeaders.entries()) {
        headers.set(name, value);
      }
    } else if (extraHeaders && typeof extraHeaders === "object") {
      for (const [name, value] of Object.entries(extraHeaders)) {
        if (typeof value === "string") {
          headers.set(name, value);
        }
      }
    }

    // Never clobber a User-Agent the caller supplied explicitly.
    if (useRandomUserAgent && configuredCount === 0 && !this.stealth && !headers.has("user-agent")) {
      headers.set("user-agent", this.getRandomUserAgent());
    }
    return Object.fromEntries(headers.entries());
  }

  /**
   * Tests `input` (URL or host name) against a domain spec: a RegExp, an exact
   * host, a `*` wildcard, a regex-looking string (optionally `/body/flags`),
   * a parent/sub-domain, or an array of any of these.
   *
   * @returns {boolean}
   */
  _hasDomain(input, domainSpec) {
    if (!domainSpec) return false;
    const hostname = this.getDomainName(input);
    if (!hostname) return false;

    // Reset lastIndex so /g and /y patterns give the same answer on every call.
    const testRegex = (regex) => {
      regex.lastIndex = 0;
      if (regex.test(hostname)) return true;
      regex.lastIndex = 0;
      return regex.test(input);
    };

    const looksLikeRegex = (text) =>
      /[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(text) ||
      text.startsWith("/") ||
      text.includes(".*") ||
      text.includes(".+");

    const matches = (spec) => {
      if (spec instanceof RegExp) return testRegex(spec);

      const pattern = spec.toString().trim();
      const host = hostname.toLowerCase();
      const wanted = pattern.toLowerCase();
      if (host === wanted) return true;

      if (pattern.includes("*")) {
        // Escape everything, then turn the escaped `*` back into a wildcard.
        const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
        return testRegex(new RegExp(`^${source}$`, "i"));
      }

      if (looksLikeRegex(pattern)) {
        try {
          let source = pattern;
          let flags = "i";
          const literal = pattern.match(/^\/(.*)\/(\w*)$/);
          if (literal) {
            source = literal[1];
            flags = literal[2] || "i";
          }
          return testRegex(new RegExp(source, flags));
        } catch {
          // Invalid regex text: fall back to a plain substring check.
          return host.includes(wanted);
        }
      }

      return host.endsWith("." + wanted) || wanted.endsWith("." + host);
    };

    return Array.isArray(domainSpec) ? domainSpec.some(matches) : matches(domainSpec);
  }

  /**
   * Picks the entry applicable to `url`: a domain match first (first one, or a
   * random one when `random`), else a global entry when `useGlobal`.
   *
   * @returns {object|null}
   */
  selectConfigEntry(url, configs, useGlobal, random) {
    if (!this.getDomainName(url)) return null;

    const pick = (candidates) => {
      if (!candidates.length) return null;
      return candidates[random ? this.rnd(0, candidates.length - 1) : 0];
    };

    const domainMatch = pick(configs.filter((config) => this._hasDomain(url, config.domain)));
    if (domainMatch) return domainMatch;
    if (useGlobal) return pick(configs.filter((config) => config.isGlobal));
    return null;
  }

  /**
   * @returns {string|null} Host name of an http(s) URL, the input itself when
   *   it is a bare host name, otherwise `null`.
   */
  getDomainName(input) {
    if (this.isValidUrl(input)) return new URL(input).hostname;
    if (this.isHostName(input)) return input;
    return null;
  }

  /**
   * @returns {boolean} Whether `value` is a dotted host name (max 255 chars).
   */
  isHostName(value) {
    if (!value) return false;
    if (value.length > 255) return false;
    const normalized = value.trim().toLowerCase();
    // The previous pattern contained a stray space before the TLD group and so
    // rejected every ordinary host name; the shared pattern has none.
    return HOSTNAME_PATTERN.test(normalized) && !normalized.startsWith("-") && !normalized.endsWith("-");
  }

  /**
   * @returns {boolean} Whether `value` parses as an http/https URL with a dotted host name.
   */
  isValidUrl(value) {
    if (!value) return false;
    try {
      const parsed = new URL(value.trim());
      if (!parsed.protocol || !["http:", "https:"].includes(parsed.protocol.toLowerCase())) return false;
      if (!parsed.hostname) return false;
      return HOSTNAME_PATTERN.test(parsed.hostname);
    } catch {
      return false;
    }
  }

  /** @returns {string} Random entry from the generated User-Agent pool. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

exports.CrawlerOptions = CrawlerOptions;
|
|
1
|
+
var{RezoQueue:y}=require("../queue/queue.cjs"),{Oxylabs:x}=require("./addon/oxylabs/index.cjs"),v=require("node:path"),A=require("node:os"),{Decodo:p}=require("./addon/decodo/index.cjs"),{Rezo:D}=require("../core/rezo.cjs");class l{static destroyQueue(e){if(e&&typeof e.destroy==="function")e.destroy()}static destroyConfigQueues(e){for(let r of e)l.destroyQueue(r.pqueue)}static splitConfigs(e,r){let s=[],a=[];for(let t of e)if(r(t))a.push(t);else s.push(t);return{kept:s,removed:a}}static getHostname(e){try{return new URL(e).hostname||void 0}catch{return}}static createStableThroughputOptions(e){let{baseUrl:r,concurrency:s=40,scraperConcurrency:a=10,retryDelay:t=1000,maxRetryAttempts:i=2,retryOnStatusCode:o=[408,500,502,503,504],maxWaitOn429:d=15000,alwaysWaitOn429:n=!1,globalLimiter:h={concurrency:8},domain:u=l.getHostname(r),domainLimiter:c={concurrency:2,interval:1000,intervalCap:2,randomDelay:150},domainRetry:m={enable:!0,max429Retries:2,retryDelay:1000,maxRetryAttempts:2,backoff:!0},extraLimiters:g=[],overrides:b={}}=e,f=[];if(h)f.push({isGlobal:!0,options:h});if(u&&c)f.push({domain:u,options:c,retry:m||void 0});return 
f.push(...g),{...{baseUrl:r,autoThrottle:!1,concurrency:s,scraperConcurrency:a,retryDelay:t,maxRetryAttempts:i,retryOnStatusCode:o,maxWaitOn429:d,alwaysWaitOn429:n,limiter:f.length>0?{enable:!0,limiters:f}:!1},...b,baseUrl:r}}baseUrl;adapter;enableNavigationHistory;sessionId;rejectUnauthorized;userAgent;useRndUserAgent;timeout;maxRedirects;maxRetryAttempts;retryDelay;retryOnStatusCode;forceRevisit;retryWithoutProxyOnStatusCode;retryOnProxyError;maxRetryOnProxyError;allowRevisiting;enableCache;cacheTTL;cacheDir;throwFatalError;debug;enableSignalHandlers;concurrency;scraperConcurrency;maxDepth;maxUrls;maxResponseSize;respectRobotsTxt;followNofollow;autoThrottle;autoThrottleTargetDelay;autoThrottleMinDelay;autoThrottleMaxDelay;maxWaitOn429;alwaysWaitOn429;stealth;oxylabs=[];decodo=[];stealthProfiles=[];proxies=[];limiters=[];onLimiterAdded;onProviderAdded;requestHeaders=[];userAgents=$();constructor(e={}){this.baseUrl=e.baseUrl||"",this.adapter=e.adapter??"http",this.enableNavigationHistory=e.enableNavigationHistory??!1,this.sessionId=e.sessionId??`session_${Date.now()}_${Math.random().toString(36).slice(2,8)}`,this.rejectUnauthorized=e.rejectUnauthorized??!0,this.userAgent=e.userAgent,this.useRndUserAgent=e.useRndUserAgent??!1,this.timeout=e.timeout??30000,this.maxRedirects=e.maxRedirects??10,this.maxRetryAttempts=e.maxRetryAttempts??3,this.retryDelay=e.retryDelay??0,this.retryOnStatusCode=e.retryOnStatusCode??[408,429,500,502,503,504],this.forceRevisit=e.forceRevisit??!1,this.retryWithoutProxyOnStatusCode=e.retryWithoutProxyOnStatusCode??[407,403],this.retryOnProxyError=e.retryOnProxyError??!0,this.maxRetryOnProxyError=e.maxRetryOnProxyError??3,this.allowRevisiting=e.allowRevisiting??!1,this.enableCache=e.enableCache??!0,this.cacheTTL=e.cacheTTL??604800000,this.cacheDir=e.cacheDir??v.join(A.tmpdir(),"rezo_cache"),this.throwFatalError=e.throwFatalError??!1,this.debug=e.debug??!1,this.enableSignalHandlers=e.enableSignalHandlers??!1,this.concurrency=e.concurrency??100,t
his.scraperConcurrency=e.scraperConcurrency??this.concurrency,this.maxDepth=e.maxDepth??0,this.maxUrls=e.maxUrls??0,this.maxResponseSize=e.maxResponseSize??0,this.respectRobotsTxt=e.respectRobotsTxt??!1,this.followNofollow=e.followNofollow??!1,this.autoThrottle=e.autoThrottle??!0,this.autoThrottleTargetDelay=e.autoThrottleTargetDelay??1000,this.autoThrottleMinDelay=e.autoThrottleMinDelay??100,this.autoThrottleMaxDelay=e.autoThrottleMaxDelay??60000,this.maxWaitOn429=e.maxWaitOn429??1800000,this.alwaysWaitOn429=e.alwaysWaitOn429??!1,this._addHeaders(e.headers),this._addOxylabs(e.oxylabs),this._addDecodo(e.decodo),this._addProxies(e.proxy),this._addLimiters(e.limiter),this._addStealths(e.stealths),this.stealth=e.stealth}getConfiguredDomains(e){return(e==="headers"?this.requestHeaders:e==="limiters"?this.limiters:e==="oxylabs"?this.oxylabs:e==="decodo"?this.decodo:e==="stealth"?this.stealthProfiles:this.proxies).filter((s)=>s.domain).map((s)=>s.domain).filter((s,a,t)=>t.indexOf(s)===a)}removeDomain(e){this.requestHeaders=this.requestHeaders.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.proxies=this.proxies.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e));let r=l.splitConfigs(this.limiters,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));l.destroyConfigQueues(r.removed),this.limiters=r.kept;let s=l.splitConfigs(this.oxylabs,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));l.destroyConfigQueues(s.removed),this.oxylabs=s.kept;let a=l.splitConfigs(this.decodo,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));return l.destroyConfigQueues(a.removed),this.decodo=a.kept,this.stealthProfiles=this.stealthProfiles.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this}_domainsEqual(e,r){if(Array.isArray(e)&&Array.isArray(r))return e.length===r.length&&e.every((s,a)=>s===r[a]);return e===r}getConfigurationSummary(){let 
e=(r)=>({total:r.length,global:r.filter((s)=>s.isGlobal).length,domainSpecific:r.filter((s)=>!s.isGlobal&&s.domain).length});return{headers:e(this.requestHeaders),proxies:e(this.proxies),limiters:e(this.limiters),oxylabs:e(this.oxylabs),decodo:e(this.decodo)}}_addHeaders(e){if(!e||!e.enable)return;for(let r of e.httpHeaders){let{domain:s,isGlobal:a,headers:t}=r;if(!s&&!a)continue;if(t instanceof Headers){let i=Object.fromEntries(t.entries());if(Object.keys(i).length<1)continue;t=i}else if(!t||Object.keys(t).length<1)continue;this.requestHeaders.push({domain:s,isGlobal:a,headers:t})}}_addProxies(e){if(!e||!e.enable)return;for(let r of e.proxies){let{domain:s,isGlobal:a,proxy:t,rotating:i}=r;if(!s&&!a)continue;if(!t||Object.keys(t).length<1)continue;this.proxies.push({domain:s,isGlobal:a,proxy:t,rotating:!!i})}}_addLimiters(e){if(!e||!e.enable)return;for(let r of e.limiters){let{domain:s,isGlobal:a,options:t,retry:i}=r;if(!s&&!a)continue;if(!t&&!i)continue;let o=t?new y(t):new y({name:"limiter"});if(this.limiters.push({domain:s,isGlobal:a,pqueue:o,randomDelay:t?.randomDelay,retry:i}),this.onLimiterAdded)this.onLimiterAdded(o)}}_addOxylabs(e){if(!e||!e.enable)return;for(let r of e.labs){let{domain:s,isGlobal:a,options:t,queueOptions:i}=r;if(!s&&!a)continue;if(!t||Object.keys(t).length<1)continue;let o=i?new y(i):void 0;if(this.oxylabs.push({domain:s,isGlobal:a,adaptar:new x(t),pqueue:o}),o&&this.onProviderAdded)this.onProviderAdded(o)}}_addDecodo(e){if(!e||!e.enable)return;for(let r of e.labs){let{domain:s,isGlobal:a,options:t,queueOptions:i}=r;if(!s&&!a)continue;if(!t||Object.keys(t).length<1)continue;let o=i?new y(i):void 0;if(this.decodo.push({domain:s,isGlobal:a,adaptar:new p(t),pqueue:o}),o&&this.onProviderAdded)this.onProviderAdded(o)}}_addStealths(e){if(!e||!e.enable)return;for(let r of e.profiles){let{domain:s,isGlobal:a,stealth:t}=r;if(!s&&!a)continue;if(!t)continue;this.stealthProfiles.push({domain:s,isGlobal:a,adaptar:new 
D({stealth:t})})}}addHeaders(e){return this._addHeaders({enable:!0,httpHeaders:[e]}),this}addProxy(e){return this._addProxies({enable:!0,proxies:[e]}),this}addProxies(e){return this._addProxies({enable:!0,proxies:e}),this}getProxyConfig(e,r,s){let a=this.selectConfigEntry(e,this.proxies,r,s);if(!a)return null;return{proxy:a.proxy,rotating:!!a.rotating}}addLimiter(e){return this._addLimiters({enable:!0,limiters:[e]}),this}addOxylabs(e){return this._addOxylabs({enable:!0,labs:[e]}),this}addDecodo(e){return this._addDecodo({enable:!0,labs:[e]}),this}addStealth(e){return this._addStealths({enable:!0,profiles:[e]}),this}destroyLimiters(){for(let e of this.limiters)l.destroyQueue(e.pqueue);this.limiters=[]}destroyProviderQueues(){for(let e of this.oxylabs)l.destroyQueue(e.pqueue),delete e.pqueue;for(let e of this.decodo)l.destroyQueue(e.pqueue),delete e.pqueue}getLimiters(){return this.limiters}getRandomDelay(e,r){if(!this.getDomainName(e))return;for(let a of this.limiters)if(this._hasDomain(e,a.domain)&&a.randomDelay!==void 0)return a.randomDelay;if(r){for(let a of this.limiters)if(a.isGlobal&&a.randomDelay!==void 0)return a.randomDelay}return}clearGlobalConfigs(){if(Array.isArray(this.requestHeaders))this.requestHeaders=this.requestHeaders.filter((e)=>!e.isGlobal);if(Array.isArray(this.oxylabs)){let e=l.splitConfigs(this.oxylabs,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.oxylabs=e.kept}if(Array.isArray(this.decodo)){let e=l.splitConfigs(this.decodo,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.decodo=e.kept}if(Array.isArray(this.limiters)){let e=l.splitConfigs(this.limiters,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.limiters=e.kept}if(Array.isArray(this.proxies))this.proxies=this.proxies.filter((e)=>!e.isGlobal);if(Array.isArray(this.stealthProfiles))this.stealthProfiles=this.stealthProfiles.filter((e)=>!e.isGlobal);return this}getAdapter(e,r,s,a){if(!this.getDomainName(e))return null;let 
i=r==="headers"?this.requestHeaders:r==="limiters"?this.limiters:r==="oxylabs"?this.oxylabs:r==="decodo"?this.decodo:r==="stealth"?this.stealthProfiles:this.proxies,o=this.selectConfigEntry(e,i,s,a);if(!o)return null;return r==="headers"?o.headers:r==="limiters"?o.pqueue:r==="oxylabs"?o.adaptar:r==="decodo"?o.adaptar:r==="stealth"?o.adaptar:o.proxy}getRetryOptions(e){if(!this.getDomainName(e))return null;for(let s=0;s<this.limiters.length;s++)if(this._hasDomain(e,this.limiters[s].domain))return this.limiters[s].retry||null;for(let s=0;s<this.limiters.length;s++)if(this.limiters[s].isGlobal&&this.limiters[s].retry)return this.limiters[s].retry;return null}rnd(e=0,r=Number.MAX_VALUE){return Math.floor(Math.random()*(r-e+1))+e}hasDomain(e,r,s){if(!this.getDomainName(e))return!1;let t=r==="headers"?this.requestHeaders:r==="limiters"?this.limiters:r==="oxylabs"?this.oxylabs:r==="decodo"?this.decodo:r==="stealth"?this.stealthProfiles:this.proxies;for(let i=0;i<t.length;i++)if(this._hasDomain(e,t[i].domain))return!0;if(s){for(let i=0;i<t.length;i++)if(t[i].isGlobal)return!0}return!1}pickHeaders(e,r,s,a){let t=this.getAdapter(e,"headers",r),i=new Headers(t??{}),o=[...i.keys()].length;if(s&&s instanceof Headers)for(let[d,n]of Object.entries(s.entries()))i.set(d,n);else if(s&&typeof s==="object"){for(let[d,n]of Object.entries(s))if(typeof n==="string")i.set(d,n)}if(a&&o===0&&!this.stealth)i.set("user-agent",this.getRandomUserAgent());return Object.fromEntries(i.entries())}_hasDomain(e,r){if(!r)return!1;let s=this.getDomainName(e);if(!s)return!1;let a=(i)=>{return/[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(i)||i.startsWith("/")||i.includes(".*")||i.includes(".+")},t=(i)=>{if(i instanceof RegExp)return i.test(s)||i.test(e);let o=i.toString().trim();if(s.toLowerCase()===o.toLowerCase())return!0;if(o.includes("*")){let h=o.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\\*/g,".*"),u=new RegExp(`^${h}$`,"i");return u.test(s)||u.test(e)}if(a(o))try{let 
h=o,u="i",c=o.match(/^\/(.*)\/(\w*)$/);if(c)h=c[1],u=c[2]||"i";let m=new RegExp(h,u);return m.test(s)||m.test(e)}catch(h){return s.toLowerCase().includes(o.toLowerCase())}let d=s.toLowerCase(),n=o.toLowerCase();return d===n||d.endsWith("."+n)||n.endsWith("."+d)};if(Array.isArray(r)){for(let i of r)if(t(i))return!0;return!1}return t(r)}selectConfigEntry(e,r,s,a){if(!this.getDomainName(e))return null;let i=(n)=>{if(!n.length)return null;let h=a?this.rnd(0,n.length-1):0;return n[h]},o=[];for(let n of r)if(this._hasDomain(e,n.domain))o.push(n);let d=i(o);if(d)return d;if(s){let n=[];for(let h of r)if(h.isGlobal)n.push(h);return i(n)}return null}getDomainName(e){if(this.isValidUrl(e))return new URL(e).hostname;else if(this.isHostName(e))return e;return null}isHostName(e){if(!e)return!1;if(e.length>255)return!1;let r=/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+ [a-zA-Z]{2,})$/;return e=e.trim().toLowerCase(),r.test(e)&&!e.startsWith("-")&&!e.endsWith("-")}isValidUrl(e){if(!e)return!1;e=e.trim();try{let r=new URL(e);if(!r.protocol||!["http:","https:"].includes(r.protocol.toLowerCase()))return!1;if(!r.hostname)return!1;if(!/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/.test(r.hostname))return!1;return!0}catch{return!1}}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function $(){let 
e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],r=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],s=[];for(let a=0;a<200;a++){let t=e[Math.floor(Math.random()*e.length)],i=r[Math.floor(Math.random()*r.length)],o="";switch(t.name){case"Chrome":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36`;break;case"Firefox":o=`Mozilla/5.0 (${i}; rv:${t.version}) ${t.engine} Firefox/${t.version}`;break;case"Safari":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Version/${t.version} Safari/605.1.15`;break;case"Edge":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Edg/${t.version}`;break;case"Opera":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 OPR/${t.version}`;break;case"Vivaldi":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Vivaldi/${t.version}`;break;case"Brave":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Brave/${t.version}`;break;case"Chromium":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chromium/${t.version} Chrome/${t.version} Safari/537.36`;break;case"Yandex":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} YaBrowser/${t.version} 
Safari/537.36`;break;case"Maxthon":o=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Maxthon/${t.version}`;break}s.push(o)}return s}exports.CrawlerOptions=l;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
import { RezoQueue } from "../queue/queue.js";
import { Oxylabs } from "./addon/oxylabs/index.js";
import path from "node:path";
import os from "node:os";
import { Decodo } from "./addon/decodo/index.js";
import { Rezo } from "../core/rezo.js";

/**
 * Crawler configuration container (exported as `CrawlerOptions`).
 *
 * Holds scalar crawl settings plus six per-domain / global config lists:
 * request headers, proxies, limiters, Oxylabs adapters, Decodo adapters and
 * stealth profiles. Every list entry carries `domain` and/or `isGlobal`.
 */
class l {
  /**
   * Calls `queue.destroy()` when the value exposes a `destroy` function.
   * @param {*} queue - Queue-like object, or a nullish value (ignored).
   */
  static destroyQueue(queue) {
    if (queue && typeof queue.destroy === "function") queue.destroy();
  }

  /**
   * Destroys the `pqueue` of every config entry in the list.
   * @param {Iterable<object>} configs
   */
  static destroyConfigQueues(configs) {
    for (const config of configs) l.destroyQueue(config.pqueue);
  }

  /**
   * Partitions `configs` by a predicate.
   * @param {Iterable<object>} configs
   * @param {(config: object) => boolean} shouldRemove
   * @returns {{kept: object[], removed: object[]}}
   */
  static splitConfigs(configs, shouldRemove) {
    const kept = [];
    const removed = [];
    for (const config of configs) {
      if (shouldRemove(config)) removed.push(config);
      else kept.push(config);
    }
    return { kept, removed };
  }

  /**
   * Extracts the hostname of a URL string.
   * @param {string} url
   * @returns {string|undefined} `undefined` when the URL cannot be parsed or
   *   has an empty hostname.
   */
  static getHostname(url) {
    try {
      return new URL(url).hostname || undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Builds a constructor-options object with auto-throttle disabled, one
   * global limiter and one limiter for the base URL's domain.
   *
   * @param {object} e - Preset input; every field except `baseUrl` has a
   *   default. `overrides` is spread last, but `baseUrl` always wins.
   * @returns {object} Options suitable for `new CrawlerOptions(...)`.
   */
  static createStableThroughputOptions(e) {
    const {
      baseUrl,
      concurrency = 40,
      scraperConcurrency = 10,
      retryDelay = 1000,
      maxRetryAttempts = 2,
      retryOnStatusCode = [408, 500, 502, 503, 504],
      maxWaitOn429 = 15000,
      alwaysWaitOn429 = false,
      globalLimiter = { concurrency: 8 },
      domain = l.getHostname(baseUrl),
      domainLimiter = { concurrency: 2, interval: 1000, intervalCap: 2, randomDelay: 150 },
      domainRetry = { enable: true, max429Retries: 2, retryDelay: 1000, maxRetryAttempts: 2, backoff: true },
      extraLimiters = [],
      overrides = {},
    } = e;

    const limiters = [];
    if (globalLimiter) limiters.push({ isGlobal: true, options: globalLimiter });
    if (domain && domainLimiter) {
      limiters.push({ domain, options: domainLimiter, retry: domainRetry || undefined });
    }
    limiters.push(...extraLimiters);

    const preset = {
      baseUrl,
      autoThrottle: false,
      concurrency,
      scraperConcurrency,
      retryDelay,
      maxRetryAttempts,
      retryOnStatusCode,
      maxWaitOn429,
      alwaysWaitOn429,
      limiter: limiters.length > 0 ? { enable: true, limiters } : false,
    };
    // `baseUrl` is re-applied after the overrides so they cannot replace it.
    return { ...preset, ...overrides, baseUrl };
  }

  baseUrl;
  adapter;
  enableNavigationHistory;
  sessionId;
  rejectUnauthorized;
  userAgent;
  useRndUserAgent;
  timeout;
  maxRedirects;
  maxRetryAttempts;
  retryDelay;
  retryOnStatusCode;
  forceRevisit;
  retryWithoutProxyOnStatusCode;
  retryOnProxyError;
  maxRetryOnProxyError;
  allowRevisiting;
  enableCache;
  cacheTTL;
  cacheDir;
  throwFatalError;
  debug;
  enableSignalHandlers;
  concurrency;
  scraperConcurrency;
  maxDepth;
  maxUrls;
  maxResponseSize;
  respectRobotsTxt;
  followNofollow;
  autoThrottle;
  autoThrottleTargetDelay;
  autoThrottleMinDelay;
  autoThrottleMaxDelay;
  maxWaitOn429;
  alwaysWaitOn429;
  stealth;
  oxylabs = [];
  decodo = [];
  stealthProfiles = [];
  proxies = [];
  limiters = [];
  onLimiterAdded;
  onProviderAdded;
  requestHeaders = [];
  userAgents = $();

  /**
   * @param {object} [e={}] - User options; missing (nullish) values fall back
   *   to the defaults written below.
   */
  constructor(e = {}) {
    this.baseUrl = e.baseUrl || "";
    this.adapter = e.adapter ?? "http";
    this.enableNavigationHistory = e.enableNavigationHistory ?? false;
    this.sessionId = e.sessionId ?? `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    this.rejectUnauthorized = e.rejectUnauthorized ?? true;
    this.userAgent = e.userAgent;
    this.useRndUserAgent = e.useRndUserAgent ?? false;
    this.timeout = e.timeout ?? 30000;
    this.maxRedirects = e.maxRedirects ?? 10;
    this.maxRetryAttempts = e.maxRetryAttempts ?? 3;
    this.retryDelay = e.retryDelay ?? 0;
    this.retryOnStatusCode = e.retryOnStatusCode ?? [408, 429, 500, 502, 503, 504];
    this.forceRevisit = e.forceRevisit ?? false;
    this.retryWithoutProxyOnStatusCode = e.retryWithoutProxyOnStatusCode ?? [407, 403];
    this.retryOnProxyError = e.retryOnProxyError ?? true;
    this.maxRetryOnProxyError = e.maxRetryOnProxyError ?? 3;
    this.allowRevisiting = e.allowRevisiting ?? false;
    this.enableCache = e.enableCache ?? true;
    this.cacheTTL = e.cacheTTL ?? 604800000;
    this.cacheDir = e.cacheDir ?? path.join(os.tmpdir(), "rezo_cache");
    this.throwFatalError = e.throwFatalError ?? false;
    this.debug = e.debug ?? false;
    this.enableSignalHandlers = e.enableSignalHandlers ?? false;
    this.concurrency = e.concurrency ?? 100;
    this.scraperConcurrency = e.scraperConcurrency ?? this.concurrency;
    this.maxDepth = e.maxDepth ?? 0;
    this.maxUrls = e.maxUrls ?? 0;
    this.maxResponseSize = e.maxResponseSize ?? 0;
    this.respectRobotsTxt = e.respectRobotsTxt ?? false;
    this.followNofollow = e.followNofollow ?? false;
    this.autoThrottle = e.autoThrottle ?? true;
    this.autoThrottleTargetDelay = e.autoThrottleTargetDelay ?? 1000;
    this.autoThrottleMinDelay = e.autoThrottleMinDelay ?? 100;
    this.autoThrottleMaxDelay = e.autoThrottleMaxDelay ?? 60000;
    this.maxWaitOn429 = e.maxWaitOn429 ?? 1800000;
    this.alwaysWaitOn429 = e.alwaysWaitOn429 ?? false;
    this._addHeaders(e.headers);
    this._addOxylabs(e.oxylabs);
    this._addDecodo(e.decodo);
    this._addProxies(e.proxy);
    this._addLimiters(e.limiter);
    this._addStealths(e.stealths);
    this.stealth = e.stealth;
  }

  /**
   * Lists the distinct `domain` values configured for one config type.
   * @param {string} type - "headers" | "limiters" | "oxylabs" | "decodo" |
   *   "stealth"; anything else selects the proxy list.
   * @returns {Array} Domains de-duplicated by identity (`indexOf`).
   */
  getConfiguredDomains(type) {
    return configListFor(this, type)
      .filter((entry) => entry.domain)
      .map((entry) => entry.domain)
      .filter((domain, index, all) => all.indexOf(domain) === index);
  }

  /**
   * Removes every config entry whose `domain` equals `domain` (see
   * `_domainsEqual`) from all six lists, destroying the queues of removed
   * limiter / Oxylabs / Decodo entries.
   * @param {*} domain
   * @returns {this}
   */
  removeDomain(domain) {
    const matches = (entry) => !!entry.domain && this._domainsEqual(entry.domain, domain);
    const dropWithQueues = (list) => {
      const { kept, removed } = l.splitConfigs(list, matches);
      l.destroyConfigQueues(removed);
      return kept;
    };

    this.requestHeaders = this.requestHeaders.filter((entry) => !matches(entry));
    this.proxies = this.proxies.filter((entry) => !matches(entry));
    this.limiters = dropWithQueues(this.limiters);
    this.oxylabs = dropWithQueues(this.oxylabs);
    this.decodo = dropWithQueues(this.decodo);
    this.stealthProfiles = this.stealthProfiles.filter((entry) => !matches(entry));
    return this;
  }

  /**
   * Strict equality for domain specs; two arrays are equal when they have the
   * same length and pairwise `===` elements.
   * @returns {boolean}
   */
  _domainsEqual(a, b) {
    if (Array.isArray(a) && Array.isArray(b)) {
      return a.length === b.length && a.every((item, index) => item === b[index]);
    }
    return a === b;
  }

  /**
   * Counts total / global / domain-specific entries for headers, proxies,
   * limiters, oxylabs and decodo (stealth profiles are not reported).
   * @returns {object}
   */
  getConfigurationSummary() {
    const summarize = (list) => ({
      total: list.length,
      global: list.filter((entry) => entry.isGlobal).length,
      domainSpecific: list.filter((entry) => !entry.isGlobal && entry.domain).length,
    });
    return {
      headers: summarize(this.requestHeaders),
      proxies: summarize(this.proxies),
      limiters: summarize(this.limiters),
      oxylabs: summarize(this.oxylabs),
      decodo: summarize(this.decodo),
    };
  }

  /**
   * Registers header configs. Entries without `domain`/`isGlobal` or with no
   * headers are skipped; `Headers` instances are converted to plain objects.
   * @param {{enable: boolean, httpHeaders: object[]}} [config]
   */
  _addHeaders(config) {
    if (!config || !config.enable) return;
    for (const entry of config.httpHeaders) {
      const { domain, isGlobal } = entry;
      let { headers } = entry;
      if (!domain && !isGlobal) continue;
      if (headers instanceof Headers) {
        const plain = Object.fromEntries(headers.entries());
        if (Object.keys(plain).length < 1) continue;
        headers = plain;
      } else if (!headers || Object.keys(headers).length < 1) {
        continue;
      }
      this.requestHeaders.push({ domain, isGlobal, headers });
    }
  }

  /**
   * Registers proxy configs; entries without scope or with an empty `proxy`
   * are skipped.
   * @param {{enable: boolean, proxies: object[]}} [config]
   */
  _addProxies(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, proxy, rotating } of config.proxies) {
      if (!domain && !isGlobal) continue;
      if (!proxy || Object.keys(proxy).length < 1) continue;
      this.proxies.push({ domain, isGlobal, proxy, rotating: !!rotating });
    }
  }

  /**
   * Registers limiter configs, creating one `RezoQueue` per entry and
   * notifying `onLimiterAdded` when it is set.
   * @param {{enable: boolean, limiters: object[]}} [config]
   */
  _addLimiters(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options, retry } of config.limiters) {
      if (!domain && !isGlobal) continue;
      if (!options && !retry) continue;
      // A retry-only entry still gets a queue, built with a placeholder name.
      const pqueue = options ? new RezoQueue(options) : new RezoQueue({ name: "limiter" });
      this.limiters.push({ domain, isGlobal, pqueue, randomDelay: options?.randomDelay, retry });
      if (this.onLimiterAdded) this.onLimiterAdded(pqueue);
    }
  }

  /**
   * Registers Oxylabs configs. A queue is created only when `queueOptions` is
   * given; `onProviderAdded` is notified for each created queue.
   * @param {{enable: boolean, labs: object[]}} [config]
   */
  _addOxylabs(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options, queueOptions } of config.labs) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      const pqueue = queueOptions ? new RezoQueue(queueOptions) : undefined;
      // "adaptar" (sic) is the runtime key read back by getAdapter().
      this.oxylabs.push({ domain, isGlobal, adaptar: new Oxylabs(options), pqueue });
      if (pqueue && this.onProviderAdded) this.onProviderAdded(pqueue);
    }
  }

  /**
   * Registers Decodo configs; same rules as `_addOxylabs`.
   * @param {{enable: boolean, labs: object[]}} [config]
   */
  _addDecodo(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, options, queueOptions } of config.labs) {
      if (!domain && !isGlobal) continue;
      if (!options || Object.keys(options).length < 1) continue;
      const pqueue = queueOptions ? new RezoQueue(queueOptions) : undefined;
      this.decodo.push({ domain, isGlobal, adaptar: new Decodo(options), pqueue });
      if (pqueue && this.onProviderAdded) this.onProviderAdded(pqueue);
    }
  }

  /**
   * Registers stealth profiles, wrapping each in a `Rezo` instance.
   * @param {{enable: boolean, profiles: object[]}} [config]
   */
  _addStealths(config) {
    if (!config || !config.enable) return;
    for (const { domain, isGlobal, stealth } of config.profiles) {
      if (!domain && !isGlobal) continue;
      if (!stealth) continue;
      this.stealthProfiles.push({ domain, isGlobal, adaptar: new Rezo({ stealth }) });
    }
  }

  /** Adds one header config. @returns {this} */
  addHeaders(entry) {
    this._addHeaders({ enable: true, httpHeaders: [entry] });
    return this;
  }

  /** Adds one proxy config. @returns {this} */
  addProxy(entry) {
    this._addProxies({ enable: true, proxies: [entry] });
    return this;
  }

  /** Adds several proxy configs. @returns {this} */
  addProxies(entries) {
    this._addProxies({ enable: true, proxies: entries });
    return this;
  }

  /**
   * Picks the proxy config for a URL.
   * @param {string} url
   * @param {boolean} [useGlobal] - Fall back to global entries.
   * @param {boolean} [random] - Pick randomly among matches instead of the first.
   * @returns {{proxy: object, rotating: boolean}|null}
   */
  getProxyConfig(url, useGlobal, random) {
    const entry = this.selectConfigEntry(url, this.proxies, useGlobal, random);
    if (!entry) return null;
    return { proxy: entry.proxy, rotating: !!entry.rotating };
  }

  /** Adds one limiter config. @returns {this} */
  addLimiter(entry) {
    this._addLimiters({ enable: true, limiters: [entry] });
    return this;
  }

  /** Adds one Oxylabs config. @returns {this} */
  addOxylabs(entry) {
    this._addOxylabs({ enable: true, labs: [entry] });
    return this;
  }

  /** Adds one Decodo config. @returns {this} */
  addDecodo(entry) {
    this._addDecodo({ enable: true, labs: [entry] });
    return this;
  }

  /** Adds one stealth profile. @returns {this} */
  addStealth(entry) {
    this._addStealths({ enable: true, profiles: [entry] });
    return this;
  }

  /** Destroys every limiter queue and empties the limiter list. */
  destroyLimiters() {
    for (const limiter of this.limiters) l.destroyQueue(limiter.pqueue);
    this.limiters = [];
  }

  /** Destroys and detaches the queues of all Oxylabs and Decodo entries. */
  destroyProviderQueues() {
    for (const entry of this.oxylabs) {
      l.destroyQueue(entry.pqueue);
      delete entry.pqueue;
    }
    for (const entry of this.decodo) {
      l.destroyQueue(entry.pqueue);
      delete entry.pqueue;
    }
  }

  /** @returns {object[]} The live limiter list (not a copy). */
  getLimiters() {
    return this.limiters;
  }

  /**
   * Finds the `randomDelay` of the first limiter matching the URL's domain,
   * optionally falling back to the first global limiter that defines one.
   * @param {string} url
   * @param {boolean} [useGlobal]
   * @returns {*} The configured delay, or `undefined`.
   */
  getRandomDelay(url, useGlobal) {
    if (!this.getDomainName(url)) return undefined;
    for (const limiter of this.limiters) {
      if (this._hasDomain(url, limiter.domain) && limiter.randomDelay !== undefined) {
        return limiter.randomDelay;
      }
    }
    if (useGlobal) {
      for (const limiter of this.limiters) {
        if (limiter.isGlobal && limiter.randomDelay !== undefined) return limiter.randomDelay;
      }
    }
    return undefined;
  }

  /**
   * Removes every `isGlobal` entry from all six lists, destroying the queues
   * of removed Oxylabs / Decodo / limiter entries.
   * @returns {this}
   */
  clearGlobalConfigs() {
    const isGlobal = (entry) => !!entry.isGlobal;
    const dropWithQueues = (list) => {
      const { kept, removed } = l.splitConfigs(list, isGlobal);
      l.destroyConfigQueues(removed);
      return kept;
    };

    if (Array.isArray(this.requestHeaders)) {
      this.requestHeaders = this.requestHeaders.filter((entry) => !entry.isGlobal);
    }
    if (Array.isArray(this.oxylabs)) this.oxylabs = dropWithQueues(this.oxylabs);
    if (Array.isArray(this.decodo)) this.decodo = dropWithQueues(this.decodo);
    if (Array.isArray(this.limiters)) this.limiters = dropWithQueues(this.limiters);
    if (Array.isArray(this.proxies)) {
      this.proxies = this.proxies.filter((entry) => !entry.isGlobal);
    }
    if (Array.isArray(this.stealthProfiles)) {
      this.stealthProfiles = this.stealthProfiles.filter((entry) => !entry.isGlobal);
    }
    return this;
  }

  /**
   * Returns the payload of the config entry selected for a URL: `headers`,
   * `pqueue`, `adaptar` or `proxy` depending on `type`.
   * @param {string} url
   * @param {string} type - See `getConfiguredDomains`.
   * @param {boolean} [useGlobal]
   * @param {boolean} [random]
   * @returns {*} `null` when the URL is invalid or nothing matches.
   */
  getAdapter(url, type, useGlobal, random) {
    if (!this.getDomainName(url)) return null;
    const entry = this.selectConfigEntry(url, configListFor(this, type), useGlobal, random);
    if (!entry) return null;
    switch (type) {
      case "headers":
        return entry.headers;
      case "limiters":
        return entry.pqueue;
      case "oxylabs":
      case "decodo":
      case "stealth":
        return entry.adaptar;
      default:
        return entry.proxy;
    }
  }

  /**
   * Returns the retry settings of the first limiter matching the URL's
   * domain (or `null` if that limiter has none); otherwise the first global
   * limiter that defines `retry`.
   * @param {string} url
   * @returns {object|null}
   */
  getRetryOptions(url) {
    if (!this.getDomainName(url)) return null;
    for (const limiter of this.limiters) {
      if (this._hasDomain(url, limiter.domain)) return limiter.retry || null;
    }
    for (const limiter of this.limiters) {
      if (limiter.isGlobal && limiter.retry) return limiter.retry;
    }
    return null;
  }

  /**
   * Random integer in the inclusive range [min, max].
   * @param {number} [min=0]
   * @param {number} [max=Number.MAX_VALUE]
   * @returns {number}
   */
  rnd(min = 0, max = Number.MAX_VALUE) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  /**
   * Tells whether any entry of the given type applies to the URL.
   * @param {string} url
   * @param {string} type - See `getConfiguredDomains`.
   * @param {boolean} [useGlobal] - Also accept any global entry.
   * @returns {boolean}
   */
  hasDomain(url, type, useGlobal) {
    if (!this.getDomainName(url)) return false;
    const list = configListFor(this, type);
    if (list.some((entry) => this._hasDomain(url, entry.domain))) return true;
    return Boolean(useGlobal) && list.some((entry) => entry.isGlobal);
  }

  /**
   * Builds the header object for a request: configured headers for the URL,
   * overlaid with caller-supplied headers.
   *
   * @param {string} url
   * @param {boolean} [useGlobal] - Allow global header configs.
   * @param {Headers|object} [extraHeaders] - Merged on top; for plain objects
   *   only string values are copied.
   * @param {boolean} [addRandomUserAgent] - When true, no headers were
   *   configured for the URL and `this.stealth` is unset, a random
   *   `user-agent` is set.
   * @returns {Object<string, string>}
   */
  pickHeaders(url, useGlobal, extraHeaders, addRandomUserAgent) {
    const configured = this.getAdapter(url, "headers", useGlobal);
    const merged = new Headers(configured ?? {});
    // Counted before merging the caller's headers. `Headers` has no `count`
    // property, so the previous `merged.count` was always undefined and the
    // random user-agent below could never be applied.
    const configuredCount = [...merged.keys()].length;

    if (extraHeaders && extraHeaders instanceof Headers) {
      // Iterate the Headers directly; Object.entries() of its iterator is
      // always empty, which silently dropped every caller header.
      for (const [name, value] of extraHeaders.entries()) merged.set(name, value);
    } else if (extraHeaders && typeof extraHeaders === "object") {
      for (const [name, value] of Object.entries(extraHeaders)) {
        if (typeof value === "string") merged.set(name, value);
      }
    }

    if (addRandomUserAgent && configuredCount === 0 && !this.stealth) {
      merged.set("user-agent", this.getRandomUserAgent());
    }
    return Object.fromEntries(merged.entries());
  }

  /**
   * Tests whether a domain spec applies to a URL / hostname.
   *
   * A spec may be a `RegExp`, a string, or an array of either. Strings are
   * tried, in order, as: exact host (case-insensitive), `*` wildcard,
   * regex-looking pattern (optionally written as "/body/flags"), and finally
   * a dot-boundary suffix match in either direction.
   *
   * @param {string} url
   * @param {*} domainSpec
   * @returns {boolean}
   */
  _hasDomain(url, domainSpec) {
    if (!domainSpec) return false;
    const host = this.getDomainName(url);
    if (!host) return false;

    const looksLikeRegex = (text) =>
      /[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(text) ||
      text.startsWith("/") ||
      text.includes(".*") ||
      text.includes(".+");

    const matchesOne = (spec) => {
      if (spec instanceof RegExp) return spec.test(host) || spec.test(url);

      const pattern = spec.toString().trim();
      if (host.toLowerCase() === pattern.toLowerCase()) return true;

      if (pattern.includes("*")) {
        // Escape everything, then turn the escaped "*" back into ".*".
        const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
        const wildcard = new RegExp(`^${source}$`, "i");
        return wildcard.test(host) || wildcard.test(url);
      }

      if (looksLikeRegex(pattern)) {
        try {
          let source = pattern;
          let flags = "i";
          const literal = pattern.match(/^\/(.*)\/(\w*)$/);
          if (literal) {
            source = literal[1];
            flags = literal[2] || "i";
          }
          const regex = new RegExp(source, flags);
          return regex.test(host) || regex.test(url);
        } catch (error) {
          // Not a valid regex after all: fall back to a substring test.
          return host.toLowerCase().includes(pattern.toLowerCase());
        }
      }

      const hostLower = host.toLowerCase();
      const patternLower = pattern.toLowerCase();
      return (
        hostLower === patternLower ||
        hostLower.endsWith("." + patternLower) ||
        patternLower.endsWith("." + hostLower)
      );
    };

    if (Array.isArray(domainSpec)) {
      for (const spec of domainSpec) {
        if (matchesOne(spec)) return true;
      }
      return false;
    }
    return matchesOne(domainSpec);
  }

  /**
   * Chooses one entry of `configs` for a URL: a domain match if any exists,
   * otherwise (when `useGlobal`) a global entry.
   * @param {string} url
   * @param {Iterable<object>} configs
   * @param {boolean} [useGlobal]
   * @param {boolean} [random] - Pick randomly instead of taking the first.
   * @returns {object|null}
   */
  selectConfigEntry(url, configs, useGlobal, random) {
    if (!this.getDomainName(url)) return null;

    const pick = (candidates) => {
      if (!candidates.length) return null;
      const index = random ? this.rnd(0, candidates.length - 1) : 0;
      return candidates[index];
    };

    const domainMatches = [];
    for (const entry of configs) {
      if (this._hasDomain(url, entry.domain)) domainMatches.push(entry);
    }
    const chosen = pick(domainMatches);
    if (chosen) return chosen;

    if (useGlobal) {
      const globals = [];
      for (const entry of configs) {
        if (entry.isGlobal) globals.push(entry);
      }
      return pick(globals);
    }
    return null;
  }

  /**
   * Resolves a URL or bare hostname to a hostname.
   * @param {string} value
   * @returns {string|null} The URL's hostname, the value itself when it is a
   *   bare hostname, or `null`.
   */
  getDomainName(value) {
    if (this.isValidUrl(value)) return new URL(value).hostname;
    if (this.isHostName(value)) return value;
    return null;
  }

  /**
   * Tests whether a string looks like a bare DNS hostname
   * (dot-separated labels followed by an alphabetic TLD of 2+ characters).
   * @param {string} value
   * @returns {boolean}
   */
  isHostName(value) {
    if (!value) return false;
    if (value.length > 255) return false;
    // Same pattern as isValidUrl(). The previous copy contained a stray
    // literal space before the TLD group, so no real hostname ever matched.
    const hostPattern = /^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/;
    const host = value.trim().toLowerCase();
    return hostPattern.test(host) && !host.startsWith("-") && !host.endsWith("-");
  }

  /**
   * Tests whether a string is an http(s) URL whose hostname matches the
   * hostname pattern.
   * @param {string} value
   * @returns {boolean}
   */
  isValidUrl(value) {
    if (!value) return false;
    const text = value.trim();
    try {
      const parsed = new URL(text);
      if (!parsed.protocol || !["http:", "https:"].includes(parsed.protocol.toLowerCase())) {
        return false;
      }
      if (!parsed.hostname) return false;
      if (!/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/.test(parsed.hostname)) {
        return false;
      }
      return true;
    } catch {
      return false;
    }
  }

  /** @returns {string} A random entry of the per-instance user-agent pool. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

/**
 * Maps a config-type name to the matching list on a `CrawlerOptions`
 * instance; unknown names select the proxy list.
 */
function configListFor(options, type) {
  switch (type) {
    case "headers":
      return options.requestHeaders;
    case "limiters":
      return options.limiters;
    case "oxylabs":
      return options.oxylabs;
    case "decodo":
      return options.decodo;
    case "stealth":
      return options.stealthProfiles;
    default:
      return options.proxies;
  }
}

/**
 * Generates 200 user-agent strings by pairing a random browser with a random
 * platform token.
 * @returns {string[]}
 */
function $() {
  const browsers = [
    { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
    { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
    { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
    { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
    { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
    { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
    { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
    { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
    { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
    { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" },
  ];
  const platforms = [
    "Windows NT 10.0",
    "Windows NT 6.1",
    "Macintosh; Intel Mac OS X 10_15_7",
    "Macintosh; Intel Mac OS X 11_4_0",
    "X11; Linux x86_64",
    "X11; Ubuntu; Linux x86_64",
  ];

  const pool = [];
  for (let n = 0; n < 200; n++) {
    const { name, version, engine } = browsers[Math.floor(Math.random() * browsers.length)];
    const platform = platforms[Math.floor(Math.random() * platforms.length)];
    let userAgent = "";
    switch (name) {
      case "Chrome":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
        break;
      case "Firefox":
        userAgent = `Mozilla/5.0 (${platform}; rv:${version}) ${engine} Firefox/${version}`;
        break;
      case "Safari":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Version/${version} Safari/605.1.15`;
        break;
      case "Edge":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36 Edg/${version}`;
        break;
      case "Opera":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36 OPR/${version}`;
        break;
      case "Vivaldi":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36 Vivaldi/${version}`;
        break;
      case "Brave":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36 Brave/${version}`;
        break;
      case "Chromium":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chromium/${version} Chrome/${version} Safari/537.36`;
        break;
      case "Yandex":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} YaBrowser/${version} Safari/537.36`;
        break;
      case "Maxthon":
        userAgent = `Mozilla/5.0 (${platform}) ${engine} (KHTML, like Gecko) Chrome/${version} Safari/537.36 Maxthon/${version}`;
        break;
    }
    pool.push(userAgent);
  }
  return pool;
}

export { l as CrawlerOptions };
|
|
1
|
+
import{RezoQueue as y}from"../queue/queue.js";import{Oxylabs as x}from"./addon/oxylabs/index.js";import p from"node:path";import v from"node:os";import{Decodo as A}from"./addon/decodo/index.js";import{Rezo as D}from"../core/rezo.js";class l{static destroyQueue(e){if(e&&typeof e.destroy==="function")e.destroy()}static destroyConfigQueues(e){for(let r of e)l.destroyQueue(r.pqueue)}static splitConfigs(e,r){let i=[],a=[];for(let t of e)if(r(t))a.push(t);else i.push(t);return{kept:i,removed:a}}static getHostname(e){try{return new URL(e).hostname||void 0}catch{return}}static createStableThroughputOptions(e){let{baseUrl:r,concurrency:i=40,scraperConcurrency:a=10,retryDelay:t=1000,maxRetryAttempts:s=2,retryOnStatusCode:o=[408,500,502,503,504],maxWaitOn429:d=15000,alwaysWaitOn429:n=!1,globalLimiter:h={concurrency:8},domain:u=l.getHostname(r),domainLimiter:c={concurrency:2,interval:1000,intervalCap:2,randomDelay:150},domainRetry:m={enable:!0,max429Retries:2,retryDelay:1000,maxRetryAttempts:2,backoff:!0},extraLimiters:g=[],overrides:b={}}=e,f=[];if(h)f.push({isGlobal:!0,options:h});if(u&&c)f.push({domain:u,options:c,retry:m||void 0});return 
f.push(...g),{...{baseUrl:r,autoThrottle:!1,concurrency:i,scraperConcurrency:a,retryDelay:t,maxRetryAttempts:s,retryOnStatusCode:o,maxWaitOn429:d,alwaysWaitOn429:n,limiter:f.length>0?{enable:!0,limiters:f}:!1},...b,baseUrl:r}}baseUrl;adapter;enableNavigationHistory;sessionId;rejectUnauthorized;userAgent;useRndUserAgent;timeout;maxRedirects;maxRetryAttempts;retryDelay;retryOnStatusCode;forceRevisit;retryWithoutProxyOnStatusCode;retryOnProxyError;maxRetryOnProxyError;allowRevisiting;enableCache;cacheTTL;cacheDir;throwFatalError;debug;enableSignalHandlers;concurrency;scraperConcurrency;maxDepth;maxUrls;maxResponseSize;respectRobotsTxt;followNofollow;autoThrottle;autoThrottleTargetDelay;autoThrottleMinDelay;autoThrottleMaxDelay;maxWaitOn429;alwaysWaitOn429;stealth;oxylabs=[];decodo=[];stealthProfiles=[];proxies=[];limiters=[];onLimiterAdded;onProviderAdded;requestHeaders=[];userAgents=$();constructor(e={}){this.baseUrl=e.baseUrl||"",this.adapter=e.adapter??"http",this.enableNavigationHistory=e.enableNavigationHistory??!1,this.sessionId=e.sessionId??`session_${Date.now()}_${Math.random().toString(36).slice(2,8)}`,this.rejectUnauthorized=e.rejectUnauthorized??!0,this.userAgent=e.userAgent,this.useRndUserAgent=e.useRndUserAgent??!1,this.timeout=e.timeout??30000,this.maxRedirects=e.maxRedirects??10,this.maxRetryAttempts=e.maxRetryAttempts??3,this.retryDelay=e.retryDelay??0,this.retryOnStatusCode=e.retryOnStatusCode??[408,429,500,502,503,504],this.forceRevisit=e.forceRevisit??!1,this.retryWithoutProxyOnStatusCode=e.retryWithoutProxyOnStatusCode??[407,403],this.retryOnProxyError=e.retryOnProxyError??!0,this.maxRetryOnProxyError=e.maxRetryOnProxyError??3,this.allowRevisiting=e.allowRevisiting??!1,this.enableCache=e.enableCache??!0,this.cacheTTL=e.cacheTTL??604800000,this.cacheDir=e.cacheDir??p.join(v.tmpdir(),"rezo_cache"),this.throwFatalError=e.throwFatalError??!1,this.debug=e.debug??!1,this.enableSignalHandlers=e.enableSignalHandlers??!1,this.concurrency=e.concurrency??100,t
his.scraperConcurrency=e.scraperConcurrency??this.concurrency,this.maxDepth=e.maxDepth??0,this.maxUrls=e.maxUrls??0,this.maxResponseSize=e.maxResponseSize??0,this.respectRobotsTxt=e.respectRobotsTxt??!1,this.followNofollow=e.followNofollow??!1,this.autoThrottle=e.autoThrottle??!0,this.autoThrottleTargetDelay=e.autoThrottleTargetDelay??1000,this.autoThrottleMinDelay=e.autoThrottleMinDelay??100,this.autoThrottleMaxDelay=e.autoThrottleMaxDelay??60000,this.maxWaitOn429=e.maxWaitOn429??1800000,this.alwaysWaitOn429=e.alwaysWaitOn429??!1,this._addHeaders(e.headers),this._addOxylabs(e.oxylabs),this._addDecodo(e.decodo),this._addProxies(e.proxy),this._addLimiters(e.limiter),this._addStealths(e.stealths),this.stealth=e.stealth}getConfiguredDomains(e){return(e==="headers"?this.requestHeaders:e==="limiters"?this.limiters:e==="oxylabs"?this.oxylabs:e==="decodo"?this.decodo:e==="stealth"?this.stealthProfiles:this.proxies).filter((i)=>i.domain).map((i)=>i.domain).filter((i,a,t)=>t.indexOf(i)===a)}removeDomain(e){this.requestHeaders=this.requestHeaders.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this.proxies=this.proxies.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e));let r=l.splitConfigs(this.limiters,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));l.destroyConfigQueues(r.removed),this.limiters=r.kept;let i=l.splitConfigs(this.oxylabs,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));l.destroyConfigQueues(i.removed),this.oxylabs=i.kept;let a=l.splitConfigs(this.decodo,(t)=>!!t.domain&&this._domainsEqual(t.domain,e));return l.destroyConfigQueues(a.removed),this.decodo=a.kept,this.stealthProfiles=this.stealthProfiles.filter((t)=>!t.domain||!this._domainsEqual(t.domain,e)),this}_domainsEqual(e,r){if(Array.isArray(e)&&Array.isArray(r))return e.length===r.length&&e.every((i,a)=>i===r[a]);return e===r}getConfigurationSummary(){let 
e=(r)=>({total:r.length,global:r.filter((i)=>i.isGlobal).length,domainSpecific:r.filter((i)=>!i.isGlobal&&i.domain).length});return{headers:e(this.requestHeaders),proxies:e(this.proxies),limiters:e(this.limiters),oxylabs:e(this.oxylabs),decodo:e(this.decodo)}}_addHeaders(e){if(!e||!e.enable)return;for(let r of e.httpHeaders){let{domain:i,isGlobal:a,headers:t}=r;if(!i&&!a)continue;if(t instanceof Headers){let s=Object.fromEntries(t.entries());if(Object.keys(s).length<1)continue;t=s}else if(!t||Object.keys(t).length<1)continue;this.requestHeaders.push({domain:i,isGlobal:a,headers:t})}}_addProxies(e){if(!e||!e.enable)return;for(let r of e.proxies){let{domain:i,isGlobal:a,proxy:t,rotating:s}=r;if(!i&&!a)continue;if(!t||Object.keys(t).length<1)continue;this.proxies.push({domain:i,isGlobal:a,proxy:t,rotating:!!s})}}_addLimiters(e){if(!e||!e.enable)return;for(let r of e.limiters){let{domain:i,isGlobal:a,options:t,retry:s}=r;if(!i&&!a)continue;if(!t&&!s)continue;let o=t?new y(t):new y({name:"limiter"});if(this.limiters.push({domain:i,isGlobal:a,pqueue:o,randomDelay:t?.randomDelay,retry:s}),this.onLimiterAdded)this.onLimiterAdded(o)}}_addOxylabs(e){if(!e||!e.enable)return;for(let r of e.labs){let{domain:i,isGlobal:a,options:t,queueOptions:s}=r;if(!i&&!a)continue;if(!t||Object.keys(t).length<1)continue;let o=s?new y(s):void 0;if(this.oxylabs.push({domain:i,isGlobal:a,adaptar:new x(t),pqueue:o}),o&&this.onProviderAdded)this.onProviderAdded(o)}}_addDecodo(e){if(!e||!e.enable)return;for(let r of e.labs){let{domain:i,isGlobal:a,options:t,queueOptions:s}=r;if(!i&&!a)continue;if(!t||Object.keys(t).length<1)continue;let o=s?new y(s):void 0;if(this.decodo.push({domain:i,isGlobal:a,adaptar:new A(t),pqueue:o}),o&&this.onProviderAdded)this.onProviderAdded(o)}}_addStealths(e){if(!e||!e.enable)return;for(let r of e.profiles){let{domain:i,isGlobal:a,stealth:t}=r;if(!i&&!a)continue;if(!t)continue;this.stealthProfiles.push({domain:i,isGlobal:a,adaptar:new 
D({stealth:t})})}}addHeaders(e){return this._addHeaders({enable:!0,httpHeaders:[e]}),this}addProxy(e){return this._addProxies({enable:!0,proxies:[e]}),this}addProxies(e){return this._addProxies({enable:!0,proxies:e}),this}getProxyConfig(e,r,i){let a=this.selectConfigEntry(e,this.proxies,r,i);if(!a)return null;return{proxy:a.proxy,rotating:!!a.rotating}}addLimiter(e){return this._addLimiters({enable:!0,limiters:[e]}),this}addOxylabs(e){return this._addOxylabs({enable:!0,labs:[e]}),this}addDecodo(e){return this._addDecodo({enable:!0,labs:[e]}),this}addStealth(e){return this._addStealths({enable:!0,profiles:[e]}),this}destroyLimiters(){for(let e of this.limiters)l.destroyQueue(e.pqueue);this.limiters=[]}destroyProviderQueues(){for(let e of this.oxylabs)l.destroyQueue(e.pqueue),delete e.pqueue;for(let e of this.decodo)l.destroyQueue(e.pqueue),delete e.pqueue}getLimiters(){return this.limiters}getRandomDelay(e,r){if(!this.getDomainName(e))return;for(let a of this.limiters)if(this._hasDomain(e,a.domain)&&a.randomDelay!==void 0)return a.randomDelay;if(r){for(let a of this.limiters)if(a.isGlobal&&a.randomDelay!==void 0)return a.randomDelay}return}clearGlobalConfigs(){if(Array.isArray(this.requestHeaders))this.requestHeaders=this.requestHeaders.filter((e)=>!e.isGlobal);if(Array.isArray(this.oxylabs)){let e=l.splitConfigs(this.oxylabs,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.oxylabs=e.kept}if(Array.isArray(this.decodo)){let e=l.splitConfigs(this.decodo,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.decodo=e.kept}if(Array.isArray(this.limiters)){let e=l.splitConfigs(this.limiters,(r)=>!!r.isGlobal);l.destroyConfigQueues(e.removed),this.limiters=e.kept}if(Array.isArray(this.proxies))this.proxies=this.proxies.filter((e)=>!e.isGlobal);if(Array.isArray(this.stealthProfiles))this.stealthProfiles=this.stealthProfiles.filter((e)=>!e.isGlobal);return this}getAdapter(e,r,i,a){if(!this.getDomainName(e))return null;let 
s=r==="headers"?this.requestHeaders:r==="limiters"?this.limiters:r==="oxylabs"?this.oxylabs:r==="decodo"?this.decodo:r==="stealth"?this.stealthProfiles:this.proxies,o=this.selectConfigEntry(e,s,i,a);if(!o)return null;return r==="headers"?o.headers:r==="limiters"?o.pqueue:r==="oxylabs"?o.adaptar:r==="decodo"?o.adaptar:r==="stealth"?o.adaptar:o.proxy}getRetryOptions(e){if(!this.getDomainName(e))return null;for(let i=0;i<this.limiters.length;i++)if(this._hasDomain(e,this.limiters[i].domain))return this.limiters[i].retry||null;for(let i=0;i<this.limiters.length;i++)if(this.limiters[i].isGlobal&&this.limiters[i].retry)return this.limiters[i].retry;return null}rnd(e=0,r=Number.MAX_VALUE){return Math.floor(Math.random()*(r-e+1))+e}hasDomain(e,r,i){if(!this.getDomainName(e))return!1;let t=r==="headers"?this.requestHeaders:r==="limiters"?this.limiters:r==="oxylabs"?this.oxylabs:r==="decodo"?this.decodo:r==="stealth"?this.stealthProfiles:this.proxies;for(let s=0;s<t.length;s++)if(this._hasDomain(e,t[s].domain))return!0;if(i){for(let s=0;s<t.length;s++)if(t[s].isGlobal)return!0}return!1}pickHeaders(e,r,i,a){let t=this.getAdapter(e,"headers",r),s=new Headers(t??{}),o=[...s.keys()].length;if(i&&i instanceof Headers)for(let[d,n]of Object.entries(i.entries()))s.set(d,n);else if(i&&typeof i==="object"){for(let[d,n]of Object.entries(i))if(typeof n==="string")s.set(d,n)}if(a&&o===0&&!this.stealth)s.set("user-agent",this.getRandomUserAgent());return Object.fromEntries(s.entries())}_hasDomain(e,r){if(!r)return!1;let i=this.getDomainName(e);if(!i)return!1;let a=(s)=>{return/[\^\$\*\+\?\{\}\[\]\(\)\|\\]/.test(s)||s.startsWith("/")||s.includes(".*")||s.includes(".+")},t=(s)=>{if(s instanceof RegExp)return s.test(i)||s.test(e);let o=s.toString().trim();if(i.toLowerCase()===o.toLowerCase())return!0;if(o.includes("*")){let h=o.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\\*/g,".*"),u=new RegExp(`^${h}$`,"i");return u.test(i)||u.test(e)}if(a(o))try{let 
h=o,u="i",c=o.match(/^\/(.*)\/(\w*)$/);if(c)h=c[1],u=c[2]||"i";let m=new RegExp(h,u);return m.test(i)||m.test(e)}catch(h){return i.toLowerCase().includes(o.toLowerCase())}let d=i.toLowerCase(),n=o.toLowerCase();return d===n||d.endsWith("."+n)||n.endsWith("."+d)};if(Array.isArray(r)){for(let s of r)if(t(s))return!0;return!1}return t(r)}selectConfigEntry(e,r,i,a){if(!this.getDomainName(e))return null;let s=(n)=>{if(!n.length)return null;let h=a?this.rnd(0,n.length-1):0;return n[h]},o=[];for(let n of r)if(this._hasDomain(e,n.domain))o.push(n);let d=s(o);if(d)return d;if(i){let n=[];for(let h of r)if(h.isGlobal)n.push(h);return s(n)}return null}getDomainName(e){if(this.isValidUrl(e))return new URL(e).hostname;else if(this.isHostName(e))return e;return null}isHostName(e){if(!e)return!1;if(e.length>255)return!1;let r=/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+ [a-zA-Z]{2,})$/;return e=e.trim().toLowerCase(),r.test(e)&&!e.startsWith("-")&&!e.endsWith("-")}isValidUrl(e){if(!e)return!1;e=e.trim();try{let r=new URL(e);if(!r.protocol||!["http:","https:"].includes(r.protocol.toLowerCase()))return!1;if(!r.hostname)return!1;if(!/^(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})$/.test(r.hostname))return!1;return!0}catch{return!1}}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function $(){let 
e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],r=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],i=[];for(let a=0;a<200;a++){let t=e[Math.floor(Math.random()*e.length)],s=r[Math.floor(Math.random()*r.length)],o="";switch(t.name){case"Chrome":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36`;break;case"Firefox":o=`Mozilla/5.0 (${s}; rv:${t.version}) ${t.engine} Firefox/${t.version}`;break;case"Safari":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Version/${t.version} Safari/605.1.15`;break;case"Edge":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Edg/${t.version}`;break;case"Opera":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 OPR/${t.version}`;break;case"Vivaldi":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Vivaldi/${t.version}`;break;case"Brave":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Brave/${t.version}`;break;case"Chromium":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chromium/${t.version} Chrome/${t.version} Safari/537.36`;break;case"Yandex":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} YaBrowser/${t.version} 
Safari/537.36`;break;case"Maxthon":o=`Mozilla/5.0 (${s}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Maxthon/${t.version}`;break}i.push(o)}return i}export{l as CrawlerOptions};
|
package/dist/crawler/crawler.cjs
CHANGED
|
@@ -927,6 +927,18 @@ class Crawler {
|
|
|
927
927
|
}
|
|
928
928
|
}
|
|
929
929
|
}
|
|
930
|
+
dispatchErrorEvents(err) {
|
|
931
|
+
for (let i = 0;i < this.errorEvents.length; i++) {
|
|
932
|
+
const event = this.errorEvents[i];
|
|
933
|
+
try {
|
|
934
|
+
this[event.handler](...event.attr, err);
|
|
935
|
+
} catch (handlerErr) {
|
|
936
|
+
if (this.config.debug) {
|
|
937
|
+
console.error("[Crawler] error-event handler threw:", handlerErr);
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
}
|
|
930
942
|
_runHandler(handler, arg) {
|
|
931
943
|
this.eventCount++;
|
|
932
944
|
new Promise(async (resolve) => {
|
|
@@ -1410,8 +1422,10 @@ class Crawler {
|
|
|
1410
1422
|
await this.waitForNavigationHistory();
|
|
1411
1423
|
await this.executeHttp(method, url, body, options, proxyRotating, isEmail, forceRevisit, oxylabsOptions, oxylabsInstance, oxylabsQueue, decodoInstance, decodoOptions, decodoQueue, 0, undefined, skipCache, emailMetadata, stealthInstance);
|
|
1412
1424
|
}).catch((err) => {
|
|
1413
|
-
|
|
1414
|
-
|
|
1425
|
+
console.error(`[Crawler] execute() task error for ${url}:`, err?.message ?? err);
|
|
1426
|
+
if (this.config.debug && err?.stack)
|
|
1427
|
+
console.error(err.stack);
|
|
1428
|
+
this.dispatchErrorEvents(err);
|
|
1415
1429
|
});
|
|
1416
1430
|
}
|
|
1417
1431
|
async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
|
|
@@ -1436,8 +1450,10 @@ class Crawler {
|
|
|
1436
1450
|
emailMetadata
|
|
1437
1451
|
}, forceRevisit, true);
|
|
1438
1452
|
}).catch((err) => {
|
|
1439
|
-
|
|
1440
|
-
|
|
1453
|
+
console.error(`[Crawler] execute2() task error for ${url}:`, err?.message ?? err);
|
|
1454
|
+
if (this.config.debug && err?.stack)
|
|
1455
|
+
console.error(err.stack);
|
|
1456
|
+
this.dispatchErrorEvents(err);
|
|
1441
1457
|
});
|
|
1442
1458
|
}
|
|
1443
1459
|
async executeHttp(method, url, body, options = {}, proxyRotating = false, isEmail, forceRevisit, oxylabsOptions, oxylabsInstance, oxylabsQueue, decodoInstance, decodoOptions, decodoQueue, retryCount = 0, parentUrl, skipCache, emailMetadata, stealthInstance) {
|
|
@@ -1726,6 +1742,10 @@ class Crawler {
|
|
|
1726
1742
|
this.rawResponseEvents.length = 0;
|
|
1727
1743
|
this.emailDiscoveredEvents.length = 0;
|
|
1728
1744
|
this.emailLeadsEvents.length = 0;
|
|
1745
|
+
this.startHandlers.length = 0;
|
|
1746
|
+
this.finishHandlers.length = 0;
|
|
1747
|
+
this.redirectHandlers.length = 0;
|
|
1748
|
+
this.queueChangeHandlers.length = 0;
|
|
1729
1749
|
this.domainResponseTimes.clear();
|
|
1730
1750
|
this.domainCurrentDelay.clear();
|
|
1731
1751
|
this.urlDepthMap.clear();
|
package/dist/crawler/crawler.js
CHANGED
|
@@ -927,6 +927,18 @@ export class Crawler {
|
|
|
927
927
|
}
|
|
928
928
|
}
|
|
929
929
|
}
|
|
930
|
+
dispatchErrorEvents(err) {
|
|
931
|
+
for (let i = 0;i < this.errorEvents.length; i++) {
|
|
932
|
+
const event = this.errorEvents[i];
|
|
933
|
+
try {
|
|
934
|
+
this[event.handler](...event.attr, err);
|
|
935
|
+
} catch (handlerErr) {
|
|
936
|
+
if (this.config.debug) {
|
|
937
|
+
console.error("[Crawler] error-event handler threw:", handlerErr);
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
}
|
|
930
942
|
_runHandler(handler, arg) {
|
|
931
943
|
this.eventCount++;
|
|
932
944
|
new Promise(async (resolve) => {
|
|
@@ -1410,8 +1422,10 @@ export class Crawler {
|
|
|
1410
1422
|
await this.waitForNavigationHistory();
|
|
1411
1423
|
await this.executeHttp(method, url, body, options, proxyRotating, isEmail, forceRevisit, oxylabsOptions, oxylabsInstance, oxylabsQueue, decodoInstance, decodoOptions, decodoQueue, 0, undefined, skipCache, emailMetadata, stealthInstance);
|
|
1412
1424
|
}).catch((err) => {
|
|
1413
|
-
|
|
1414
|
-
|
|
1425
|
+
console.error(`[Crawler] execute() task error for ${url}:`, err?.message ?? err);
|
|
1426
|
+
if (this.config.debug && err?.stack)
|
|
1427
|
+
console.error(err.stack);
|
|
1428
|
+
this.dispatchErrorEvents(err);
|
|
1415
1429
|
});
|
|
1416
1430
|
}
|
|
1417
1431
|
async execute2(method, url, body, options = {}, forceRevisit, emailMetadata) {
|
|
@@ -1436,8 +1450,10 @@ export class Crawler {
|
|
|
1436
1450
|
emailMetadata
|
|
1437
1451
|
}, forceRevisit, true);
|
|
1438
1452
|
}).catch((err) => {
|
|
1439
|
-
|
|
1440
|
-
|
|
1453
|
+
console.error(`[Crawler] execute2() task error for ${url}:`, err?.message ?? err);
|
|
1454
|
+
if (this.config.debug && err?.stack)
|
|
1455
|
+
console.error(err.stack);
|
|
1456
|
+
this.dispatchErrorEvents(err);
|
|
1441
1457
|
});
|
|
1442
1458
|
}
|
|
1443
1459
|
async executeHttp(method, url, body, options = {}, proxyRotating = false, isEmail, forceRevisit, oxylabsOptions, oxylabsInstance, oxylabsQueue, decodoInstance, decodoOptions, decodoQueue, retryCount = 0, parentUrl, skipCache, emailMetadata, stealthInstance) {
|
|
@@ -1726,6 +1742,10 @@ export class Crawler {
|
|
|
1726
1742
|
this.rawResponseEvents.length = 0;
|
|
1727
1743
|
this.emailDiscoveredEvents.length = 0;
|
|
1728
1744
|
this.emailLeadsEvents.length = 0;
|
|
1745
|
+
this.startHandlers.length = 0;
|
|
1746
|
+
this.finishHandlers.length = 0;
|
|
1747
|
+
this.redirectHandlers.length = 0;
|
|
1748
|
+
this.queueChangeHandlers.length = 0;
|
|
1729
1749
|
this.domainResponseTimes.clear();
|
|
1730
1750
|
this.domainCurrentDelay.clear();
|
|
1731
1751
|
this.urlDepthMap.clear();
|
package/dist/crawler/index.cjs
CHANGED
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.Crawler =
|
|
3
|
-
const
|
|
4
|
-
exports.CrawlerOptions =
|
|
5
|
-
const
|
|
6
|
-
exports.RobotsTxt =
|
|
7
|
-
const
|
|
8
|
-
exports.FileCacher =
|
|
9
|
-
const
|
|
10
|
-
exports.UrlStore =
|
|
11
|
-
const
|
|
12
|
-
exports.NavigationHistory =
|
|
13
|
-
const
|
|
14
|
-
exports.Oxylabs =
|
|
15
|
-
const
|
|
16
|
-
exports.OXYLABS_BROWSER_TYPES =
|
|
17
|
-
exports.OXYLABS_COMMON_LOCALES =
|
|
18
|
-
exports.OXYLABS_COMMON_GEO_LOCATIONS =
|
|
19
|
-
exports.OXYLABS_US_STATES =
|
|
20
|
-
exports.OXYLABS_EUROPEAN_COUNTRIES =
|
|
21
|
-
exports.OXYLABS_ASIAN_COUNTRIES =
|
|
22
|
-
exports.getRandomOxylabsBrowserType =
|
|
23
|
-
exports.getRandomOxylabsLocale =
|
|
24
|
-
exports.getRandomOxylabsGeoLocation =
|
|
25
|
-
const
|
|
26
|
-
exports.isRestrictedDomain =
|
|
27
|
-
const
|
|
28
|
-
exports.Decodo =
|
|
29
|
-
const
|
|
30
|
-
exports.DECODO_DEVICE_TYPES =
|
|
31
|
-
exports.DECODO_HEADLESS_MODES =
|
|
32
|
-
exports.DECODO_COMMON_LOCALES =
|
|
33
|
-
exports.DECODO_COMMON_COUNTRIES =
|
|
34
|
-
exports.DECODO_EUROPEAN_COUNTRIES =
|
|
35
|
-
exports.DECODO_ASIAN_COUNTRIES =
|
|
36
|
-
exports.DECODO_US_STATES =
|
|
37
|
-
exports.DECODO_COMMON_CITIES =
|
|
38
|
-
exports.getRandomDecodoDeviceType =
|
|
39
|
-
exports.getRandomDecodoLocale =
|
|
40
|
-
exports.getRandomDecodoCountry =
|
|
41
|
-
exports.getRandomDecodoCity =
|
|
42
|
-
exports.generateDecodoSessionId =
|
|
1
|
+
const _mod_0d9h80 = require('./crawler.cjs');
|
|
2
|
+
exports.Crawler = _mod_0d9h80.Crawler;;
|
|
3
|
+
const _mod_flsl96 = require('./crawler-options.cjs');
|
|
4
|
+
exports.CrawlerOptions = _mod_flsl96.CrawlerOptions;;
|
|
5
|
+
const _mod_hshb8k = require('./plugin/robots-txt.cjs');
|
|
6
|
+
exports.RobotsTxt = _mod_hshb8k.RobotsTxt;;
|
|
7
|
+
const _mod_qmo1zd = require('./plugin/file-cacher.cjs');
|
|
8
|
+
exports.FileCacher = _mod_qmo1zd.FileCacher;;
|
|
9
|
+
const _mod_lktnzz = require('./plugin/url-store.cjs');
|
|
10
|
+
exports.UrlStore = _mod_lktnzz.UrlStore;;
|
|
11
|
+
const _mod_g62vuv = require('./plugin/navigation-history.cjs');
|
|
12
|
+
exports.NavigationHistory = _mod_g62vuv.NavigationHistory;;
|
|
13
|
+
const _mod_dfvupb = require('./addon/oxylabs/index.cjs');
|
|
14
|
+
exports.Oxylabs = _mod_dfvupb.Oxylabs;;
|
|
15
|
+
const _mod_lffstj = require('./addon/oxylabs/options.cjs');
|
|
16
|
+
exports.OXYLABS_BROWSER_TYPES = _mod_lffstj.OXYLABS_BROWSER_TYPES;
|
|
17
|
+
exports.OXYLABS_COMMON_LOCALES = _mod_lffstj.OXYLABS_COMMON_LOCALES;
|
|
18
|
+
exports.OXYLABS_COMMON_GEO_LOCATIONS = _mod_lffstj.OXYLABS_COMMON_GEO_LOCATIONS;
|
|
19
|
+
exports.OXYLABS_US_STATES = _mod_lffstj.OXYLABS_US_STATES;
|
|
20
|
+
exports.OXYLABS_EUROPEAN_COUNTRIES = _mod_lffstj.OXYLABS_EUROPEAN_COUNTRIES;
|
|
21
|
+
exports.OXYLABS_ASIAN_COUNTRIES = _mod_lffstj.OXYLABS_ASIAN_COUNTRIES;
|
|
22
|
+
exports.getRandomOxylabsBrowserType = _mod_lffstj.getRandomBrowserType;
|
|
23
|
+
exports.getRandomOxylabsLocale = _mod_lffstj.getRandomLocale;
|
|
24
|
+
exports.getRandomOxylabsGeoLocation = _mod_lffstj.getRandomGeoLocation;;
|
|
25
|
+
const _mod_6v3fut = require('./scraper.cjs');
|
|
26
|
+
exports.isRestrictedDomain = _mod_6v3fut.isRestrictedDomain;;
|
|
27
|
+
const _mod_jcv8ev = require('./addon/decodo/index.cjs');
|
|
28
|
+
exports.Decodo = _mod_jcv8ev.Decodo;;
|
|
29
|
+
const _mod_quo7cv = require('./addon/decodo/options.cjs');
|
|
30
|
+
exports.DECODO_DEVICE_TYPES = _mod_quo7cv.DECODO_DEVICE_TYPES;
|
|
31
|
+
exports.DECODO_HEADLESS_MODES = _mod_quo7cv.DECODO_HEADLESS_MODES;
|
|
32
|
+
exports.DECODO_COMMON_LOCALES = _mod_quo7cv.DECODO_COMMON_LOCALES;
|
|
33
|
+
exports.DECODO_COMMON_COUNTRIES = _mod_quo7cv.DECODO_COMMON_COUNTRIES;
|
|
34
|
+
exports.DECODO_EUROPEAN_COUNTRIES = _mod_quo7cv.DECODO_EUROPEAN_COUNTRIES;
|
|
35
|
+
exports.DECODO_ASIAN_COUNTRIES = _mod_quo7cv.DECODO_ASIAN_COUNTRIES;
|
|
36
|
+
exports.DECODO_US_STATES = _mod_quo7cv.DECODO_US_STATES;
|
|
37
|
+
exports.DECODO_COMMON_CITIES = _mod_quo7cv.DECODO_COMMON_CITIES;
|
|
38
|
+
exports.getRandomDecodoDeviceType = _mod_quo7cv.getRandomDeviceType;
|
|
39
|
+
exports.getRandomDecodoLocale = _mod_quo7cv.getRandomLocale;
|
|
40
|
+
exports.getRandomDecodoCountry = _mod_quo7cv.getRandomCountry;
|
|
41
|
+
exports.getRandomDecodoCity = _mod_quo7cv.getRandomCity;
|
|
42
|
+
exports.generateDecodoSessionId = _mod_quo7cv.generateSessionId;;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var e=require("./file-cacher.cjs");exports.FileCacher=e.FileCacher;var r=require("./url-store.cjs");exports.UrlStore=r.UrlStore;var o=require("./result-stream.cjs");exports.ResultStream=o.ResultStream;var t=require("./memory-monitor.cjs");exports.MemoryMonitor=t.MemoryMonitor;var a=require("./health-metrics.cjs");exports.HealthMetrics=a.HealthMetrics;var i=require("./capped-map.cjs");exports.CappedMap=i.CappedMap;var
|
|
1
|
+
// Barrel module: re-exports the crawler plugin classes from their individual
// CommonJS files. Modules are loaded in the same order as before.
exports.FileCacher = require("./file-cacher.cjs").FileCacher;
exports.UrlStore = require("./url-store.cjs").UrlStore;
exports.ResultStream = require("./result-stream.cjs").ResultStream;
exports.MemoryMonitor = require("./memory-monitor.cjs").MemoryMonitor;
exports.HealthMetrics = require("./health-metrics.cjs").HealthMetrics;
exports.CappedMap = require("./capped-map.cjs").CappedMap;
exports.CappedArray = require("./capped-array.cjs").CappedArray;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var
|
|
1
|
+
/**
 * SQLite helpers shared by the crawler plugins (CommonJS build).
 *
 * Exports:
 *   - isBun           true when `globalThis.Bun` is defined.
 *   - withRetrySync   run a synchronous operation, retrying on "busy/locked" errors.
 *   - withRetryAsync  same, for operations that return a promise.
 *   - createDatabase  open a database through `bun:sqlite` or `node:sqlite`
 *                     and wrap it in a small retrying facade.
 */

const isBun = typeof globalThis.Bun !== "undefined";
exports.isBun = isBun;

/** Retry settings used for any option the caller leaves unspecified. */
const DEFAULT_RETRY_OPTIONS = { maxRetries: 5, initialDelayMs: 50, maxDelayMs: 1000 };

/**
 * Merges caller overrides over the defaults.
 *
 * `undefined`, `null` and NaN overrides are ignored instead of replacing the
 * default: a plain spread would let `{ maxRetries: undefined }` disable the
 * retry loop entirely (the operation was never invoked and `undefined` was
 * thrown), and an undefined delay produced a NaN timeout.
 *
 * @param {object} [overrides] Partial retry options.
 * @returns {{maxRetries:number, initialDelayMs:number, maxDelayMs:number}}
 */
function resolveRetryOptions(overrides) {
  const resolved = { ...DEFAULT_RETRY_OPTIONS };
  for (const key of Object.keys(DEFAULT_RETRY_OPTIONS)) {
    const raw = overrides?.[key];
    if (raw === undefined || raw === null) continue;
    const value = Number(raw);
    if (!Number.isNaN(value)) resolved[key] = value;
  }
  // A negative budget still means "try once, never retry".
  resolved.maxRetries = Math.max(0, resolved.maxRetries);
  return resolved;
}

/**
 * Tells whether an error message looks like a SQLite busy/locked condition.
 *
 * @param {*} error Value caught from a database call.
 * @returns {boolean}
 */
function isBusyError(error) {
  if (!error) return false;
  // String() guards against a non-string `message` property.
  const text = String(error.message || error.toString());
  return (
    text.includes("SQLITE_BUSY") ||
    text.includes("database is locked") ||
    text.includes("database table is locked")
  );
}

/** Exponential backoff with up to 50 ms of jitter, capped at `maxDelayMs`. */
function backoffDelay(settings, attempt) {
  return Math.min(
    settings.initialDelayMs * Math.pow(2, attempt) + Math.random() * 50,
    settings.maxDelayMs
  );
}

/** Promise-based sleep used by the async retry loop. */
function sleepAsync(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Lazily created cell that Atomics.wait() blocks on; its value never changes.
let waitCell = null;

/**
 * Blocks the current thread for roughly `ms` milliseconds.
 * Falls back to a busy loop when Atomics.wait() throws.
 */
function sleepSync(ms) {
  // `!(ms > 0)` also rejects NaN, which Atomics.wait() would treat as "forever".
  if (!(ms > 0)) return;
  try {
    waitCell ??= new Int32Array(new SharedArrayBuffer(4));
    Atomics.wait(waitCell, 0, 0, ms);
  } catch {
    const startedAt = Date.now();
    while (Date.now() - startedAt < ms);
  }
}

/**
 * Runs `operation`, retrying while it throws a busy/locked error.
 *
 * @param {() => T} operation Synchronous callback to execute.
 * @param {object} [options] Partial retry options (see DEFAULT_RETRY_OPTIONS).
 * @returns {T} Whatever `operation` returns.
 * @throws The original error when it is not a busy error or retries run out.
 * @template T
 */
function withRetrySync(operation, options = {}) {
  const settings = resolveRetryOptions(options);
  for (let attempt = 0; ; attempt++) {
    try {
      return operation();
    } catch (error) {
      if (!isBusyError(error) || attempt >= settings.maxRetries) throw error;
      sleepSync(backoffDelay(settings, attempt));
    }
  }
}

/**
 * Async counterpart of withRetrySync: awaits `operation` and sleeps without
 * blocking the thread between attempts.
 *
 * @param {() => Promise<T>|T} operation Callback to execute.
 * @param {object} [options] Partial retry options (see DEFAULT_RETRY_OPTIONS).
 * @returns {Promise<T>}
 * @template T
 */
async function withRetryAsync(operation, options = {}) {
  const settings = resolveRetryOptions(options);
  for (let attempt = 0; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      if (!isBusyError(error) || attempt >= settings.maxRetries) throw error;
      await sleepAsync(backoffDelay(settings, attempt));
    }
  }
}

/**
 * Builds the retrying facade shared by both runtimes.
 * Prepared statements are cached per SQL string until `close()`.
 */
function wrapDatabase(driver, retryOptions) {
  const statements = new Map();
  const statementFor = (sql) => {
    let statement = statements.get(sql);
    if (!statement) {
      statement = driver.prepare(sql);
      statements.set(sql, statement);
    }
    return statement;
  };
  return {
    run: (sql, ...params) =>
      withRetrySync(() => {
        if (params.length === 0) driver.runBare(sql);
        else statementFor(sql).run(...params);
      }, retryOptions),
    get: (sql, ...params) => withRetrySync(() => statementFor(sql).get(...params), retryOptions),
    all: (sql, ...params) => withRetrySync(() => statementFor(sql).all(...params), retryOptions),
    exec: (sql) => withRetrySync(() => driver.exec(sql), retryOptions),
    close: () => {
      statements.clear();
      driver.close();
    },
  };
}

/**
 * Opens the SQLite database at `path` with the runtime's built-in driver.
 *
 * @param {string} path Database location handed to the driver constructor.
 * @param {object} [options] Retry options applied to every call on the facade.
 * @returns {Promise<{run:Function, get:Function, all:Function, exec:Function, close:Function}>}
 */
async function createDatabase(path, options = {}) {
  if (isBun) {
    const { Database } = await import("bun:sqlite");
    const db = new Database(path);
    return wrapDatabase(
      {
        prepare: (sql) => db.query(sql),
        runBare: (sql) => db.run(sql),
        exec: (sql) => db.exec(sql),
        close: () => db.close(),
      },
      options
    );
  }
  const { DatabaseSync } = await import("node:sqlite");
  const db = new DatabaseSync(path);
  return wrapDatabase(
    {
      prepare: (sql) => db.prepare(sql),
      // Parameterless statements go through exec() on the Node driver.
      runBare: (sql) => db.exec(sql),
      exec: (sql) => db.exec(sql),
      close: () => db.close(),
    },
    options
  );
}

exports.withRetrySync = withRetrySync;
exports.withRetryAsync = withRetryAsync;
exports.createDatabase = createDatabase;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
import{createRequire as d}from"node:module";var
|
|
1
|
+
/**
 * SQLite helpers shared by the crawler plugins (ES module build).
 *
 * Exports:
 *   - isBun           true when `globalThis.Bun` is defined.
 *   - withRetrySync   run a synchronous operation, retrying on "busy/locked" errors.
 *   - withRetryAsync  same, for operations that return a promise.
 *   - createDatabase  open a database through `bun:sqlite` or `node:sqlite`
 *                     and wrap it in a small retrying facade.
 */
import { createRequire } from "node:module";

// CommonJS-style require shim emitted with the bundle; not referenced by the
// code below, kept so the module's load-time behavior is unchanged.
const bundlerRequire = createRequire(import.meta.url);

const isBun = typeof globalThis.Bun !== "undefined";

/** Retry settings used for any option the caller leaves unspecified. */
const DEFAULT_RETRY_OPTIONS = { maxRetries: 5, initialDelayMs: 50, maxDelayMs: 1000 };

/**
 * Merges caller overrides over the defaults.
 *
 * `undefined`, `null` and NaN overrides are ignored instead of replacing the
 * default: a plain spread would let `{ maxRetries: undefined }` disable the
 * retry loop entirely (the operation was never invoked and `undefined` was
 * thrown), and an undefined delay produced a NaN timeout.
 *
 * @param {object} [overrides] Partial retry options.
 * @returns {{maxRetries:number, initialDelayMs:number, maxDelayMs:number}}
 */
function resolveRetryOptions(overrides) {
  const resolved = { ...DEFAULT_RETRY_OPTIONS };
  for (const key of Object.keys(DEFAULT_RETRY_OPTIONS)) {
    const raw = overrides?.[key];
    if (raw === undefined || raw === null) continue;
    const value = Number(raw);
    if (!Number.isNaN(value)) resolved[key] = value;
  }
  // A negative budget still means "try once, never retry".
  resolved.maxRetries = Math.max(0, resolved.maxRetries);
  return resolved;
}

/**
 * Tells whether an error message looks like a SQLite busy/locked condition.
 *
 * @param {*} error Value caught from a database call.
 * @returns {boolean}
 */
function isBusyError(error) {
  if (!error) return false;
  // String() guards against a non-string `message` property.
  const text = String(error.message || error.toString());
  return (
    text.includes("SQLITE_BUSY") ||
    text.includes("database is locked") ||
    text.includes("database table is locked")
  );
}

/** Exponential backoff with up to 50 ms of jitter, capped at `maxDelayMs`. */
function backoffDelay(settings, attempt) {
  return Math.min(
    settings.initialDelayMs * Math.pow(2, attempt) + Math.random() * 50,
    settings.maxDelayMs
  );
}

/** Promise-based sleep used by the async retry loop. */
function sleepAsync(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Lazily created cell that Atomics.wait() blocks on; its value never changes.
let waitCell = null;

/**
 * Blocks the current thread for roughly `ms` milliseconds.
 * Falls back to a busy loop when Atomics.wait() throws.
 */
function sleepSync(ms) {
  // `!(ms > 0)` also rejects NaN, which Atomics.wait() would treat as "forever".
  if (!(ms > 0)) return;
  try {
    waitCell ??= new Int32Array(new SharedArrayBuffer(4));
    Atomics.wait(waitCell, 0, 0, ms);
  } catch {
    const startedAt = Date.now();
    while (Date.now() - startedAt < ms);
  }
}

/**
 * Runs `operation`, retrying while it throws a busy/locked error.
 *
 * @param {() => T} operation Synchronous callback to execute.
 * @param {object} [options] Partial retry options (see DEFAULT_RETRY_OPTIONS).
 * @returns {T} Whatever `operation` returns.
 * @throws The original error when it is not a busy error or retries run out.
 * @template T
 */
function withRetrySync(operation, options = {}) {
  const settings = resolveRetryOptions(options);
  for (let attempt = 0; ; attempt++) {
    try {
      return operation();
    } catch (error) {
      if (!isBusyError(error) || attempt >= settings.maxRetries) throw error;
      sleepSync(backoffDelay(settings, attempt));
    }
  }
}

/**
 * Async counterpart of withRetrySync: awaits `operation` and sleeps without
 * blocking the thread between attempts.
 *
 * @param {() => Promise<T>|T} operation Callback to execute.
 * @param {object} [options] Partial retry options (see DEFAULT_RETRY_OPTIONS).
 * @returns {Promise<T>}
 * @template T
 */
async function withRetryAsync(operation, options = {}) {
  const settings = resolveRetryOptions(options);
  for (let attempt = 0; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      if (!isBusyError(error) || attempt >= settings.maxRetries) throw error;
      await sleepAsync(backoffDelay(settings, attempt));
    }
  }
}

/**
 * Builds the retrying facade shared by both runtimes.
 * Prepared statements are cached per SQL string until `close()`.
 */
function wrapDatabase(driver, retryOptions) {
  const statements = new Map();
  const statementFor = (sql) => {
    let statement = statements.get(sql);
    if (!statement) {
      statement = driver.prepare(sql);
      statements.set(sql, statement);
    }
    return statement;
  };
  return {
    run: (sql, ...params) =>
      withRetrySync(() => {
        if (params.length === 0) driver.runBare(sql);
        else statementFor(sql).run(...params);
      }, retryOptions),
    get: (sql, ...params) => withRetrySync(() => statementFor(sql).get(...params), retryOptions),
    all: (sql, ...params) => withRetrySync(() => statementFor(sql).all(...params), retryOptions),
    exec: (sql) => withRetrySync(() => driver.exec(sql), retryOptions),
    close: () => {
      statements.clear();
      driver.close();
    },
  };
}

/**
 * Opens the SQLite database at `path` with the runtime's built-in driver.
 *
 * @param {string} path Database location handed to the driver constructor.
 * @param {object} [options] Retry options applied to every call on the facade.
 * @returns {Promise<{run:Function, get:Function, all:Function, exec:Function, close:Function}>}
 */
async function createDatabase(path, options = {}) {
  if (isBun) {
    const { Database } = await import("bun:sqlite");
    const db = new Database(path);
    return wrapDatabase(
      {
        prepare: (sql) => db.query(sql),
        runBare: (sql) => db.run(sql),
        exec: (sql) => db.exec(sql),
        close: () => db.close(),
      },
      options
    );
  }
  const { DatabaseSync } = await import("node:sqlite");
  const db = new DatabaseSync(path);
  return wrapDatabase(
    {
      prepare: (sql) => db.prepare(sql),
      // Parameterless statements go through exec() on the Node driver.
      runBare: (sql) => db.exec(sql),
      exec: (sql) => db.exec(sql),
      close: () => db.close(),
    },
    options
  );
}

export { withRetrySync, withRetryAsync, isBun, createDatabase };
|
package/dist/crawler.d.ts
CHANGED
|
@@ -9372,6 +9372,13 @@ export declare class Crawler {
|
|
|
9372
9372
|
* Called when onQueueChange handlers are registered to ensure limiter events are captured.
|
|
9373
9373
|
*/
|
|
9374
9374
|
private _subscribeToManagedQueues;
|
|
9375
|
+
/**
|
|
9376
|
+
* Invokes every registered error-event handler with the given error.
|
|
9377
|
+
* Swallows per-handler throws so one misbehaving handler can't mask others.
|
|
9378
|
+
* Used from the fire-and-forget execute/execute2 catch blocks so those
|
|
9379
|
+
* errors reach the same handler chain as in-crawl errors.
|
|
9380
|
+
*/
|
|
9381
|
+
private dispatchErrorEvents;
|
|
9375
9382
|
/**
|
|
9376
9383
|
* Run a handler with event tracking (not through queue).
|
|
9377
9384
|
* Increments eventCount before running, decrements after completion.
|
package/dist/dom/index.cjs
CHANGED
|
@@ -1,23 +1 @@
|
|
|
1
|
-
|
|
2
|
-
function getLinkedom() {
|
|
3
|
-
if (_linkedom)
|
|
4
|
-
return _linkedom;
|
|
5
|
-
try {
|
|
6
|
-
_linkedom = require("linkedom");
|
|
7
|
-
return _linkedom;
|
|
8
|
-
} catch {
|
|
9
|
-
throw new Error(`linkedom is required for DOM parsing but is not installed.
|
|
10
|
-
` + `Install it with: npm install linkedom
|
|
11
|
-
` + "Or: bun add linkedom");
|
|
12
|
-
}
|
|
13
|
-
}
|
|
14
|
-
function parseHTML(html) {
|
|
15
|
-
return getLinkedom().parseHTML(html);
|
|
16
|
-
}
|
|
17
|
-
function createDOMParser() {
|
|
18
|
-
const Linkedom = getLinkedom();
|
|
19
|
-
return new Linkedom.DOMParser;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
exports.parseHTML = parseHTML;
|
|
23
|
-
exports.createDOMParser = createDOMParser;
|
|
1
|
+
// Re-export linkedom's own enumerable members from this module.
// linkedom is loaded eagerly, so a missing package fails at require time.
const linkedom = require("linkedom");
Object.assign(exports, linkedom);
|