@isdk/web-fetcher 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/README.cn.md +10 -0
  2. package/README.engine.cn.md +16 -4
  3. package/README.engine.md +16 -4
  4. package/README.md +10 -0
  5. package/dist/index.d.mts +49 -6
  6. package/dist/index.d.ts +49 -6
  7. package/dist/index.js +1 -1
  8. package/dist/index.mjs +1 -1
  9. package/docs/README.md +10 -0
  10. package/docs/_media/README.cn.md +10 -0
  11. package/docs/_media/README.engine.md +16 -4
  12. package/docs/classes/CheerioFetchEngine.md +125 -61
  13. package/docs/classes/ClickAction.md +23 -23
  14. package/docs/classes/ExtractAction.md +23 -23
  15. package/docs/classes/FetchAction.md +23 -23
  16. package/docs/classes/FetchEngine.md +105 -61
  17. package/docs/classes/FetchSession.md +10 -10
  18. package/docs/classes/FillAction.md +23 -23
  19. package/docs/classes/GetContentAction.md +23 -23
  20. package/docs/classes/GotoAction.md +23 -23
  21. package/docs/classes/PauseAction.md +23 -23
  22. package/docs/classes/PlaywrightFetchEngine.md +125 -61
  23. package/docs/classes/SubmitAction.md +23 -23
  24. package/docs/classes/WaitForAction.md +23 -23
  25. package/docs/classes/WebFetcher.md +5 -5
  26. package/docs/enumerations/FetchActionResultStatus.md +4 -4
  27. package/docs/functions/fetchWeb.md +2 -2
  28. package/docs/globals.md +1 -0
  29. package/docs/interfaces/BaseFetchActionProperties.md +11 -11
  30. package/docs/interfaces/BaseFetchCollectorActionProperties.md +15 -15
  31. package/docs/interfaces/BaseFetcherProperties.md +58 -32
  32. package/docs/interfaces/DispatchedEngineAction.md +4 -4
  33. package/docs/interfaces/ExtractActionProperties.md +11 -11
  34. package/docs/interfaces/FetchActionInContext.md +15 -15
  35. package/docs/interfaces/FetchActionProperties.md +12 -12
  36. package/docs/interfaces/FetchActionResult.md +6 -6
  37. package/docs/interfaces/FetchContext.md +80 -46
  38. package/docs/interfaces/FetchEngineContext.md +75 -41
  39. package/docs/interfaces/FetchMetadata.md +5 -5
  40. package/docs/interfaces/FetchResponse.md +14 -14
  41. package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
  42. package/docs/interfaces/FetchSite.md +73 -39
  43. package/docs/interfaces/FetcherOptions.md +72 -38
  44. package/docs/interfaces/GotoActionOptions.md +6 -6
  45. package/docs/interfaces/PendingEngineRequest.md +3 -3
  46. package/docs/interfaces/StorageOptions.md +61 -0
  47. package/docs/interfaces/SubmitActionOptions.md +2 -2
  48. package/docs/interfaces/WaitForActionOptions.md +5 -5
  49. package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
  50. package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
  51. package/docs/type-aliases/BrowserEngine.md +1 -1
  52. package/docs/type-aliases/FetchActionCapabilities.md +1 -1
  53. package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
  54. package/docs/type-aliases/FetchActionOptions.md +1 -1
  55. package/docs/type-aliases/FetchEngineAction.md +1 -1
  56. package/docs/type-aliases/FetchEngineType.md +1 -1
  57. package/docs/type-aliases/FetchReturnType.md +1 -1
  58. package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
  59. package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
  60. package/docs/type-aliases/ResourceType.md +1 -1
  61. package/docs/variables/DefaultFetcherProperties.md +1 -1
  62. package/docs/variables/FetcherOptionKeys.md +1 -1
  63. package/package.json +1 -1
package/dist/index.mjs CHANGED
@@ -1 +1 @@
1
- var t={engine:"auto",enableSmart:!0,useSiteRegistry:!0,antibot:!1,headers:{},cookies:[],reuseCookies:!0,throwHttpErrors:void 0,proxy:[],blockResources:[],ignoreSslErrors:!0,browser:{engine:"playwright",headless:!0,waitUntil:"domcontentloaded"},http:{method:"GET"},timeoutMs:6e4,requestHandlerTimeoutSecs:void 0,maxConcurrency:1,maxRequestsPerMinute:1e3,delayBetweenRequestsMs:0,retries:0,sites:[]},e=Object.keys(t).concat(["actions","onPause"]),i=(t=>(t[t.Failed=0]="Failed",t[t.Success=1]="Success",t[t.Skipped=2]="Skipped",t))(i||{}),s=class t{static register(t){const e=t.id;if(!e)throw new Error("FetchAction.register: actionClass.id is required");this.registry.set(e,t)}static get(t){return this.registry.get(t)}static create(e){const i="string"==typeof e?e:e.id||e.name||e.action;if(!i)throw new Error("Action must have id, name or action");const s=i instanceof t?i.constructor:this.registry.get(i);return s?new s:void 0}static has(t){return this.registry.has(t)}static list(){return Array.from(this.registry.keys())}static getCapability(t){return this.capabilities[t]??"noop"}getCapability(t){return this.constructor.getCapability(t)}get id(){return this.constructor.id}get returnType(){return this.constructor.returnType}get capabilities(){return this.constructor.capabilities}async delegateToEngine(t,e,...i){const s=t.internal.engine;if(!s)throw new Error("No engine available");if("function"!=typeof s[e])throw new Error(`Engine does not have a method named '${String(e)}'`);return await s[e](...i)}installCollectors(e,i){const s=i?.collectors;if(!s?.length)return;const r=[],c=new Set;for(const i of s){const s=n(i.activateOn),l=n(i.collectOn),h=n(i.deactivateOn),u=!(i.background??!0),w=t.create(i);if(!w)continue;let f=!1,d=!1,p=0;const y=async t=>{if(!f&&!d){f=!0;try{await(w.onBeforeExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,phase:"before",error:t})}}},m=async(t,s)=>{if(!d){f||await y(s);try{const 
r=Promise.resolve(w.onExecute?.(e,i,s)).then(s=>{var r,n;if(i.storeAs){((r=e.outputs)[n=i.storeAs]||(r[n]=[])).push(s)}return e.eventBus.emit("collector:result",{action:this.id,collector:i.id||i.name,event:t,result:s}),s}).catch(s=>{e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,event:t,phase:"exec",error:s})}).finally(()=>{p++});u&&(c.add(r),r.finally(()=>c.delete(r)))}catch(i){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,event:t,phase:"exec",error:i})}}},g=async()=>{if(!d){0===p&&m("collector:after"),d=!0;try{await(w.onAfterExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,phase:"after",error:t})}finally{e.eventBus.emit("collector:end",{action:this.id,collector:i.id||i.name}),x.forEach(t=>t())}}},v=o(e,s,y),x=a(e,l,m),b=o(e,h,g);if(r.push(...v,...x,...b),!s.length&&!l.length&&!h.length){const t=()=>{g()};e.eventBus.once(`action:${this.id}.end`,t),r.push(()=>e.eventBus.off("fetcher:action:end",t))}}return r.length||c.size>0?{cleanup:()=>r.forEach(t=>t()),awaitExecPendings:async()=>{c.size>0&&await Promise.allSettled(Array.from(c))}}:void 0}async beforeExec(t,e){t.internal.actionStack||(t.internal.actionStack=[]);const i=t.internal.actionStack,s=i.length,r=i.length>0?i[i.length-1].id:void 0,n={...e,id:this.id,depth:s,parent:r};i.push(n),t.currentAction=n;const o={action:this,context:t,options:e,depth:s,stack:[...i]};t.eventBus.emit(`action:${this.id}.start`,o),t.eventBus.emit("action:start",o),await(this.onBeforeExec?.(t,e));return{entry:o,collectors:this.installCollectors(t,e)}}async afterExec(t,e,i,s){const r=t.internal.actionStack,n=r.length-1,o=s?.collectors;try{await(o?.awaitExecPendings()),t.lastResult=i,"response"!==i?.returnType||i.error||(t.lastResponse=i.result),e?.storeAs&&(t.outputs[e.storeAs]=i?.result),i?.error&&(t.currentAction.error=i.error),await(this.onAfterExec?.(t,e));const 
s={action:this,context:t,options:e,result:i,depth:n,stack:[...r]};i?.error&&(s.error=i.error);try{t.eventBus.emit(`action:${this.id}.end`,s)}catch(t){}try{t.eventBus.emit("action:end",s)}catch(t){}}finally{try{o?.cleanup()}finally{r.pop();const e=r.length;t.currentAction=e>0?r[e-1]:void 0}}}async execute(t,e){e?.args&&!e.params&&(e.params=e.args);const i=await this.beforeExec(t,e),s=e?.failOnError??!0;let r;try{return t.throwHttpErrors=s,r=await this.onExecute(t,e),r&&r.returnType||(r={status:1,returnType:this.returnType??"any",result:r}),r}catch(e){if(r={status:0,error:e,meta:{id:this.id,engineType:t.engine,capability:this.getCapability(t.engine)}},s)throw e;return r}finally{await this.afterExec(t,e,r,i)}}};s.registry=new Map,s.returnType="any",s.capabilities={http:"noop",browser:"noop"};var r=s;function n(t){return t?Array.isArray(t)?t:[t]:[]}function o(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=(...t)=>{i(t[0])};t.eventBus.once(r,e),s.push(()=>t.eventBus.off(r,e))}return s}function a(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=t=>i(r,t);t.eventBus.on(r,e),s.push(()=>t.eventBus.off(r,e))}return s}import{EventEmitter as c}from"events-ex";import{defaultsDeep as l}from"lodash-es";import{customAlphabet as h}from"nanoid";var u=h("0123456789abcdefghijklmnopqrstuvwxyz",12);import{defaultsDeep as w,merge as f}from"lodash-es";import{EventEmitter as d}from"events-ex";import{CommonError as p}from"@isdk/common-error";import{Configuration as y,KeyValueStore as m,PERSIST_STATE_KEY as g,RequestQueue as v}from"crawlee";function x(){let t=()=>{};const e=new Promise(e=>{t=e});return e.release=t,e}y.getGlobalConfig().set("persistStorage",!1);var b=class{constructor(){this.hdrs={},this._initializedSessions=new Set,this.pendingRequests=new Map,this.requestCounter=0,this.actionEmitter=new d,this.isPageActive=!1,this.navigationLock=function(){const t=x();return t.release(),t}(),this.blockedTypes=new 
Set}static register(t){const e=t.id;if(!e)throw new Error("Engine must define static id");if(this.registry.has(e))throw new Error(`Engine id duplicated: ${e}`);this.registry.set(e,t)}static get(t){return this.registry.get(t)}static getByMode(t){for(const[e,i]of this.registry.entries())if(i.mode===t)return i}static async create(e,i){const s=w(i,e,t),r=s.engine??e.engine,n=r?this.get(r)??this.getByMode(r):null;if(n){const t=new n;return await t.initialize(e,s),t}}async _extract(t,e){const i=t.type;if(!e)return"array"===i?[]:null;if("object"===i){const{selector:i,properties:s}=t;let r=e;if(i){const t=await this._querySelectorAll(e,i);r=t.length>0?t[0]:null}if(!r)return null;const n={};for(const t in s)n[t]=await this._extract(s[t],r);return n}if("array"===i){const{selector:i,items:s}=t,r=i?await this._querySelectorAll(e,i):[e],n=[];for(const t of r)n.push(await this._extract(s,t));return n}const{selector:s}=t;let r=e;if(s){const t=await this._querySelectorAll(e,s);r=t.length>0?t[0]:null}return r?this._extractValue(t,r):null}async buildResponse(t){const e=await this._buildResponse(t),i=e.headers["content-type"]||"";return e.contentType=i.split(";")[0].trim(),!e.cookies&&t.session&&(e.cookies=t.session.getCookies(t.request.url)),this.crawler?.sessionPool&&(e.sessionState=await this.crawler.sessionPool.getState()),e}waitFor(t){return this.dispatchAction({type:"waitFor",options:t})}click(t){return this.dispatchAction({type:"click",selector:t})}fill(t,e){return this.dispatchAction({type:"fill",selector:t,value:e})}submit(t,e){return this.dispatchAction({type:"submit",selector:t,options:e})}pause(t){return this.dispatchAction({type:"pause",message:t})}extract(t){const e=this._normalizeSchema(t);return this.dispatchAction({type:"extract",schema:e})}_normalizeSchema(t){const e=JSON.parse(JSON.stringify(t));if(e.properties)for(const t in 
e.properties)e.properties[t]=this._normalizeSchema(e.properties[t]);if(e.items&&(e.items=this._normalizeSchema(e.items)),"array"===e.type&&(e.attribute&&!e.items&&(e.items={attribute:e.attribute},delete e.attribute),e.items||(e.items={type:"string"})),e.selector&&(e.has||e.exclude)){const{selector:t,has:i,exclude:s}=e,r=t.split(",").map(t=>{let e=t.trim();return i&&(e=`${e}:has(${i})`),s&&(e=`${e}:not(${s})`),e}).join(", ");e.selector=r,delete e.has,delete e.exclude}return e}get id(){return this.constructor.id}async getState(){return{cookies:await this.cookies(),sessionState:await(this.crawler?.sessionPool?.getState())}}get mode(){return this.constructor.mode}get context(){return this.ctx}async initialize(t,e){if(this.ctx)return;f(t,e),this.ctx=t,this.opts=t,this.hdrs=function(t){const e={};if(t&&"object"==typeof t)for(const[i,s]of Object.entries(t))e[i.toLowerCase()]=s;return e}(t.headers),this._initialCookies=[...t.cookies??[]],t.internal||(t.internal={}),t.internal.engine=this,t.engine=this.mode,this.actionEmitter.setMaxListeners(100),this.requestQueue=await v.open();const i=await this._getSpecificCrawlerOptions(t),s=w({persistenceOptions:{enable:!0}},t.sessionPoolOptions,{maxPoolSize:1,sessionOptions:{maxUsageCount:1e3,maxErrorScore:3}});t.sessionState&&t.cookies&&t.cookies.length>0&&console.warn('[FetchEngine] Warning: Both "sessionState" and "cookies" are provided. 
Explicit "cookies" will override any conflicting cookies restored from "sessionState".');const r={...w(i,{requestQueue:this.requestQueue,maxConcurrency:1,minConcurrency:1,useSessionPool:!0,persistCookiesPerSession:!0,sessionPoolOptions:s}),requestHandler:this._requestHandler.bind(this),errorHandler:this._failedRequestHandler.bind(this),failedRequestHandler:this._failedRequestHandler.bind(this)};r.preNavigationHooks||(r.preNavigationHooks=[]),r.preNavigationHooks.unshift(({crawler:t,session:e,request:i},s)=>{if(this.currentSession=e,e&&!this._initializedSessions.has(e.id)){if(this._initialCookies&&this._initialCookies.length>0){const t=this._initialCookies.map(t=>{const e={...t};return"no_restriction"===e.sameSite&&(e.sameSite="None"),e});e.setCookies(t,i.url)}this._initializedSessions.add(e.id)}});const n=this.crawler=this._createCrawler(r),o=await m.open(null,{config:n.config}),a=await o.getValue(g);!t.sessionState||a&&!t.overrideSessionState||await o.setValue(g,t.sessionState),this.crawler.run().then(()=>{this.isCrawlerReady=!0}).catch(t=>{this.isCrawlerReady=!1,console.error("Crawler background error:",t)})}async cleanup(){await(this._cleanup?.()),await this._commonCleanup();const t=this.ctx;t&&t.internal?.engine===this&&(t.internal.engine=void 0),this.ctx=void 0,this.opts=void 0}async _executePendingActions(t){await new Promise(e=>{const i=async({action:e,resolve:i,reject:s})=>{try{if("dispose"===e.type)return this.actionEmitter.emit("dispose"),void i();i(await this.executeAction(t,e))}catch(t){s(t)}};this.actionEmitter.on("dispatch",i),this.actionEmitter.once("dispose",()=>{this.actionEmitter.removeListener("dispatch",i),e()})})}async _sharedRequestHandler(t){const{request:e}=t;try{this.currentSession=t.session,this.isPageActive=!0;const i=this.pendingRequests.get(e.userData.requestId);if(i){const s=await this.buildResponse(t),r=!s.statusCode||s.statusCode>=400;if(this.ctx?.throwHttpErrors&&r){const t=new p(`Request for ${s.finalUrl} failed with status 
${s.statusCode||"N/A"}`,"request",s.statusCode);i.reject(t)}else this.lastResponse=s,i.resolve(s);this.pendingRequests.delete(e.userData.requestId)}await this._executePendingActions(t)}finally{if(this.currentSession){const t=this.currentSession.getCookies(e.url);t&&(this._initialCookies=t)}this.isPageActive=!1,this.navigationLock.release()}}async _sharedFailedRequestHandler(t,e){const{request:i}=t,s=this.pendingRequests.get(i.userData.requestId);if(s&&e&&this.ctx?.throwHttpErrors){this.pendingRequests.delete(i.userData.requestId);const t=e.response,r=t?.statusCode||500,n=t?.url?t.url:i.url,o=new p(`Request${n?" for "+n:""} failed: ${e.message}`,"request",r);s.reject(o)}return this._sharedRequestHandler(t)}async dispatchAction(t){if(!this.isPageActive)throw new Error("No active page. Call goto() before performing actions.");return new Promise((e,i)=>{this.actionEmitter.emit("dispatch",{action:t,resolve:e,reject:i})})}async _requestHandler(t){await this._sharedRequestHandler(t)}async _failedRequestHandler(t,e){await this._sharedFailedRequestHandler(t,e)}async _commonCleanup(){if(this._initializedSessions.clear(),this.isPageActive&&await this.dispatchAction({type:"dispose"}).catch(()=>{}),this.pendingRequests.size>0)for(const[,t]of this.pendingRequests)t.reject(new Error("Cleanup:Request cancelled"));if(this.actionEmitter.removeAllListeners(),this.crawler){try{await(this.crawler.teardown?.())}catch(t){console.error("ccrawler teardown error:",t)}this.crawler=void 0}this.isCrawlerReady=void 0,this.requestQueue&&(await this.requestQueue.drop(),this.requestQueue=void 0),this.pendingRequests.clear()}async blockResources(t,e){return e&&this.blockedTypes.clear(),t.forEach(t=>this.blockedTypes.add(t)),t.length}getContent(){return this.lastResponse?Promise.resolve(this.lastResponse):Promise.reject(new Error("No content fetched yet. 
Call goto() first."))}async headers(t,e){if(void 0===t)return{...this.hdrs};if("string"==typeof t&&void 0===e)return this.hdrs[t.toLowerCase()]||"";if(null!==t&&"object"==typeof t){const i={};for(const[e,s]of Object.entries(t))i[e.toLowerCase()]=String(s);return this.hdrs=!0===e?i:{...this.hdrs,...i},!0}return"string"==typeof t&&("string"==typeof e?this.hdrs[t.toLowerCase()]=e:null===e&&delete this.hdrs[t.toLowerCase()],!0)}async cookies(t){const e=this.lastResponse?.url||"";if(Array.isArray(t))return this.currentSession?this.currentSession.setCookies(t,e):this._initialCookies=[...t],!0;if(null===t)return this.currentSession,this._initialCookies=[],!0;if(this.currentSession){return this.currentSession.getCookies(e)}return[...this._initialCookies||[]]}async dispose(){await this.cleanup()}};async function E(t,e){let i;if(e?.engine){if(i=await b.create(t,{engine:e.engine}),!i)throw new Error(`No engine available for ${e.engine}`);return i}const s=function(t,e){if(!t||!e?.length)return null;const i=new URL(t);let s=e.find(t=>t.domain===i.hostname);s||(s=e.find(t=>i.hostname.endsWith(t.domain)));if(!s)return null;if(s.pathScope?.length){if(!s.pathScope.some(t=>i.pathname.startsWith(t)))return null}return s}(e?.url||t.url,t.sites),r=t.engine||s?.engine||"auto";return i=await b.create(t,{engine:r}),i||(i=await b.create(t,{engine:"http"})),i}b.registry=new Map;var S=class{constructor(t={},e){this.options=t,this.engine=e,this.closed=!1,this.id=u(),this.context=this.createContext(t)}async execute(t){await this.ensureEngine(t);const e=r.create(t);if(!e)throw new Error(`Unknown action: ${t.id||t.name}`);let i,s;this.context.internal.actionIndex=(this.context.internal.actionIndex||0)+1,this.context.currentAction={...t,index:this.context.internal.actionIndex,startedAt:Date.now()};try{return i=await e.execute(this.context,t),i}catch(t){throw s=t,s}finally{this.context.currentAction=void 0}}async executeAll(t){let e=0;try{for(;e<t.length;){const i=t[e];await 
this.execute(i),e++}const i=await this.execute({id:"getContent"});return{result:i?.result,outputs:this.getOutputs()}}catch(t){throw t.actionIndex=e,t}}getOutputs(){return this.context.outputs}async getState(){return this.context.internal.engine?.getState()}async dispose(){if(this.closed)return;const t=this.context.eventBus;t.emit("session:closing",{sessionId:this.id});try{await(this.context.internal.engine?.dispose())}finally{this.closed=!0}t.emit("session:closed",{sessionId:this.id})}async ensureEngine(t){if(this.closed)throw new Error("Session is closed");if(!this.context.internal.engine){const e=t?.params?.url??this.context.url;if(!await E(this.context,{url:e,engine:this.engine}))throw new Error("No engine found")}}createContext(e=this.options){const i=new c;return l({...e,id:this.id,eventBus:i,outputs:{},internal:{},execute:async t=>this.execute(t),action:async function(t,e,i){return this.execute({name:t,params:e,...i})}},t)}},k=class{constructor(t={}){this.defaults=t}async createSession(t){const e={...this.defaults,...t||{}};return new S(e)}async fetch(t,e){"string"!=typeof t&&(t=(e=t).url);const i=await this.createSession(e);try{const s=e?.actions||[];t&&0!==s.findIndex(e=>"goto"===e.id&&e.params?.url===t)&&s.unshift({id:"goto",params:{url:t}});return await i.executeAll(s)}finally{await i.dispose()}}};import{CheerioCrawler as q,ProxyConfiguration as C}from"crawlee";import*as $ from"cheerio";import{CommonError as R,ErrorCode as P,NotFoundError as T}from"@isdk/common-error";var A=class extends b{_ensureCheerioContext(t){if(!t.$&&t.body){let e="string"==typeof t.body?t.body:Buffer.isBuffer(t.body)?t.body.toString("utf-8"):JSON.stringify(t.body);e.trim().startsWith("<")||(e=`<html><body><pre>${e}</pre></body></html>`),t.$=$.load(e)}}async _buildResponse(t){this._ensureCheerioContext(t);const{request:e,response:i,body:s,$:r}=t,n=r?.html();let o="string"==typeof s?s:Buffer.isBuffer(s)?s.toString("utf-8"):String(s??"");n&&n!==o&&(o=n);let 
a=i?.headers;if(!a&&i?.rawHeaders){a={};const t=i.rawHeaders;for(let e=0;e<t.length;e+=2)a[t[e].toLowerCase()]=t[e+1]}return{url:e.url,finalUrl:e.loadedUrl||e.url,statusCode:i?.statusCode??200,statusText:i?.statusMessage,headers:a||{},body:s,html:o,text:o}}async _querySelectorAll(t,e){const{$:i,el:s}=t;return s.find(e).toArray().map(t=>({$:i,el:i(t)}))}async _extractValue(t,e){const{el:i}=e,{attribute:s,type:r="string"}=t;if(0===i.length)return null;let n="";if(n=s?i.attr(s)??null:"html"===r?i.html():i.text().trim(),null===n)return null;switch(r){case"number":return parseFloat(n.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=n.toLowerCase();return"true"===t||"1"===t;default:return n}}async executeAction(t,e){const{$:i}=t;switch(e.type){case"dispose":return;case"extract":if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"extract");return this._extract(e.schema,{$:i,el:i.root()});case"click":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"click");const s=e.selector,r=i(s).first();let n;if(0===r.length)try{n=new URL(s,t.request.loadedUrl||t.request.url).href}catch{throw new R(`click: selector not found or invalid URL: ${s}`,"click")}else{if(!r.is("a")||!r.attr("href")){if(r.is('input[type="submit"], button[type="submit"], button, input')){const e=r.closest("form");if(e.length)return this.executeAction(t,{type:"submit",selector:e});throw new R("click: submit-like element without form","click")}throw new R(`click: unsupported element for http simulate. 
Selector: ${s}`,"click")}{const e=r.attr("href");n=new URL(e,t.request.loadedUrl||t.request.url).href}}const o=await t.sendRequest({url:n});return void await this._updateStateAfterNavigation(t,o)}case"fill":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`),"fill";const s=i(e.selector).first();if(0===s.length)throw new R(`fill: selector not found: ${e.selector}`);if(!s.is("input, textarea, select"))throw new R(`fill: not a form field: ${e.selector}`);return s.val(e.value),void(this.lastResponse=await this.buildResponse(t))}case"waitFor":return void(e.options?.ms&&await new Promise(t=>setTimeout(t,e.options.ms)));case"pause":const s=this.ctx?.onPause;return void(s?(console.info(e.message||"Execution paused for manual intervention."),await s({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."));case"submit":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"submit");const s="string"==typeof e.selector?i(e.selector).first():null!=e.selector?e.selector:i("form").first();if(0===s.length)throw new T(e.selector,"submit");const r=s.attr("action")||t.request.loadedUrl||t.request.url,n=(s.attr("method")||"GET").toUpperCase(),o=new URL(r,t.request.loadedUrl||t.request.url).href,a={};let c;if(s.find("input, select, textarea").each((t,e)=>{const s=i(e),r=s.attr("name");if(!r)return;const n=s.val();null!=n&&(a[r]=String(n))}),"GET"===n){const e=new URL(o);Object.entries(a).forEach(([t,i])=>e.searchParams.set(t,i)),c=await t.sendRequest({url:e.href,method:"GET"})}else{let i;const r={};"application/json"===(e.options?.enctype||s.attr("enctype")||"application/x-www-form-urlencoded")?(i=JSON.stringify(a),r["Content-Type"]="application/json"):(i=new URLSearchParams(a).toString(),r["Content-Type"]="application/x-www-form-urlencoded"),c=await t.sendRequest({url:o,method:"POST",body:i,headers:r})}return void await 
this._updateStateAfterNavigation(t,c)}case"getContent":return this.buildResponse(t);default:throw new R(`Unknown action type: ${e.type}`,"CheerioFetchEngine.executeAction",P.NotSupported)}}async _updateStateAfterNavigation(t,e){const i=e;t.response=i,t.body=i.body,t.$=void 0,i.url&&(t.request.loadedUrl=i.url),this.lastResponse=await this.buildResponse(t)}_createCrawler(t){return new q(t)}_getSpecificCrawlerOptions(t){const e=this.opts?.proxy?"string"==typeof this.opts.proxy?[this.opts.proxy]:this.opts.proxy:void 0,i=e?.length?new C({proxyUrls:e}):void 0;return{additionalMimeTypes:["text/plain"],maxRequestRetries:1,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,proxyConfiguration:i,preNavigationHooks:[({session:e,request:i},s)=>{s.throwHttpErrors=t.throwHttpErrors,this.opts?.timeoutMs&&(s.timeout={request:this.opts.timeoutMs})}]}}async goto(t,e){this.isPageActive&&this.dispatchAction({type:"dispose"}).catch(()=>{});const i="req-"+ ++this.requestCounter,s=new Promise((t,s)=>{const r=e?.timeoutMs||this.opts?.timeoutMs||3e4,n=setTimeout(()=>{this.pendingRequests.delete(i),this.navigationLock.release(),s(new R(`goto timed out after ${r}ms.`,"gotoTimeout",P.RequestTimeout))},r);this.pendingRequests.set(i,{resolve:e=>{clearTimeout(n),t(e)},reject:t=>{clearTimeout(n),s(t)}})});return this.requestQueue.addRequest({...e,url:t,headers:{...this.hdrs,...e?.headers},userData:{requestId:i},uniqueKey:`${t}-${i}`}).catch(t=>{const e=this.pendingRequests.get(i);e&&(this.pendingRequests.delete(i),this.navigationLock.release(),e.reject(t))}),await this.navigationLock,this.navigationLock=x(),s}};A.id="cheerio",A.mode="http",b.register(A);import{PlaywrightCrawler as _}from"crawlee";import{firefox as U}from"playwright";import{CommonError as O,ErrorCode as j,NotFoundError as N}from"@isdk/common-error";var F=class extends b{async 
_buildResponse(t){const{page:e,response:i,request:s,session:r}=t;if(!e||e.isClosed())return{url:s.url,finalUrl:s.loadedUrl||s.url,statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},body:"",html:"",text:""};const n=await e.content(),o=await e.textContent("body"),a=await e.context().cookies();return r&&r.setCookies(a,s.url),{url:e.url(),finalUrl:e.url(),statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},cookies:a,body:n,html:n,text:o||""}}async _querySelectorAll(t,e){return t.locator(e).all()}async _extractValue(t,e){const{attribute:i,type:s="string"}=t;if(0===await e.count())return null;let r="";if(r=i?await e.getAttribute(i):"html"===s?await e.innerHTML():await e.textContent(),null===r)return null;switch(r=r.trim(),s){case"number":return parseFloat(r.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=r.toLowerCase();return"true"===t||"1"===t;default:return r}}async executeAction(t,e){const{page:i}=t,s=this.opts?.timeoutMs||3e4;switch(e.type){case"navigate":{const s=await i.goto(e.url,{waitUntil:e.opts?.waitUntil||"domcontentloaded",timeout:this.opts?.timeoutMs||3e4});s&&(t={...t,response:s});const r=await this.buildResponse(t);return this.lastResponse=r,r}case"extract":{const s=await this._extract(e.schema,i.locator("body"));return this.lastResponse=await this.buildResponse(t),s}case"click":{await i.click(e.selector,{timeout:s}),await i.waitForLoadState("networkidle",{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r)}case"fill":await i.fill(e.selector,e.value,{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r);case"waitFor":try{e.options?.selector&&await i.waitForSelector(e.options.selector,{timeout:s}),e.options?.networkIdle&&await i.waitForLoadState("networkidle",{timeout:s})}catch(t){if(!1!==e.options?.failOnTimeout)throw t}return void(e.options?.ms&&await i.waitForTimeout(e.options.ms));case"submit":{const 
r=e.selector||"form",n=i.locator(r).first();if(0===await n.count())throw new N(r,"submit");if("application/json"===(e.options?.enctype||"application/x-www-form-urlencoded")){const t=await n.elementHandle();if(!t)throw new O(`submit: could not get form handle for ${r}`,"submit");const e=await t.evaluate(async t=>{const e=new FormData(t),i={};e.forEach((t,e)=>{i[e]=t.toString()});const s=await fetch(t.action,{method:t.method,headers:{"Content-Type":"application/json"},body:JSON.stringify(i)}),r=await s.text();return{status:s.status,statusText:s.statusText,headers:Object.fromEntries(s.headers.entries()),body:r,html:r,text:r,url:t.action,finalUrl:s.url}});return await t.dispose(),await i.setContent(e.html),void(this.lastResponse=e)}return await n.evaluate(t=>t.submit()),await i.waitForLoadState("networkidle",{timeout:s}),void(this.lastResponse=await this.buildResponse(t))}case"pause":{const t=this.ctx?.onPause;return void(t?(console.info(e.message||"Execution paused for manual intervention."),await t({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. 
Skipped."))}case"getContent":return this.buildResponse(t);default:throw new O(`Unknown action type: ${e.type}`,"PlaywrightFetchEngine.executeAction",j.NotSupported)}}_createCrawler(t){return new _(t)}async _getSpecificCrawlerOptions(t){const e=t.browser?.headless??!0,i={maxRequestRetries:t.retries||3,headless:e,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,preNavigationHooks:[async({page:e,request:i},s)=>{s.throwHttpErrors=t.throwHttpErrors;const r=this.blockedTypes;r.size>0&&await e.route("**/*",t=>{r.has(t.request().resourceType())?t.abort():t.continue()})}]};if(this.opts?.antibot){i.browserPoolOptions={useFingerprints:!1};const{launchOptions:t}=await import("camoufox-js"),s=await t({headless:e});i.launchContext={launcher:U,launchOptions:s},i.postNavigationHooks=[async({page:t,handleCloudflareChallenge:e})=>{await e()}]}return i}async goto(t,e){if(this.isPageActive)return this.dispatchAction({type:"navigate",url:t,opts:e});if(!this.requestQueue)throw new O("RequestQueue not initialized","goto");const i="req-"+ ++this.requestCounter,s=new Promise((t,e)=>{this.pendingRequests.set(i,{resolve:t,reject:e})});return await this.requestQueue.addRequest({url:t,headers:this.hdrs,userData:{requestId:i,waitUntil:e?.waitUntil||"domcontentloaded"},uniqueKey:`${t}-${i}`}),s}};F.id="playwright",F.mode="browser",b.register(F);var H=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};if(!i)throw new Error("Selector is required for click action");await this.delegateToEngine(t,"click",i,s)}};H.id="click",H.returnType="none",H.capabilities={http:"simulate",browser:"native"},r.register(H);var L=class extends r{async onExecute(t,e){const{selector:i,value:s,...r}=e?.params||{};if(!i)throw new Error("Selector is required for fill action");if(void 0===s)throw new Error("Value is required for fill action");await this.delegateToEngine(t,"fill",i,s,r)}};L.id="fill",L.returnType="none",L.capabilities={http:"simulate",browser:"native"},r.register(L);var M=class 
extends r{async onExecute(t,e){return await this.delegateToEngine(t,"getContent",e?.params)}};M.id="getContent",M.returnType="response",M.capabilities={http:"native",browser:"native"},r.register(M);var B=class extends r{async onExecute(t,e,i){const s=e?.params,r=s?.url||t.url;if(!r)throw new Error("URL is required for goto action");const n=t.internal.engine;if(!n)throw new Error("No engine available");t.url=r;return await n.goto(r,s)}};B.id="goto",B.returnType="response",B.capabilities={http:"native",browser:"native"},r.register(B);var z=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};await this.delegateToEngine(t,"submit",i,s)}};z.id="submit",z.returnType="none",z.capabilities={http:"simulate",browser:"native"},r.register(z);var D=class extends r{async onExecute(t,e){const i=t.internal.engine;if(!i)throw new Error("No engine available");await i.waitFor(e?.params)}};D.id="waitFor",D.returnType="none",D.capabilities={http:"native",browser:"native"},r.register(D);var J=class extends r{async onExecute(t,e){const i=e?.params;if(!i)throw new Error("Schema is required for extract action");return this.delegateToEngine(t,"extract",i)}};J.id="extract",J.returnType="any",J.capabilities={http:"native",browser:"native"},r.register(J);var G=class extends r{async onExecute(t,e){const{selector:i,message:s,attribute:r}=e?.params||{},n=t.internal.engine;if("browser"===n?.mode){if(i){if(!await(n?.extract({selector:i,attribute:r})))return}n&&"pause"in n?await n.pause(s):console.warn("[PauseAction] was called, but the current engine does not support `pause`. Skipped.")}else console.warn("[PauseAction] can only run in browser engine. 
Skipped.")}};async function I(t,e){return(new k).fetch(t,e)}G.id="pause",G.capabilities={http:"native",browser:"native"},G.returnType="none",r.register(G);export{A as CheerioFetchEngine,H as ClickAction,t as DefaultFetcherProperties,J as ExtractAction,r as FetchAction,i as FetchActionResultStatus,b as FetchEngine,S as FetchSession,e as FetcherOptionKeys,L as FillAction,M as GetContentAction,B as GotoAction,G as PauseAction,F as PlaywrightFetchEngine,z as SubmitAction,D as WaitForAction,k as WebFetcher,I as fetchWeb};
1
+ var t={engine:"auto",enableSmart:!0,useSiteRegistry:!0,antibot:!1,debug:!1,headers:{},cookies:[],throwHttpErrors:void 0,output:{cookies:!0,sessionState:!0},proxy:[],blockResources:[],storage:{purge:!0},ignoreSslErrors:!0,browser:{engine:"playwright",headless:!0,waitUntil:"domcontentloaded"},http:{method:"GET"},timeoutMs:6e4,requestHandlerTimeoutSecs:void 0,maxConcurrency:1,maxRequestsPerMinute:1e3,delayBetweenRequestsMs:0,retries:0,sites:[]},e=Object.keys(t).concat(["actions","onPause"]),i=(t=>(t[t.Failed=0]="Failed",t[t.Success=1]="Success",t[t.Skipped=2]="Skipped",t))(i||{}),s=class t{static register(t){const e=t.id;if(!e)throw new Error("FetchAction.register: actionClass.id is required");this.registry.set(e,t)}static get(t){return this.registry.get(t)}static create(e){const i="string"==typeof e?e:e.id||e.name||e.action;if(!i)throw new Error("Action must have id, name or action");const s=i instanceof t?i.constructor:this.registry.get(i);return s?new s:void 0}static has(t){return this.registry.has(t)}static list(){return Array.from(this.registry.keys())}static getCapability(t){return this.capabilities[t]??"noop"}getCapability(t){return this.constructor.getCapability(t)}get id(){return this.constructor.id}get returnType(){return this.constructor.returnType}get capabilities(){return this.constructor.capabilities}async delegateToEngine(t,e,...i){const s=t.internal.engine;if(!s)throw new Error("No engine available");if("function"!=typeof s[e])throw new Error(`Engine does not have a method named '${String(e)}'`);return await s[e](...i)}installCollectors(e,i){const s=i?.collectors;if(!s?.length)return;const r=[],c=new Set;for(const i of s){const s=n(i.activateOn),h=n(i.collectOn),l=n(i.deactivateOn),u=!(i.background??!0),w=t.create(i);if(!w)continue;let f=!1,d=!1,p=0;const y=async t=>{if(!f&&!d){f=!0;try{await(w.onBeforeExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,phase:"before",error:t})}}},m=async(t,s)=>{if(!d){f||await 
y(s);try{const r=Promise.resolve(w.onExecute?.(e,i,s)).then(s=>{var r,n;if(i.storeAs){((r=e.outputs)[n=i.storeAs]||(r[n]=[])).push(s)}return e.eventBus.emit("collector:result",{action:this.id,collector:i.id||i.name,event:t,result:s}),s}).catch(s=>{e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,event:t,phase:"exec",error:s})}).finally(()=>{p++});u&&(c.add(r),r.finally(()=>c.delete(r)))}catch(i){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,event:t,phase:"exec",error:i})}}},g=async()=>{if(!d){0===p&&m("collector:after"),d=!0;try{await(w.onAfterExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,phase:"after",error:t})}finally{e.eventBus.emit("collector:end",{action:this.id,collector:i.id||i.name}),x.forEach(t=>t())}}},v=o(e,s,y),x=a(e,h,m),b=o(e,l,g);if(r.push(...v,...x,...b),!s.length&&!h.length&&!l.length){const t=()=>{g()};e.eventBus.once(`action:${this.id}.end`,t),r.push(()=>e.eventBus.off("fetcher:action:end",t))}}return r.length||c.size>0?{cleanup:()=>r.forEach(t=>t()),awaitExecPendings:async()=>{c.size>0&&await Promise.allSettled(Array.from(c))}}:void 0}async beforeExec(t,e){t.internal.actionStack||(t.internal.actionStack=[]);const i=t.internal.actionStack,s=i.length,r=i.length>0?i[i.length-1].id:void 0,n={...e,id:this.id,depth:s,parent:r};i.push(n),t.currentAction=n;const o={action:this,context:t,options:e,depth:s,stack:[...i]};t.eventBus.emit(`action:${this.id}.start`,o),t.eventBus.emit("action:start",o),await(this.onBeforeExec?.(t,e));return{entry:o,collectors:this.installCollectors(t,e)}}async afterExec(t,e,i,s){const r=t.internal.actionStack,n=r.length-1,o=s?.collectors;try{await(o?.awaitExecPendings()),t.lastResult=i,"response"!==i?.returnType||i.error||(t.lastResponse=i.result),e?.storeAs&&(t.outputs[e.storeAs]=i?.result),i?.error&&(t.currentAction.error=i.error),await(this.onAfterExec?.(t,e));const 
s={action:this,context:t,options:e,result:i,depth:n,stack:[...r]};i?.error&&(s.error=i.error);try{t.eventBus.emit(`action:${this.id}.end`,s)}catch(t){}try{t.eventBus.emit("action:end",s)}catch(t){}}finally{try{o?.cleanup()}finally{r.pop();const e=r.length;t.currentAction=e>0?r[e-1]:void 0}}}async execute(t,e){e?.args&&!e.params&&(e.params=e.args);const i=await this.beforeExec(t,e),s=e?.failOnError??!0;let r;try{return t.throwHttpErrors=s,r=await this.onExecute(t,e),r&&r.returnType||(r={status:1,returnType:this.returnType??"any",result:r}),r}catch(e){if(r={status:0,error:e,meta:{id:this.id,engineType:t.engine,capability:this.getCapability(t.engine)}},s)throw e;return r}finally{await this.afterExec(t,e,r,i)}}};s.registry=new Map,s.returnType="any",s.capabilities={http:"noop",browser:"noop"};var r=s;function n(t){return t?Array.isArray(t)?t:[t]:[]}function o(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=(...t)=>{i(t[0])};t.eventBus.once(r,e),s.push(()=>t.eventBus.off(r,e))}return s}function a(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=t=>i(r,t);t.eventBus.on(r,e),s.push(()=>t.eventBus.off(r,e))}return s}import{EventEmitter as c}from"events-ex";import{defaultsDeep as h}from"lodash-es";import{customAlphabet as l}from"nanoid";var u=l("0123456789abcdefghijklmnopqrstuvwxyz",12);import{defaultsDeep as w,merge as f}from"lodash-es";import{EventEmitter as d}from"events-ex";import{CommonError as p}from"@isdk/common-error";import{Configuration as y,KeyValueStore as m,PERSIST_STATE_KEY as g,RequestQueue as v,ProxyConfiguration as x}from"crawlee";function b(){let t=()=>{};const e=new Promise(e=>{t=e});return e.release=t,e}y.getGlobalConfig().set("persistStorage",!1);var E=class{constructor(){this.hdrs={},this._initializedSessions=new Set,this.pendingRequests=new Map,this.requestCounter=0,this.actionEmitter=new d,this.isPageActive=!1,this.isEngineDisposed=!1,this.navigationLock=function(){const 
t=b();return t.release(),t}(),this.blockedTypes=new Set}static register(t){const e=t.id;if(!e)throw new Error("Engine must define static id");if(this.registry.has(e))throw new Error(`Engine id duplicated: ${e}`);this.registry.set(e,t)}static get(t){return this.registry.get(t)}static getByMode(t){for(const[e,i]of this.registry.entries())if(i.mode===t)return i}static async create(e,i){const s=w(i,e,t),r=s.engine??e.engine,n=r?this.get(r)??this.getByMode(r):null;if(n){const t=new n;return await t.initialize(e,s),t}}async _extract(t,e){const i=t.type;if(!e)return"array"===i?[]:null;if("object"===i){const{selector:i,properties:s}=t;let r=e;if(i){const t=await this._querySelectorAll(e,i);r=t.length>0?t[0]:null}if(!r)return null;const n={};for(const t in s)n[t]=await this._extract(s[t],r);return n}if("array"===i){const{selector:i,items:s}=t,r=i?await this._querySelectorAll(e,i):[e],n=[];for(const t of r)n.push(await this._extract(s,t));return n}const{selector:s}=t;let r=e;if(s){const t=await this._querySelectorAll(e,s);r=t.length>0?t[0]:null}return r?this._extractValue(t,r):null}async buildResponse(t){const e=await this._buildResponse(t),i=e.headers["content-type"]||"";return e.contentType=i.split(";")[0].trim(),!1!==this.opts?.output?.cookies?!e.cookies&&t.session&&(e.cookies=t.session.getCookies(t.request.url)):delete e.cookies,!1!==this.opts?.output?.sessionState?this.crawler?.sessionPool&&(e.sessionState=await this.crawler.sessionPool.getState()):delete e.sessionState,this.opts?.debug&&(e.metadata={...e.metadata,mode:this.mode,engine:this.id,proxy:t.proxyInfo?.url||("string"==typeof this.opts.proxy?this.opts.proxy:Array.isArray(this.opts.proxy)?this.opts.proxy[0]:void 0)}),e}waitFor(t){return this.dispatchAction({type:"waitFor",options:t})}click(t){return this.dispatchAction({type:"click",selector:t})}fill(t,e){return this.dispatchAction({type:"fill",selector:t,value:e})}submit(t,e){return this.dispatchAction({type:"submit",selector:t,options:e})}pause(t){return 
this.dispatchAction({type:"pause",message:t})}extract(t){const e=this._normalizeSchema(t);return this.dispatchAction({type:"extract",schema:e})}_normalizeSchema(t){const e=JSON.parse(JSON.stringify(t));if(e.properties)for(const t in e.properties)e.properties[t]=this._normalizeSchema(e.properties[t]);if(e.items&&(e.items=this._normalizeSchema(e.items)),"array"===e.type&&(e.attribute&&!e.items&&(e.items={attribute:e.attribute},delete e.attribute),e.items||(e.items={type:"string"})),e.selector&&(e.has||e.exclude)){const{selector:t,has:i,exclude:s}=e,r=t.split(",").map(t=>{let e=t.trim();return i&&(e=`${e}:has(${i})`),s&&(e=`${e}:not(${s})`),e}).join(", ");e.selector=r,delete e.has,delete e.exclude}return e}get id(){return this.constructor.id}async getState(){return{cookies:await this.cookies(),sessionState:await(this.crawler?.sessionPool?.getState())}}get mode(){return this.constructor.mode}get context(){return this.ctx}async initialize(t,e){if(this.ctx)return;f(t,e),this.ctx=t,this.opts=t,this.hdrs=function(t){const e={};if(t&&"object"==typeof t)for(const[i,s]of Object.entries(t))e[i.toLowerCase()]=s;return e}(t.headers),this._initialCookies=[...t.cookies??[]],t.internal||(t.internal={}),t.internal.engine=this,t.engine=this.mode,this.actionEmitter.setMaxListeners(100);const i=t.storage||{},s=i.persist??!1,r=this.config=new y({persistStorage:s,storageClientOptions:{persistStorage:s,...i.config},...i.config}),n=i.id||t.id;this.requestQueue=await v.open(n,{config:r});const o=this.opts?.proxy?"string"==typeof this.opts.proxy?[this.opts.proxy]:this.opts.proxy:void 0;o?.length&&(this.proxyConfiguration=new x({proxyUrls:o}));const a=await this._getSpecificCrawlerOptions(t),c=w({persistenceOptions:{enable:!0,storeId:n},persistStateKeyValueStoreId:n},t.sessionPoolOptions,{maxPoolSize:1,sessionOptions:{maxUsageCount:1e3,maxErrorScore:3}});t.sessionState&&t.cookies&&t.cookies.length>0&&console.warn('[FetchEngine] Warning: Both "sessionState" and "cookies" are provided. 
Explicit "cookies" will override any conflicting cookies restored from "sessionState".');const h={...w(a,{requestQueue:this.requestQueue,maxConcurrency:1,minConcurrency:1,useSessionPool:!0,persistCookiesPerSession:!0,sessionPoolOptions:c}),requestHandler:this._requestHandler.bind(this),errorHandler:this._failedRequestHandler.bind(this),failedRequestHandler:this._failedRequestHandler.bind(this)};h.preNavigationHooks||(h.preNavigationHooks=[]),h.preNavigationHooks.unshift(({crawler:t,session:e,request:i},s)=>{if(this.currentSession=e,e&&!this._initializedSessions.has(e.id)){if(this._initialCookies&&this._initialCookies.length>0){const t=this._initialCookies.map(t=>{const e={...t};return"no_restriction"===e.sameSite&&(e.sameSite="None"),e});e.setCookies(t,i.url)}this._initializedSessions.add(e.id)}});const l=this.crawler=this._createCrawler(h,r),u=this.kvStore=await m.open(n,{config:r}),d=await u.getValue(g);!t.sessionState||d&&!t.overrideSessionState||await u.setValue(g,t.sessionState),this.isCrawlerReady=!0,this.crawlerRunPromise=l.run(),this.crawlerRunPromise.finally(()=>{this.isCrawlerReady=!1}).catch(t=>{console.error("Crawler background error:",t)})}async cleanup(){await(this._cleanup?.()),await this._commonCleanup();const t=this.ctx;t&&t.internal?.engine===this&&(t.internal.engine=void 0),this.ctx=void 0,this.opts=void 0}async _executePendingActions(t){this.isEngineDisposed||await new Promise(e=>{const i=async({action:e,resolve:i,reject:s})=>{try{if("dispose"===e.type)return this.actionEmitter.emit("dispose"),void i();i(await this.executeAction(t,e))}catch(t){s(t)}},s=()=>{this.actionEmitter.removeListener("dispatch",i),e()};this.actionEmitter.on("dispatch",i),this.actionEmitter.once("dispose",s),this.isEngineDisposed&&(s(),this.actionEmitter.removeListener("dispose",s))})}async _sharedRequestHandler(t){const{request:e}=t;try{this.currentSession=t.session,this.isPageActive=!0;const i=this.pendingRequests.get(e.userData.requestId);if(i){const s=await 
this.buildResponse(t),r=!s.statusCode||s.statusCode>=400;if(this.ctx?.throwHttpErrors&&r){const t=new p(`Request for ${s.finalUrl} failed with status ${s.statusCode||"N/A"}`,"request",s.statusCode);i.reject(t)}else this.lastResponse=s,i.resolve(s);this.pendingRequests.delete(e.userData.requestId)}await this._executePendingActions(t)}finally{if(this.currentSession){const t=this.currentSession.getCookies(e.url);t&&(this._initialCookies=t)}this.isPageActive=!1,this.navigationLock.release()}}async _sharedFailedRequestHandler(t,e){const{request:i}=t,s=this.pendingRequests.get(i.userData.requestId);if(s&&e&&this.ctx?.throwHttpErrors){this.pendingRequests.delete(i.userData.requestId);const t=e.response,r=t?.statusCode||500,n=t?.url?t.url:i.url,o=new p(`Request${n?" for "+n:""} failed: ${e.message}`,"request",r);s.reject(o)}return this._sharedRequestHandler(t)}async dispatchAction(t){if(!this.isPageActive)throw new Error("No active page. Call goto() before performing actions.");return new Promise((e,i)=>{this.actionEmitter.emit("dispatch",{action:t,resolve:e,reject:i})})}async _requestHandler(t){await this._sharedRequestHandler(t)}async _failedRequestHandler(t,e){await this._sharedFailedRequestHandler(t,e)}async _commonCleanup(){if(this.isEngineDisposed=!0,this._initializedSessions.clear(),this.actionEmitter.emit("dispose"),this.navigationLock?.release(),this.pendingRequests.size>0){for(const[,t]of this.pendingRequests)t.reject(new Error("Cleanup:Request cancelled"));this.pendingRequests.clear()}if(this.crawler){try{await(this.crawler.teardown?.())}catch(t){console.error("crawler teardown error:",t)}this.crawler=void 0}this.crawlerRunPromise=void 0,this.isCrawlerReady=void 0;const t=(this.opts?.storage||{}).purge??!0;this.requestQueue&&(t&&await this.requestQueue.drop().catch(t=>console.error("Error dropping requestQueue:",t)),this.requestQueue=void 0),this.kvStore&&(t&&await this.kvStore.drop().catch(t=>console.error("Error dropping kvStore:",t)),this.kvStore=void 
0),this.actionEmitter.removeAllListeners(),this.pendingRequests.clear(),this.config=void 0}async blockResources(t,e){return e&&this.blockedTypes.clear(),t.forEach(t=>this.blockedTypes.add(t)),t.length}getContent(){return this.lastResponse?Promise.resolve(this.lastResponse):Promise.reject(new Error("No content fetched yet. Call goto() first."))}async headers(t,e){if(void 0===t)return{...this.hdrs};if("string"==typeof t&&void 0===e)return this.hdrs[t.toLowerCase()]||"";if(null!==t&&"object"==typeof t){const i={};for(const[e,s]of Object.entries(t))i[e.toLowerCase()]=String(s);return this.hdrs=!0===e?i:{...this.hdrs,...i},!0}return"string"==typeof t&&("string"==typeof e?this.hdrs[t.toLowerCase()]=e:null===e&&delete this.hdrs[t.toLowerCase()],!0)}async cookies(t){const e=this.lastResponse?.url||"";if(Array.isArray(t))return this.currentSession?this.currentSession.setCookies(t,e):this._initialCookies=[...t],!0;if(null===t)return this.currentSession,this._initialCookies=[],!0;if(this.currentSession){return this.currentSession.getCookies(e)}return[...this._initialCookies||[]]}async dispose(){await this.cleanup()}};async function S(t,e){let i;if(e?.engine){if(i=await E.create(t,{engine:e.engine}),!i)throw new Error(`No engine available for ${e.engine}`);return i}const s=function(t,e){if(!t||!e?.length)return null;const i=new URL(t);let s=e.find(t=>t.domain===i.hostname);s||(s=e.find(t=>i.hostname.endsWith(t.domain)));if(!s)return null;if(s.pathScope?.length){if(!s.pathScope.some(t=>i.pathname.startsWith(t)))return null}return s}(e?.url||t.url,t.sites),r=t.engine||s?.engine||"auto";return i=await E.create(t,{engine:r}),i||(i=await E.create(t,{engine:"http"})),i}E.registry=new Map;var k=class{constructor(t={},e){this.options=t,this.engine=e,this.closed=!1,this.id=u(),this.context=this.createContext(t)}async execute(t){await this.ensureEngine(t);const e=r.create(t);if(!e)throw new Error(`Unknown action: ${t.id||t.name}`);let 
i,s;this.context.internal.actionIndex=(this.context.internal.actionIndex||0)+1,this.context.currentAction={...t,index:this.context.internal.actionIndex,startedAt:Date.now()};try{return i=await e.execute(this.context,t),i}catch(t){throw s=t,s}finally{this.context.currentAction=void 0}}async executeAll(t){let e=0;try{for(;e<t.length;){const i=t[e];await this.execute(i),e++}const i=await this.execute({id:"getContent"});return{result:i?.result,outputs:this.getOutputs()}}catch(t){throw t.actionIndex=e,t}}getOutputs(){return this.context.outputs}async getState(){return this.context.internal.engine?.getState()}async dispose(){if(this.closed)return;const t=this.context.eventBus;t.emit("session:closing",{sessionId:this.id});try{await(this.context.internal.engine?.dispose())}finally{this.closed=!0}t.emit("session:closed",{sessionId:this.id})}async ensureEngine(t){if(this.closed)throw new Error("Session is closed");if(!this.context.internal.engine){const e=t?.params?.url??this.context.url;if(!await S(this.context,{url:e,engine:this.engine}))throw new Error("No engine found")}}createContext(e=this.options){const i=new c;return h({...e,id:this.id,eventBus:i,outputs:{},internal:{},execute:async t=>this.execute(t),action:async function(t,e,i){return this.execute({name:t,params:e,...i})}},t)}},q=class{constructor(t={}){this.defaults=t}async createSession(t){const e={...this.defaults,...t||{}};return new k(e)}async fetch(t,e){"string"!=typeof t&&(t=(e=t).url);const i=await this.createSession(e);try{const s=e?.actions||[];t&&0!==s.findIndex(e=>"goto"===e.id&&e.params?.url===t)&&s.unshift({id:"goto",params:{url:t}});return await i.executeAll(s)}finally{await i.dispose()}}};import{CheerioCrawler as C}from"crawlee";import*as $ from"cheerio";import{CommonError as R,ErrorCode as P,NotFoundError as A}from"@isdk/common-error";var T=class extends E{_ensureCheerioContext(t){if(!t.$&&t.body){let e="string"==typeof 
t.body?t.body:Buffer.isBuffer(t.body)?t.body.toString("utf-8"):JSON.stringify(t.body);e.trim().startsWith("<")||(e=`<html><body><pre>${e}</pre></body></html>`),t.$=$.load(e)}}async _buildResponse(t){this._ensureCheerioContext(t);const{request:e,response:i,body:s,$:r}=t,n=r?.html();let o="string"==typeof s?s:Buffer.isBuffer(s)?s.toString("utf-8"):String(s??"");n&&n!==o&&(o=n);let a=i?.headers;if(!a&&i?.rawHeaders){a={};const t=i.rawHeaders;for(let e=0;e<t.length;e+=2)a[t[e].toLowerCase()]=t[e+1]}const c={url:e.url,finalUrl:e.loadedUrl||e.url,statusCode:i?.statusCode??200,statusText:i?.statusMessage,headers:a||{},body:s,html:o,text:o};if(this.opts?.debug&&i?.timings){const t=i.timings;c.metadata={timings:{start:t.start,total:t.phases?.total,ttfb:t.phases?.firstByte,dns:t.phases?.dns,tcp:t.phases?.tcp,download:t.phases?.download}}}return c}async _querySelectorAll(t,e){const{$:i,el:s}=t;return s.find(e).toArray().map(t=>({$:i,el:i(t)}))}async _extractValue(t,e){const{el:i}=e,{attribute:s,type:r="string"}=t;if(0===i.length)return null;let n="";if(n=s?i.attr(s)??null:"html"===r?i.html():i.text().trim(),null===n)return null;switch(r){case"number":return parseFloat(n.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=n.toLowerCase();return"true"===t||"1"===t;default:return n}}async executeAction(t,e){const{$:i}=t;switch(e.type){case"dispose":return;case"extract":if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"extract");return this._extract(e.schema,{$:i,el:i.root()});case"click":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"click");const s=e.selector,r=i(s).first();let n;if(0===r.length)try{n=new URL(s,t.request.loadedUrl||t.request.url).href}catch{throw new R(`click: selector not found or invalid URL: ${s}`,"click")}else{if(!r.is("a")||!r.attr("href")){if(r.is('input[type="submit"], button[type="submit"], button, input')){const e=r.closest("form");if(e.length)return 
this.executeAction(t,{type:"submit",selector:e});throw new R("click: submit-like element without form","click")}throw new R(`click: unsupported element for http simulate. Selector: ${s}`,"click")}{const e=r.attr("href");n=new URL(e,t.request.loadedUrl||t.request.url).href}}const o=await t.sendRequest({url:n});return void await this._updateStateAfterNavigation(t,o)}case"fill":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`),"fill";const s=i(e.selector).first();if(0===s.length)throw new R(`fill: selector not found: ${e.selector}`);if(!s.is("input, textarea, select"))throw new R(`fill: not a form field: ${e.selector}`);return s.val(e.value),void(this.lastResponse=await this.buildResponse(t))}case"waitFor":return void(e.options?.ms&&await new Promise(t=>setTimeout(t,e.options.ms)));case"pause":const s=this.ctx?.onPause;return void(s?(console.info(e.message||"Execution paused for manual intervention."),await s({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. 
Skipped."));case"submit":{if(!i)throw new R(`Cheerio context not available for action: ${e.type}`,"submit");const s="string"==typeof e.selector?i(e.selector).first():null!=e.selector?e.selector:i("form").first();if(0===s.length)throw new A(e.selector,"submit");const r=s.attr("action")||t.request.loadedUrl||t.request.url,n=(s.attr("method")||"GET").toUpperCase(),o=new URL(r,t.request.loadedUrl||t.request.url).href,a={};let c;if(s.find("input, select, textarea").each((t,e)=>{const s=i(e),r=s.attr("name");if(!r)return;const n=s.val();null!=n&&(a[r]=String(n))}),"GET"===n){const e=new URL(o);Object.entries(a).forEach(([t,i])=>e.searchParams.set(t,i)),c=await t.sendRequest({url:e.href,method:"GET"})}else{let i;const r={};"application/json"===(e.options?.enctype||s.attr("enctype")||"application/x-www-form-urlencoded")?(i=JSON.stringify(a),r["Content-Type"]="application/json"):(i=new URLSearchParams(a).toString(),r["Content-Type"]="application/x-www-form-urlencoded"),c=await t.sendRequest({url:o,method:"POST",body:i,headers:r})}return void await this._updateStateAfterNavigation(t,c)}case"getContent":return this.buildResponse(t);default:throw new R(`Unknown action type: ${e.type}`,"CheerioFetchEngine.executeAction",P.NotSupported)}}async _updateStateAfterNavigation(t,e){const i=e;t.response=i,t.body=i.body,t.$=void 0,i.url&&(t.request.loadedUrl=i.url),this.lastResponse=await this.buildResponse(t)}_createCrawler(t,e){return new C(t,e)}_getSpecificCrawlerOptions(t){return{additionalMimeTypes:["text/plain"],maxRequestRetries:1,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,proxyConfiguration:this.proxyConfiguration,preNavigationHooks:[({session:e,request:i},s)=>{s.throwHttpErrors=t.throwHttpErrors,this.opts?.timeoutMs&&(s.timeout={request:this.opts.timeoutMs})}]}}async goto(t,e){this.isPageActive&&this.dispatchAction({type:"dispose"}).catch(()=>{});const i="req-"+ ++this.requestCounter,s=new Promise((t,s)=>{const 
r=e?.timeoutMs||this.opts?.timeoutMs||3e4,n=setTimeout(()=>{this.pendingRequests.delete(i),this.navigationLock.release(),s(new R(`goto timed out after ${r}ms.`,"gotoTimeout",P.RequestTimeout))},r);this.pendingRequests.set(i,{resolve:e=>{clearTimeout(n),t(e)},reject:t=>{clearTimeout(n),s(t)}})});return this.requestQueue.addRequest({...e,url:t,headers:{...this.hdrs,...e?.headers},userData:{requestId:i},uniqueKey:`${t}-${i}`}).catch(t=>{const e=this.pendingRequests.get(i);e&&(this.pendingRequests.delete(i),this.navigationLock.release(),e.reject(t))}),await this.navigationLock,this.navigationLock=b(),s}};T.id="cheerio",T.mode="http",E.register(T);import{PlaywrightCrawler as _}from"crawlee";import{firefox as O}from"playwright";import{CommonError as U,ErrorCode as j,NotFoundError as N}from"@isdk/common-error";var F=class extends E{async _buildResponse(t){const{page:e,response:i,request:s,session:r}=t;if(!e||e.isClosed())return{url:s.url,finalUrl:s.loadedUrl||s.url,statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},body:"",html:"",text:""};const n=await e.content(),o=await e.textContent("body"),a=await e.context().cookies();r&&r.setCookies(a,s.url);const c={url:e.url(),finalUrl:e.url(),statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},body:n,html:n,text:o||""};if(this.opts?.debug&&i){const t="function"==typeof i.request?i.request():i.request;if(t&&"function"==typeof t.timing){const e=t.timing();c.metadata={timings:{start:e.startTime,total:e.responseEnd-e.startTime,ttfb:e.responseStart-e.requestStart,dns:e.domainLookupEnd-e.domainLookupStart,tcp:e.connectEnd-e.connectStart,download:e.responseEnd-e.responseStart}}}}return!1!==this.opts?.output?.cookies&&(c.cookies=a),c}async _querySelectorAll(t,e){return t.locator(e).all()}async _extractValue(t,e){const{attribute:i,type:s="string"}=t;if(0===await e.count())return null;let r="";if(r=i?await e.getAttribute(i):"html"===s?await e.innerHTML():await 
e.textContent(),null===r)return null;switch(r=r.trim(),s){case"number":return parseFloat(r.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=r.toLowerCase();return"true"===t||"1"===t;default:return r}}async executeAction(t,e){const{page:i}=t,s=this.opts?.timeoutMs||3e4;switch(e.type){case"navigate":{const s=await i.goto(e.url,{waitUntil:e.opts?.waitUntil||"domcontentloaded",timeout:this.opts?.timeoutMs||3e4});s&&(t={...t,response:s});const r=await this.buildResponse(t);return this.lastResponse=r,r}case"extract":{const s=await this._extract(e.schema,i.locator("body"));return this.lastResponse=await this.buildResponse(t),s}case"click":{await i.click(e.selector,{timeout:s}),await i.waitForLoadState("networkidle",{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r)}case"fill":await i.fill(e.selector,e.value,{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r);case"waitFor":try{e.options?.selector&&await i.waitForSelector(e.options.selector,{timeout:s}),e.options?.networkIdle&&await i.waitForLoadState("networkidle",{timeout:s})}catch(t){if(!1!==e.options?.failOnTimeout)throw t}return void(e.options?.ms&&await i.waitForTimeout(e.options.ms));case"submit":{const r=e.selector||"form",n=i.locator(r).first();if(0===await n.count())throw new N(r,"submit");if("application/json"===(e.options?.enctype||"application/x-www-form-urlencoded")){const t=await n.elementHandle();if(!t)throw new U(`submit: could not get form handle for ${r}`,"submit");const e=await t.evaluate(async t=>{const e=new FormData(t),i={};e.forEach((t,e)=>{i[e]=t.toString()});const s=await fetch(t.action,{method:t.method,headers:{"Content-Type":"application/json"},body:JSON.stringify(i)}),r=await s.text();return{status:s.status,statusText:s.statusText,headers:Object.fromEntries(s.headers.entries()),body:r,html:r,text:r,url:t.action,finalUrl:s.url}});return await t.dispose(),await i.setContent(e.html),void(this.lastResponse=e)}return await 
n.evaluate(t=>t.submit()),await i.waitForLoadState("networkidle",{timeout:s}),void(this.lastResponse=await this.buildResponse(t))}case"pause":{const t=this.ctx?.onPause;return void(t?(console.info(e.message||"Execution paused for manual intervention."),await t({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."))}case"getContent":return this.buildResponse(t);default:throw new U(`Unknown action type: ${e.type}`,"PlaywrightFetchEngine.executeAction",j.NotSupported)}}_createCrawler(t,e){return new _(t,e)}async _getSpecificCrawlerOptions(t){const e=t.browser?.headless??!0,i={maxRequestRetries:t.retries||3,headless:e,proxyConfiguration:this.proxyConfiguration,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,preNavigationHooks:[async({page:e,request:i},s)=>{s.throwHttpErrors=t.throwHttpErrors;const r=this.blockedTypes;r.size>0&&await e.route("**/*",t=>{r.has(t.request().resourceType())?t.abort():t.continue()})}]};if(this.opts?.antibot){i.browserPoolOptions={useFingerprints:!1};const{launchOptions:t}=await import("camoufox-js"),s=await t({headless:e});i.launchContext={launcher:O,launchOptions:s},i.postNavigationHooks=[async({page:t,handleCloudflareChallenge:e})=>{await e()}]}return i}async goto(t,e){if(this.isPageActive)return this.dispatchAction({type:"navigate",url:t,opts:e});if(!this.requestQueue)throw new U("RequestQueue not initialized","goto");const i="req-"+ ++this.requestCounter,s=new Promise((t,e)=>{this.pendingRequests.set(i,{resolve:t,reject:e})});return await this.requestQueue.addRequest({url:t,headers:this.hdrs,userData:{requestId:i,waitUntil:e?.waitUntil||"domcontentloaded"},uniqueKey:`${t}-${i}`}),s}};F.id="playwright",F.mode="browser",E.register(F);var H=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};if(!i)throw new Error("Selector is required for click action");await 
this.delegateToEngine(t,"click",i,s)}};H.id="click",H.returnType="none",H.capabilities={http:"simulate",browser:"native"},r.register(H);var L=class extends r{async onExecute(t,e){const{selector:i,value:s,...r}=e?.params||{};if(!i)throw new Error("Selector is required for fill action");if(void 0===s)throw new Error("Value is required for fill action");await this.delegateToEngine(t,"fill",i,s,r)}};L.id="fill",L.returnType="none",L.capabilities={http:"simulate",browser:"native"},r.register(L);var M=class extends r{async onExecute(t,e){return await this.delegateToEngine(t,"getContent",e?.params)}};M.id="getContent",M.returnType="response",M.capabilities={http:"native",browser:"native"},r.register(M);var B=class extends r{async onExecute(t,e,i){const s=e?.params,r=s?.url||t.url;if(!r)throw new Error("URL is required for goto action");const n=t.internal.engine;if(!n)throw new Error("No engine available");t.url=r;return await n.goto(r,s)}};B.id="goto",B.returnType="response",B.capabilities={http:"native",browser:"native"},r.register(B);var I=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};await this.delegateToEngine(t,"submit",i,s)}};I.id="submit",I.returnType="none",I.capabilities={http:"simulate",browser:"native"},r.register(I);var z=class extends r{async onExecute(t,e){const i=t.internal.engine;if(!i)throw new Error("No engine available");await i.waitFor(e?.params)}};z.id="waitFor",z.returnType="none",z.capabilities={http:"native",browser:"native"},r.register(z);var D=class extends r{async onExecute(t,e){const i=e?.params;if(!i)throw new Error("Schema is required for extract action");return this.delegateToEngine(t,"extract",i)}};D.id="extract",D.returnType="any",D.capabilities={http:"native",browser:"native"},r.register(D);var J=class extends r{async onExecute(t,e){const{selector:i,message:s,attribute:r}=e?.params||{},n=t.internal.engine;if("browser"===n?.mode){if(i){if(!await(n?.extract({selector:i,attribute:r})))return}n&&"pause"in n?await 
n.pause(s):console.warn("[PauseAction] was called, but the current engine does not support `pause`. Skipped.")}else console.warn("[PauseAction] can only run in browser engine. Skipped.")}};async function G(t,e){return(new q).fetch(t,e)}J.id="pause",J.capabilities={http:"native",browser:"native"},J.returnType="none",r.register(J);export{T as CheerioFetchEngine,H as ClickAction,t as DefaultFetcherProperties,D as ExtractAction,r as FetchAction,i as FetchActionResultStatus,E as FetchEngine,k as FetchSession,e as FetcherOptionKeys,L as FillAction,M as GetContentAction,B as GotoAction,J as PauseAction,F as PlaywrightFetchEngine,I as SubmitAction,z as WaitForAction,q as WebFetcher,G as fetchWeb};
package/docs/README.md CHANGED
@@ -135,10 +135,20 @@ This is the main entry point for the library.
135
135
 
136
136
  * `url` (string): The initial URL to navigate to.
137
137
  * `engine` ('http' | 'browser' | 'auto'): The engine to use. Defaults to `auto`.
138
+ * `proxy` (string | string[]): Proxy URL(s) to use for requests.
139
+ * `debug` (boolean): Enable detailed execution metadata (timings, engine used, etc.) in the response.
138
140
  * `actions` (FetchActionOptions[]): An array of action objects to execute. (Supports `action`/`name` as aliases for `id`, and `args` as an alias for `params`.)
139
141
  * `headers` (Record<string, string>): Headers to use for all requests.
140
142
  * `cookies` (Cookie[]): Array of cookies to use.
141
143
  * `sessionState` (any): Crawlee session state to restore.
144
+ * `storage` (StorageOptions): Controls session isolation, persistence, and cleanup.
145
+ * `id` (string): Shared storage ID for cross-session data reuse.
146
+ * `persist` (boolean): Whether to save data to disk (defaults to `false`, i.e. in-memory only).
147
+ * `purge` (boolean): Whether to delete data on cleanup (defaults to `true`).
148
+ * `config` (object): Raw Crawlee configuration (e.g., `{ localDataDirectory: './data' }`).
149
+ * `output` (object): Controls the output fields in `FetchResponse`.
150
+ * `cookies` (boolean): Whether to include cookies in the response (default: `true`).
151
+ * `sessionState` (boolean): Whether to include session state in the response (default: `true`).
142
152
  * `sessionPoolOptions` (SessionPoolOptions): Advanced configuration for the underlying Crawlee SessionPool.
143
153
  * ...and many other options for timeouts, retries, concurrency, etc.
144
154
 
@@ -131,10 +131,20 @@ searchGoogle('gemini');
131
131
 
132
132
  * `url` (string): 要导航的初始 URL。
133
133
  * `engine` ('http' | 'browser' | 'auto'): 要使用的引擎。默认为 `auto`。
134
+ * `proxy` (string | string[]): 用于请求的代理 URL。
135
+ * `debug` (boolean): 在响应中启用详细的执行元数据(耗时、使用的引擎等)。
134
136
  * `actions` (FetchActionOptions[]): 要执行的动作对象数组。(支持 `action`/`name` 作为 `id` 的别名,`args` 作为 `params` 的别名)
135
137
  * `headers` (Record<string, string>): 用于所有请求的头信息。
136
138
  * `cookies` (Cookie[]): 要使用的 Cookie 数组。
137
139
  * `sessionState` (any): 要恢复的 Crawlee 会话状态。
140
+ * `storage` (StorageOptions): 控制会话隔离、持久化和清理。
141
+ * `id` (string): 共享存储 ID,用于跨会话重用数据。
142
+ * `persist` (boolean): 是否将数据保存到磁盘(默认为 `false`,即仅保存在内存中)。
143
+ * `purge` (boolean): 是否在清理时删除数据(默认为 `true`)。
144
+ * `config` (object): 原生 Crawlee 配置(例如 `{ localDataDirectory: './data' }`)。
145
+ * `output` (object): 控制 `FetchResponse` 中的输出字段。
146
+ * `cookies` (boolean): 是否在响应中包含 Cookie(默认:`true`)。
147
+ * `sessionState` (boolean): 是否在响应中包含会话状态(默认:`true`)。
138
148
  * `sessionPoolOptions` (SessionPoolOptions): 底层 Crawlee SessionPool 的高级配置。
139
149
  * ...以及许多其他用于超时、重试、并发等的选项。
140
150
 
@@ -48,9 +48,17 @@ When the library determines which engine to use (via internal `maybeCreateEngine
48
48
 
49
49
  The engine supports persisting and restoring session state (primarily cookies) between executions.
50
50
 
51
+ * **Flexible Session Isolation & Storage**: The library provides fine-grained control over how session data is stored and isolated via the `storage` configuration:
52
+ * **`id`**: A custom string to identify the storage.
53
+ * **Isolation (Default)**: If omitted, each session gets a unique ID, ensuring complete isolation of `RequestQueue`, `KeyValueStore`, and `SessionPool`.
54
+ * **Sharing**: Providing the same `id` across sessions allows them to share the same underlying storage, which is useful for persistent login sessions.
55
+ * **`persist`**: (boolean) Whether to enable disk persistence (Crawlee's `persistStorage`). Defaults to `false` (in-memory).
56
+ * **`purge`**: (boolean) Whether to delete the storage (drop `RequestQueue` and `KeyValueStore`) when the session is closed. Defaults to `true`.
57
+ * Set `persist: true` and `purge: false`, and provide a fixed `id`, to create a truly persistent session that survives application restarts (with the default in-memory storage, nothing outlives the process).
58
+ * **`config`**: Allows passing raw configuration to the underlying Crawlee instance.
59
+ * **Note**: When `persist` is true, use `localDataDirectory` in the config to specify the storage path (e.g., `storage: { persist: true, config: { localDataDirectory: './my-data' } }`).
51
60
  * **`sessionState`**: A comprehensive state object (derived from Crawlee's SessionPool) that can be used to fully restore a previous session. This state is **included in every `FetchResponse` by default** (set `output.sessionState` to `false` to omit it), making it easy to persist and later provide back to the engine during initialization.
52
61
  * **`sessionPoolOptions`**: Allows advanced configuration of the underlying Crawlee `SessionPool` (e.g., `maxUsageCount`, `maxPoolSize`).
53
- > **Note**: `persistenceOptions.enable` is forced to `true` to ensure proper session state management.
54
62
  * **`overrideSessionState`**: If set to `true`, it forces the engine to overwrite any existing persistent state in the storage with the provided `sessionState`. This is useful when you want to ensure the session starts with the exact state provided, ignoring any stale data in the persistence layer.
55
63
  * **`cookies`**: An array of explicit cookies to use for the session.
56
64
 
@@ -79,8 +87,8 @@ Our engine solves this by creating a bridge between the external API calls and t
79
87
 
80
88
  **The workflow is as follows:**
81
89
 
82
- 1. **Initialization**: A consumer calls `FetchEngine.create()`, which initializes a Crawlee crawler (e.g., `PlaywrightCrawler`) that runs in the background.
83
- 2. **Navigation (`goto`)**: The consumer calls `await engine.goto(url)`. This adds the URL to Crawlee's `RequestQueue` and returns a `Promise` that will resolve when the page is loaded.
90
+ 1. **Initialization**: A consumer calls `FetchEngine.create()`, which initializes a private `Configuration` and starts a Crawlee crawler (e.g., `PlaywrightCrawler`) that runs in the background.
91
+ 2. **Navigation (`goto`)**: The consumer calls `await engine.goto(url)`. This adds the URL to the engine's private `RequestQueue` and returns a `Promise` that will resolve when the page is loaded.
84
92
  3. **Crawlee Processing**: The background crawler picks up the request and invokes the engine's `requestHandler`, passing it the crucial page context.
85
93
  4. **Page Activation & Action Loop**: Inside the `requestHandler`:
86
94
  * The page context is used to resolve the `Promise` from the `goto()` call.
@@ -88,7 +96,11 @@ Our engine solves this by creating a bridge between the external API calls and t
88
96
  * Crucially, before the `requestHandler` returns, it starts an **action loop** (`_executePendingActions`). This loop effectively **pauses the `requestHandler`** by listening for events on an `EventEmitter`, keeping the page context alive.
89
97
  5. **Interactive Actions (`click`, `fill`, etc.)**: The consumer can now call `await engine.click(...)`. This dispatches an action to the `EventEmitter` and returns a new `Promise`.
90
98
  6. **Action Execution**: The action loop, still running within the original `requestHandler`'s scope, hears the event. Because it has access to the page context, it can perform the *actual* interaction (e.g., `page.click(...)`).
91
- 7. **Cleanup**: The loop continues until a `dispose` action is dispatched (e.g., by a new `goto()` call), which terminates the loop and allows the `requestHandler` to finally complete.
99
+ 7. **Robust Cleanup**: When `dispose()` or `cleanup()` is called:
100
+ * An `isEngineDisposed` flag is set to prevent new actions.
101
+ * A `dispose` signal is emitted to wake up and terminate the action loop.
102
+ * All active locks (`navigationLock`) are released.
103
+ * The crawler is torn down (`teardown`), and the private `RequestQueue` and `KeyValueStore` are dropped to ensure a clean state.
92
104
 
93
105
  ---
94
106