@isdk/web-fetcher 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.action.cn.md +469 -0
  2. package/README.action.md +452 -0
  3. package/README.cn.md +147 -0
  4. package/README.engine.cn.md +262 -0
  5. package/README.engine.md +262 -0
  6. package/README.md +147 -0
  7. package/dist/index.d.mts +1603 -0
  8. package/dist/index.d.ts +1603 -0
  9. package/dist/index.js +1 -0
  10. package/dist/index.mjs +1 -0
  11. package/docs/README.md +151 -0
  12. package/docs/_media/LICENSE-MIT +22 -0
  13. package/docs/_media/README.action.md +452 -0
  14. package/docs/_media/README.cn.md +147 -0
  15. package/docs/_media/README.engine.md +262 -0
  16. package/docs/classes/CheerioFetchEngine.md +1447 -0
  17. package/docs/classes/ClickAction.md +533 -0
  18. package/docs/classes/ExtractAction.md +533 -0
  19. package/docs/classes/FetchAction.md +444 -0
  20. package/docs/classes/FetchEngine.md +1230 -0
  21. package/docs/classes/FetchSession.md +111 -0
  22. package/docs/classes/FillAction.md +533 -0
  23. package/docs/classes/GetContentAction.md +533 -0
  24. package/docs/classes/GotoAction.md +537 -0
  25. package/docs/classes/PauseAction.md +533 -0
  26. package/docs/classes/PlaywrightFetchEngine.md +1437 -0
  27. package/docs/classes/SubmitAction.md +533 -0
  28. package/docs/classes/WaitForAction.md +533 -0
  29. package/docs/classes/WebFetcher.md +85 -0
  30. package/docs/enumerations/FetchActionResultStatus.md +40 -0
  31. package/docs/functions/fetchWeb.md +43 -0
  32. package/docs/globals.md +72 -0
  33. package/docs/interfaces/BaseFetchActionProperties.md +83 -0
  34. package/docs/interfaces/BaseFetchCollectorActionProperties.md +145 -0
  35. package/docs/interfaces/BaseFetcherProperties.md +206 -0
  36. package/docs/interfaces/Cookie.md +142 -0
  37. package/docs/interfaces/DispatchedEngineAction.md +60 -0
  38. package/docs/interfaces/ExtractActionProperties.md +113 -0
  39. package/docs/interfaces/FetchActionInContext.md +149 -0
  40. package/docs/interfaces/FetchActionProperties.md +125 -0
  41. package/docs/interfaces/FetchActionResult.md +55 -0
  42. package/docs/interfaces/FetchContext.md +424 -0
  43. package/docs/interfaces/FetchEngineContext.md +328 -0
  44. package/docs/interfaces/FetchMetadata.md +73 -0
  45. package/docs/interfaces/FetchResponse.md +105 -0
  46. package/docs/interfaces/FetchReturnTypeRegistry.md +57 -0
  47. package/docs/interfaces/FetchSite.md +320 -0
  48. package/docs/interfaces/FetcherOptions.md +300 -0
  49. package/docs/interfaces/GotoActionOptions.md +66 -0
  50. package/docs/interfaces/PendingEngineRequest.md +51 -0
  51. package/docs/interfaces/SubmitActionOptions.md +23 -0
  52. package/docs/interfaces/WaitForActionOptions.md +39 -0
  53. package/docs/type-aliases/BaseFetchActionOptions.md +11 -0
  54. package/docs/type-aliases/BaseFetchCollectorOptions.md +11 -0
  55. package/docs/type-aliases/BrowserEngine.md +11 -0
  56. package/docs/type-aliases/FetchActionCapabilities.md +11 -0
  57. package/docs/type-aliases/FetchActionCapabilityMode.md +11 -0
  58. package/docs/type-aliases/FetchActionOptions.md +11 -0
  59. package/docs/type-aliases/FetchEngineAction.md +18 -0
  60. package/docs/type-aliases/FetchEngineType.md +11 -0
  61. package/docs/type-aliases/FetchReturnType.md +11 -0
  62. package/docs/type-aliases/FetchReturnTypeFor.md +17 -0
  63. package/docs/type-aliases/OnFetchPauseCallback.md +23 -0
  64. package/docs/type-aliases/ResourceType.md +11 -0
  65. package/docs/variables/DefaultFetcherProperties.md +11 -0
  66. package/package.json +90 -0
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
"use strict";

/**
 * @isdk/web-fetcher — CommonJS entry point, written out in readable form.
 *
 * The module wires together:
 *  - `FetchAction` and its registered subclasses (goto, click, fill, submit, ...),
 *  - `FetchEngine` and its two registered engines (`cheerio`/http, `playwright`/browser),
 *  - `FetchSession` / `WebFetcher` / `fetchWeb`, which run a list of actions against an engine.
 */

const nodeCrypto = require("crypto");
const { EventEmitter } = require("events-ex");
const { defaultsDeep, merge } = require("lodash-es");
const { CommonError, NotFoundError, ErrorCode } = require("@isdk/common-error");
const {
  Configuration,
  RequestQueue,
  CheerioCrawler,
  PlaywrightCrawler,
  ProxyConfiguration,
} = require("crawlee");
const cheerio = require("cheerio");
const { firefox } = require("playwright");
const { launchOptions } = require("camoufox-js");

/** Default option values merged into every fetch context. */
const DefaultFetcherProperties = {
  enableSmart: true,
  useSiteRegistry: true,
  antibot: false,
  headers: {},
  cookies: [],
  reuseCookies: true,
  proxy: [],
  blockResources: [],
  ignoreSslErrors: true,
  browser: { engine: "playwright", headless: true, waitUntil: "domcontentloaded" },
  http: { method: "GET" },
  timeoutMs: 60000,
  maxConcurrency: 1,
  maxRequestsPerMinute: 1000,
  delayBetweenRequestsMs: 0,
  retries: 0,
  sites: [],
};

/** Numeric enum with reverse mapping (`Status[0] === "Failed"`). */
const FetchActionResultStatus = ((status) => {
  status[(status.Failed = 0)] = "Failed";
  status[(status.Success = 1)] = "Success";
  status[(status.Skipped = 2)] = "Skipped";
  return status;
})({});

/* -------------------------------------------------------------------------- */
/* Small shared helpers                                                        */
/* -------------------------------------------------------------------------- */

/** Wraps a value in an array; falsy values become an empty array. */
function toArray(value) {
  if (!value) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Subscribes `handler` once to every string/RegExp event in `events`.
 * The handler receives only the first listener argument.
 * @returns {Array<() => void>} one unsubscribe function per subscription
 */
function subscribeOnce(context, events, handler) {
  const unsubscribers = [];
  for (const event of events) {
    if (typeof event === "string" || event instanceof RegExp) {
      const listener = (...args) => {
        handler(args[0]);
      };
      context.eventBus.once(event, listener);
      unsubscribers.push(() => context.eventBus.off(event, listener));
    }
  }
  return unsubscribers;
}

/**
 * Subscribes `handler(event, payload)` to every string/RegExp event in `events`.
 * @returns {Array<() => void>} one unsubscribe function per subscription
 */
function subscribeEach(context, events, handler) {
  const unsubscribers = [];
  for (const event of events) {
    if (typeof event === "string" || event instanceof RegExp) {
      const listener = (payload) => handler(event, payload);
      context.eventBus.on(event, listener);
      unsubscribers.push(() => context.eventBus.off(event, listener));
    }
  }
  return unsubscribers;
}

/** Returns a copy of `headers` with every header name lower-cased. */
function normalizeHeaderNames(headers) {
  const normalized = {};
  if (headers && typeof headers === "object") {
    for (const [name, value] of Object.entries(headers)) {
      normalized[name.toLowerCase()] = value;
    }
  }
  return normalized;
}

/** Converts a response body (string, Buffer, or anything else) to a string. */
function bodyToString(body) {
  if (typeof body === "string") return body;
  if (Buffer.isBuffer(body)) return body.toString("utf-8");
  return String(body ?? "");
}

/**
 * Strips everything except digits, dots and minus signs and parses the rest.
 * Returns `null` only when nothing numeric is left. A parsed `0` is returned
 * as `0` (the previous `parseFloat(...) || null` form turned it into `null`).
 */
function parseNumericText(text) {
  const parsed = parseFloat(text.replace(/[^0-9.-]+/g, ""));
  return Number.isNaN(parsed) ? null : parsed;
}

/** `"true"` and `"1"` (case-insensitive) are true; everything else is false. */
function parseBooleanText(text) {
  const lowered = text.toLowerCase();
  return lowered === "true" || lowered === "1";
}

/**
 * Shared implementation of the engine-level `pause` action: calls the
 * context's `onPause` handler when present, otherwise warns and continues.
 */
async function runPauseHandler(onPause, message) {
  if (!onPause) {
    console.warn(
      "[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."
    );
    return;
  }
  console.info(message || "Execution paused for manual intervention.");
  await onPause({ message });
  console.info("Resuming execution...");
}

/* -------------------------------------------------------------------------- */
/* Session id generator (pooled random bytes, fixed alphabet)                  */
/* -------------------------------------------------------------------------- */

let randomPool;
let randomPoolOffset;

/** Returns `count` random bytes sliced from a refillable pool buffer. */
function randomBytes(count) {
  const size = count | 0;
  if (!randomPool || randomPool.length < size) {
    randomPool = Buffer.allocUnsafe(size * 128);
    nodeCrypto.randomFillSync(randomPool);
    randomPoolOffset = 0;
  } else if (randomPoolOffset + size > randomPool.length) {
    nodeCrypto.randomFillSync(randomPool);
    randomPoolOffset = 0;
  }
  randomPoolOffset += size;
  return randomPool.subarray(randomPoolOffset - size, randomPoolOffset);
}

/**
 * Builds an id generator over `alphabet`. Bytes are masked to the smallest
 * power-of-two range covering the alphabet; out-of-range values are skipped.
 */
function createIdGenerator(alphabet, defaultSize = 21) {
  const mask = (2 << (31 - Math.clz32((alphabet.length - 1) | 1))) - 1;
  const step = Math.ceil((1.6 * mask * defaultSize) / alphabet.length);
  return (size = defaultSize) => {
    let id = "";
    for (;;) {
      const bytes = randomBytes(step);
      for (let i = step - 1; i >= 0; i--) {
        id += alphabet[bytes[i] & mask] || "";
        if (id.length === size) return id;
      }
    }
  };
}

const generateSessionId = createIdGenerator("0123456789abcdefghijklmnopqrstuvwxyz", 12);

/**
 * Creates a promise that resolves when its own `release()` method is called.
 * Used as the engine's navigation lock.
 */
function createLock() {
  let release = () => {};
  const lock = new Promise((resolve) => {
    release = resolve;
  });
  lock.release = release;
  return lock;
}

/* -------------------------------------------------------------------------- */
/* FetchAction                                                                 */
/* -------------------------------------------------------------------------- */

/**
 * Base class for all actions. Subclasses define static `id`, `returnType`,
 * `capabilities` and implement `onExecute(context, options)`; optional
 * `onBeforeExec` / `onAfterExec` hooks are called when present.
 */
class FetchAction {
  /** Registers an action class under its static `id`. */
  static register(actionClass) {
    const id = actionClass.id;
    if (!id) throw new Error("FetchAction.register: actionClass.id is required");
    this.registry.set(id, actionClass);
  }

  static get(id) {
    return this.registry.get(id);
  }

  /**
   * Instantiates the action registered for `idOrOptions` (a string id, or an
   * object with `id` or `name`). Returns `undefined` for unknown ids.
   */
  static create(idOrOptions) {
    const id =
      typeof idOrOptions === "string" ? idOrOptions : idOrOptions.id || idOrOptions.name;
    if (!id) throw new Error("Action must have id or name");
    const ActionClass = this.registry.get(id);
    return ActionClass ? new ActionClass() : undefined;
  }

  static has(id) {
    return this.registry.has(id);
  }

  static list() {
    return Array.from(this.registry.keys());
  }

  /** Capability of this action for an engine mode; `"noop"` when undeclared. */
  static getCapability(mode) {
    return this.capabilities[mode] ?? "noop";
  }

  getCapability(mode) {
    return this.constructor.getCapability(mode);
  }

  get id() {
    return this.constructor.id;
  }

  get returnType() {
    return this.constructor.returnType;
  }

  get capabilities() {
    return this.constructor.capabilities;
  }

  /**
   * Calls `engine[method](...args)` on the context's engine.
   * @throws {Error} when no engine is attached or the method does not exist
   */
  async delegateToEngine(context, method, ...args) {
    const engine = context.internal.engine;
    if (!engine) throw new Error("No engine available");
    if (typeof engine[method] !== "function") {
      throw new Error(`Engine does not have a method named '${String(method)}'`);
    }
    return await engine[method](...args);
  }

  /**
   * Installs the collectors listed in `options.collectors`. Each collector is
   * itself an action, driven by events on `context.eventBus`:
   * `activateOn` -> `onBeforeExec`, `collectOn` -> `onExecute`,
   * `deactivateOn` -> `onAfterExec`.
   * @returns {{cleanup: Function, awaitExecPendings: Function}|undefined}
   */
  installCollectors(context, options) {
    const collectorConfigs = options?.collectors;
    if (!collectorConfigs?.length) return undefined;

    const cleanups = [];
    // Runs of non-background collectors that afterExec() must wait for.
    const pendingRuns = new Set();

    for (const config of collectorConfigs) {
      const activateOn = toArray(config.activateOn);
      const collectOn = toArray(config.collectOn);
      const deactivateOn = toArray(config.deactivateOn);
      const isForeground = !(config.background ?? true);
      const collector = FetchAction.create(config);
      if (!collector) continue;

      let activated = false;
      let deactivated = false;
      let settledRuns = 0;

      const activate = async () => {
        if (activated || deactivated) return;
        activated = true;
        try {
          await collector.onBeforeExec?.(context, config);
        } catch (error) {
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: collector.id,
            phase: "before",
            error,
          });
        }
      };

      const collect = async (event, payload) => {
        if (deactivated) return;
        if (!activated) await activate();
        try {
          const run = Promise.resolve(collector.onExecute?.(context, config, payload))
            .then((result) => {
              if (config.storeAs) {
                const outputs = context.outputs;
                if (!outputs[config.storeAs]) outputs[config.storeAs] = [];
                outputs[config.storeAs].push(result);
              }
              context.eventBus.emit("collector:result", {
                action: this.id,
                collector: config.id || config.name,
                event,
                result,
              });
              return result;
            })
            .catch((error) => {
              context.eventBus.emit("collector:error", {
                action: this.id,
                collector: config.id || config.name,
                event,
                phase: "exec",
                error,
              });
            })
            .finally(() => {
              settledRuns++;
            });
          if (isForeground) {
            pendingRuns.add(run);
            run.finally(() => pendingRuns.delete(run));
          }
        } catch (error) {
          // Reached only when onExecute throws synchronously.
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: collector.id,
            event,
            phase: "exec",
            error,
          });
        }
      };

      const deactivate = async () => {
        if (deactivated) return;
        // Give a collector that never completed a run one final pass. The call
        // must start before `deactivated` flips, and is deliberately not awaited.
        if (settledRuns === 0) collect("collector:after");
        deactivated = true;
        try {
          await collector.onAfterExec?.(context, config);
        } catch (error) {
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: config.id || config.name,
            phase: "after",
            error,
          });
        } finally {
          context.eventBus.emit("collector:end", {
            action: this.id,
            collector: config.id || config.name,
          });
          collectUnsubscribers.forEach((unsubscribe) => unsubscribe());
        }
      };

      const activateUnsubscribers = subscribeOnce(context, activateOn, activate);
      const collectUnsubscribers = subscribeEach(context, collectOn, collect);
      const deactivateUnsubscribers = subscribeOnce(context, deactivateOn, deactivate);
      cleanups.push(
        ...activateUnsubscribers,
        ...collectUnsubscribers,
        ...deactivateUnsubscribers
      );

      // With no triggers configured, run the collector once when the host action ends.
      if (!activateOn.length && !collectOn.length && !deactivateOn.length) {
        const endEvent = `action:${this.id}.end`;
        const onActionEnd = () => {
          deactivate();
        };
        context.eventBus.once(endEvent, onActionEnd);
        // FIX: cleanup used to call off("fetcher:action:end"), a different event
        // name, so this listener was never removed.
        cleanups.push(() => context.eventBus.off(endEvent, onActionEnd));
      }
    }

    if (!cleanups.length && pendingRuns.size === 0) return undefined;
    return {
      cleanup: () => cleanups.forEach((fn) => fn()),
      awaitExecPendings: async () => {
        if (pendingRuns.size > 0) await Promise.allSettled(Array.from(pendingRuns));
      },
    };
  }

  /**
   * Pushes this action on the context's action stack, emits the start events,
   * runs `onBeforeExec` and installs collectors.
   * @returns {{entry: object, collectors: object|undefined}} scope for afterExec
   */
  async beforeExec(context, options) {
    if (!context.internal.actionStack) context.internal.actionStack = [];
    const stack = context.internal.actionStack;
    const depth = stack.length;
    const parent = stack.length > 0 ? stack[stack.length - 1].id : undefined;
    const frame = { ...options, id: this.id, depth, parent };
    stack.push(frame);
    context.currentAction = frame;

    const entry = { action: this, context, options, depth, stack: [...stack] };
    context.eventBus.emit(`action:${this.id}.start`, entry);
    context.eventBus.emit("action:start", entry);
    await this.onBeforeExec?.(context, options);
    return { entry, collectors: this.installCollectors(context, options) };
  }

  /**
   * Records the result on the context, runs `onAfterExec`, emits the end
   * events, then always cleans up collectors and pops the action stack.
   */
  async afterExec(context, options, result, scope) {
    const stack = context.internal.actionStack;
    const depth = stack.length - 1;
    const collectors = scope?.collectors;
    try {
      await collectors?.awaitExecPendings();
      context.lastResult = result;
      if (result?.returnType === "response" && !result.error) {
        context.lastResponse = result.result;
      }
      if (options?.storeAs) context.outputs[options.storeAs] = result?.result;
      if (result?.error) context.currentAction.error = result.error;
      await this.onAfterExec?.(context, options);

      const payload = { action: this, context, options, result, depth, stack: [...stack] };
      if (result?.error) payload.error = result.error;
      // Best effort: a throwing listener must not break action teardown.
      try {
        context.eventBus.emit(`action:${this.id}.end`, payload);
      } catch (listenerError) {
        /* intentionally ignored */
      }
      try {
        context.eventBus.emit("action:end", payload);
      } catch (listenerError) {
        /* intentionally ignored */
      }
    } finally {
      try {
        collectors?.cleanup();
      } finally {
        stack.pop();
        const remaining = stack.length;
        context.currentAction = remaining > 0 ? stack[remaining - 1] : undefined;
      }
    }
  }

  /**
   * Runs the action. A return value from `onExecute` without a `returnType`
   * is wrapped into a Success result.
   *
   * NOTE(review): `throwHttpErrors` defaults to true when `failOnError` is
   * unset, but a thrown error is only rethrown when `failOnError` is
   * explicitly truthy; otherwise it is returned as a Failed result. Kept
   * as-is because callers may rely on it.
   */
  async execute(context, options) {
    const scope = await this.beforeExec(context, options);
    let result;
    try {
      context.throwHttpErrors = options?.failOnError ?? true;
      result = await this.onExecute(context, options);
      if (!(result && result.returnType)) {
        result = {
          status: FetchActionResultStatus.Success,
          returnType: this.returnType ?? "any",
          result,
        };
      }
      return result;
    } catch (error) {
      result = {
        status: FetchActionResultStatus.Failed,
        error,
        meta: {
          id: this.id,
          engineType: context.engine,
          capability: this.getCapability(context.engine),
        },
      };
      if (options?.failOnError) throw error;
      return result;
    } finally {
      await this.afterExec(context, options, result, scope);
    }
  }
}

FetchAction.registry = new Map();
FetchAction.returnType = "any";
FetchAction.capabilities = { http: "noop", browser: "noop" };

/* -------------------------------------------------------------------------- */
/* FetchEngine                                                                 */
/* -------------------------------------------------------------------------- */

// Module-level side effect: turn off Crawlee's on-disk storage persistence.
Configuration.getGlobalConfig().set("persistStorage", false);

/**
 * Base engine. A concrete engine supplies `buildResponse`, `executeAction`,
 * `_querySelectorAll`, `_extractValue`, `_createCrawler`,
 * `_getSpecificCrawlerOptions` and `goto`.
 *
 * The crawler's request handler stays alive after the page loads and executes
 * actions sent through `dispatchAction()` until a `dispose` action arrives.
 */
class FetchEngine {
  constructor() {
    this.hdrs = {};
    this.jar = [];
    this.pendingRequests = new Map();
    this.requestCounter = 0;
    this.actionEmitter = new EventEmitter();
    this.isPageActive = false;
    // Starts released so the first goto() does not wait.
    const initialLock = createLock();
    initialLock.release();
    this.navigationLock = initialLock;
    this.blockedTypes = new Set();
  }

  /** Registers an engine class under its static `id`; duplicate ids throw. */
  static register(engineClass) {
    const id = engineClass.id;
    if (!id) throw new Error("Engine must define static id");
    if (this.registry.has(id)) throw new Error(`Engine id duplicated: ${id}`);
    this.registry.set(id, engineClass);
  }

  static get(id) {
    return this.registry.get(id);
  }

  /** Returns the first registered engine whose static `mode` matches. */
  static getByMode(mode) {
    for (const [, engineClass] of this.registry.entries()) {
      if (engineClass.mode === mode) return engineClass;
    }
    return undefined;
  }

  /**
   * Creates and initializes an engine for `context`. `options.engine` may be
   * an engine id or a mode. Resolves to `undefined` when nothing matches.
   */
  static async create(context, options) {
    const resolved = defaultsDeep(options, context, DefaultFetcherProperties);
    const wanted = resolved.engine ?? context.engine;
    const EngineClass = wanted ? this.get(wanted) ?? this.getByMode(wanted) : null;
    if (!EngineClass) return undefined;
    const engine = new EngineClass();
    await engine.initialize(context, resolved);
    return engine;
  }

  /**
   * Recursively extracts data described by `schema` starting at `scope`.
   * `object` schemas yield an object of extracted properties, `array` schemas
   * an array of extracted items, and anything else a single value.
   */
  async _extract(schema, scope) {
    const type = schema.type;
    if (!scope) return type === "array" ? [] : null;

    if (type === "object") {
      const { selector, properties } = schema;
      let root = scope;
      if (selector) {
        const matches = await this._querySelectorAll(scope, selector);
        root = matches.length > 0 ? matches[0] : null;
      }
      if (!root) return null;
      const record = {};
      for (const key in properties) {
        record[key] = await this._extract(properties[key], root);
      }
      return record;
    }

    if (type === "array") {
      const { selector, items } = schema;
      const elements = selector ? await this._querySelectorAll(scope, selector) : [scope];
      const values = [];
      for (const element of elements) {
        values.push(await this._extract(items, element));
      }
      return values;
    }

    const { selector } = schema;
    let target = scope;
    if (selector) {
      const matches = await this._querySelectorAll(scope, selector);
      target = matches.length > 0 ? matches[0] : null;
    }
    return target ? this._extractValue(schema, target) : null;
  }

  waitFor(options) {
    return this.dispatchAction({ type: "waitFor", options });
  }

  click(selector) {
    return this.dispatchAction({ type: "click", selector });
  }

  fill(selector, value) {
    return this.dispatchAction({ type: "fill", selector, value });
  }

  submit(selector, options) {
    return this.dispatchAction({ type: "submit", selector, options });
  }

  pause(message) {
    return this.dispatchAction({ type: "pause", message });
  }

  extract(schema) {
    return this.dispatchAction({ type: "extract", schema: this._normalizeSchema(schema) });
  }

  /**
   * Returns a normalized deep copy of `schema`:
   *  - an array with `attribute` but no `items` gets `items: {attribute}`,
   *  - an array without `items` defaults to string items,
   *  - `has` / `exclude` are folded into each comma-separated selector part
   *    as `:has(...)` / `:not(...)`.
   */
  _normalizeSchema(schema) {
    const copy = JSON.parse(JSON.stringify(schema));
    if (copy.properties) {
      for (const key in copy.properties) {
        copy.properties[key] = this._normalizeSchema(copy.properties[key]);
      }
    }
    if (copy.items) copy.items = this._normalizeSchema(copy.items);
    if (copy.type === "array") {
      if (copy.attribute && !copy.items) {
        copy.items = { attribute: copy.attribute };
        delete copy.attribute;
      }
      if (!copy.items) copy.items = { type: "string" };
    }
    if (copy.selector && (copy.has || copy.exclude)) {
      const { selector, has, exclude } = copy;
      copy.selector = selector
        .split(",")
        .map((part) => {
          let refined = part.trim();
          if (has) refined = `${refined}:has(${has})`;
          if (exclude) refined = `${refined}:not(${exclude})`;
          return refined;
        })
        .join(", ");
      delete copy.has;
      delete copy.exclude;
    }
    return copy;
  }

  get id() {
    return this.constructor.id;
  }

  get mode() {
    return this.constructor.mode;
  }

  get context() {
    return this.ctx;
  }

  /**
   * Binds the engine to `context` (merging `options` into it), opens a
   * request queue and starts the crawler in the background. Calling it again
   * on an initialized engine does nothing.
   */
  async initialize(context, options) {
    if (this.ctx) return;
    merge(context, options);
    this.ctx = context;
    this.opts = context;
    this.hdrs = normalizeHeaderNames(context.headers);
    this.jar = [...(context.cookies ?? [])];
    if (!context.internal) context.internal = {};
    context.internal.engine = this;
    context.engine = this.mode;
    this.requestQueue = await RequestQueue.open();

    const engineOptions = await this._getSpecificCrawlerOptions(context);
    const crawlerOptions = {
      ...defaultsDeep(engineOptions, {
        requestQueue: this.requestQueue,
        maxConcurrency: 1,
        minConcurrency: 1,
        useSessionPool: true,
        persistCookiesPerSession: true,
        sessionPoolOptions: {
          maxPoolSize: 1,
          persistenceOptions: { enable: false },
          sessionOptions: { maxUsageCount: 1000, maxErrorScore: 3 },
        },
      }),
      requestHandler: this._requestHandler.bind(this),
      errorHandler: this._failedRequestHandler.bind(this),
      failedRequestHandler: this._failedRequestHandler.bind(this),
    };
    this.crawler = this._createCrawler(crawlerOptions);
    // Deliberately not awaited: the crawler keeps running while actions are dispatched.
    this.crawler
      .run()
      .then(() => {
        this.isCrawlerReady = true;
      })
      .catch((error) => {
        this.isCrawlerReady = false;
        console.error("Crawler background error:", error);
      });
  }

  /** Runs engine-specific and common cleanup, then detaches from the context. */
  async cleanup() {
    await this._cleanup?.();
    await this._commonCleanup();
    const context = this.ctx;
    if (context && context.internal?.engine === this) context.internal.engine = undefined;
    this.ctx = undefined;
    this.opts = undefined;
  }

  /**
   * Keeps the request handler alive: executes every dispatched action against
   * `crawlingContext` and resolves only after a `dispose` action is received.
   */
  async _executePendingActions(crawlingContext) {
    await new Promise((done) => {
      const onDispatch = async ({ action, resolve, reject }) => {
        try {
          if (action.type === "dispose") {
            this.actionEmitter.emit("dispose");
            resolve();
            return;
          }
          resolve(await this.executeAction(crawlingContext, action));
        } catch (error) {
          reject(error);
        }
      };
      this.actionEmitter.on("dispatch", onDispatch);
      this.actionEmitter.once("dispose", () => {
        this.actionEmitter.removeListener("dispatch", onDispatch);
        done();
      });
    });
  }

  /**
   * Settles the pending `goto()` promise for this request (rejecting on a
   * missing or >= 400 status when `throwHttpErrors` is set), then serves
   * actions until disposed.
   */
  async _sharedRequestHandler(crawlingContext) {
    try {
      const { request } = crawlingContext;
      this.isPageActive = true;
      const pending = this.pendingRequests.get(request.userData.requestId);
      if (pending) {
        const response = await this.buildResponse(crawlingContext);
        const isHttpError = !response.statusCode || response.statusCode >= 400;
        if (this.ctx?.throwHttpErrors && isHttpError) {
          pending.reject(
            new CommonError(
              `Request for ${response.finalUrl} failed with status ${response.statusCode || "N/A"}`,
              "request",
              response.statusCode
            )
          );
        } else {
          this.lastResponse = response;
          pending.resolve(response);
        }
        this.pendingRequests.delete(request.userData.requestId);
      }
      await this._executePendingActions(crawlingContext);
    } finally {
      this.isPageActive = false;
      this.navigationLock.release();
    }
  }

  /**
   * Failure path: rejects the pending `goto()` when `throwHttpErrors` is set,
   * then falls through to the shared handler so actions are still served.
   */
  async _sharedFailedRequestHandler(crawlingContext, error) {
    const { request } = crawlingContext;
    const pending = this.pendingRequests.get(request.userData.requestId);
    if (pending && error && this.ctx?.throwHttpErrors) {
      this.pendingRequests.delete(request.userData.requestId);
      const response = error.response;
      const statusCode = response?.statusCode || 500;
      const url = response?.url ? response.url : request.url;
      pending.reject(
        new CommonError(
          `Request${url ? " for " + url : ""} failed: ${error.message}`,
          "request",
          statusCode
        )
      );
    }
    return this._sharedRequestHandler(crawlingContext);
  }

  /**
   * Sends an action to the live request handler and resolves with its result.
   * @throws {Error} when no page is active
   */
  async dispatchAction(action) {
    if (!this.isPageActive) {
      throw new Error("No active page. Call goto() before performing actions.");
    }
    return new Promise((resolve, reject) => {
      this.actionEmitter.emit("dispatch", { action, resolve, reject });
    });
  }

  async _requestHandler(crawlingContext) {
    await this._sharedRequestHandler(crawlingContext);
  }

  async _failedRequestHandler(crawlingContext, error) {
    await this._sharedFailedRequestHandler(crawlingContext, error);
  }

  /** Disposes the active page, rejects pending requests, tears down crawler and queue. */
  async _commonCleanup() {
    if (this.isPageActive) {
      await this.dispatchAction({ type: "dispose" }).catch(() => {});
    }
    if (this.pendingRequests.size > 0) {
      for (const [, pending] of this.pendingRequests) {
        pending.reject(new Error("Cleanup:Request cancelled"));
      }
    }
    this.actionEmitter.removeAllListeners();
    if (this.crawler) {
      try {
        await this.crawler.teardown?.();
      } catch (error) {
        // FIX: message previously read "ccrawler teardown error:".
        console.error("crawler teardown error:", error);
      }
      this.crawler = undefined;
    }
    this.isCrawlerReady = undefined;
    if (this.requestQueue) {
      await this.requestQueue.drop();
      this.requestQueue = undefined;
    }
    this.pendingRequests.clear();
  }

  /**
   * Adds resource types to the blocked set (clearing it first when
   * `overwrite` is truthy). Returns the number of types passed in.
   */
  async blockResources(types, overwrite) {
    if (overwrite) this.blockedTypes.clear();
    types.forEach((type) => this.blockedTypes.add(type));
    return types.length;
  }

  /** Resolves with the last response; rejects when nothing was fetched yet. */
  getContent() {
    return this.lastResponse
      ? Promise.resolve(this.lastResponse)
      : Promise.reject(new Error("No content fetched yet. Call goto() first."));
  }

  /**
   * Header accessor with four call shapes:
   *  - `headers()` -> copy of all headers
   *  - `headers(name)` -> the value, or `""` when unset
   *  - `headers(object, replace?)` -> merge (or replace when `replace === true`); returns true
   *  - `headers(name, value | null)` -> set or delete one header; returns true
   * Any other input returns false.
   */
  async headers(nameOrHeaders, value) {
    if (nameOrHeaders === undefined) return { ...this.hdrs };
    if (typeof nameOrHeaders === "string" && value === undefined) {
      return this.hdrs[nameOrHeaders.toLowerCase()] || "";
    }
    if (nameOrHeaders !== null && typeof nameOrHeaders === "object") {
      const incoming = {};
      for (const [name, headerValue] of Object.entries(nameOrHeaders)) {
        incoming[name.toLowerCase()] = String(headerValue);
      }
      this.hdrs = value === true ? incoming : { ...this.hdrs, ...incoming };
      return true;
    }
    if (typeof nameOrHeaders !== "string") return false;
    if (typeof value === "string") {
      this.hdrs[nameOrHeaders.toLowerCase()] = value;
    } else if (value === null) {
      delete this.hdrs[nameOrHeaders.toLowerCase()];
    }
    return true;
  }

  /** `cookies(array)` replaces the jar, `cookies(null)` clears it, otherwise returns a copy. */
  async cookies(cookies) {
    if (Array.isArray(cookies)) {
      this.jar = [...cookies];
      return true;
    }
    if (cookies === null) {
      this.jar = [];
      return true;
    }
    return [...this.jar];
  }

  async dispose() {
    await this.cleanup();
  }
}

FetchEngine.registry = new Map();

/**
 * Finds the site entry for `url`: exact hostname match first, then a hostname
 * suffix match. An entry with `pathScope` only applies when the URL path
 * starts with one of the scopes.
 */
function findSiteConfig(url, sites) {
  if (!url || !sites?.length) return null;
  const parsed = new URL(url);
  const site =
    sites.find((entry) => entry.domain === parsed.hostname) ||
    sites.find((entry) => parsed.hostname.endsWith(entry.domain));
  if (!site) return null;
  if (site.pathScope?.length) {
    const inScope = site.pathScope.some((prefix) => parsed.pathname.startsWith(prefix));
    if (!inScope) return null;
  }
  return site;
}

/**
 * Creates the engine for `context`, preferring `context.engine`, then the
 * matching site's engine, then `"auto"`. Falls back to the `http` mode when
 * the preferred engine cannot be resolved.
 */
async function createEngineForContext(context, target) {
  const site = findSiteConfig(target?.url || context.url, context.sites);
  const preferred = context.engine || site?.engine || "auto";
  const engine = await FetchEngine.create(context, { engine: preferred });
  return engine || (await FetchEngine.create(context, { engine: "http" }));
}

/* -------------------------------------------------------------------------- */
/* FetchSession / WebFetcher                                                   */
/* -------------------------------------------------------------------------- */

/** One fetch context plus its lazily created engine; executes actions in order. */
class FetchSession {
  constructor(options = {}) {
    this.options = options;
    this.closed = false;
    this.id = generateSessionId();
    this.context = this.createContext(options);
  }

  /**
   * Executes one action (creating the engine on first use).
   * @throws {Error} when the session is closed or the action id is unknown
   */
  async execute(actionOptions) {
    await this.ensureEngine(actionOptions);
    const action = FetchAction.create(actionOptions);
    if (!action) {
      throw new Error(`Unknown action: ${actionOptions.id || actionOptions.name}`);
    }
    const internal = this.context.internal;
    internal.actionIndex = (internal.actionIndex || 0) + 1;
    this.context.currentAction = {
      ...actionOptions,
      index: internal.actionIndex,
      startedAt: Date.now(),
    };
    try {
      return await action.execute(this.context, actionOptions);
    } finally {
      this.context.currentAction = undefined;
    }
  }

  /**
   * Executes `actions` sequentially, then a final `getContent`.
   * @returns {Promise<{result: any, outputs: object}>}
   */
  async executeAll(actions) {
    for (const actionOptions of actions) {
      await this.execute(actionOptions);
    }
    const content = await this.execute({ id: "getContent" });
    return { result: content?.result, outputs: this.getOutputs() };
  }

  getOutputs() {
    return this.context.outputs;
  }

  /** Disposes the engine once; emits `session:closing` and `session:closed`. */
  async dispose() {
    if (this.closed) return;
    const eventBus = this.context.eventBus;
    eventBus.emit("session:closing", { sessionId: this.id });
    try {
      await this.context.internal.engine?.dispose();
    } finally {
      this.closed = true;
    }
    eventBus.emit("session:closed", { sessionId: this.id });
  }

  async ensureEngine(actionOptions) {
    if (this.closed) throw new Error("Session is closed");
    if (this.context.internal.engine) return;
    const url = actionOptions?.params?.url ?? this.context.url;
    const engine = await createEngineForContext(this.context, { url });
    if (!engine) throw new Error("No engine found");
  }

  /** Builds the context: the given options plus session plumbing, defaults filled in. */
  createContext(options = this.options) {
    const eventBus = new EventEmitter();
    return defaultsDeep(
      {
        ...options,
        id: this.id,
        eventBus,
        outputs: {},
        internal: {},
        execute: async (actionOptions) => this.execute(actionOptions),
        // Regular function on purpose: `this` is the context it is called on.
        action: async function (name, params, extra) {
          return this.execute({ name, params, ...extra });
        },
      },
      DefaultFetcherProperties
    );
  }
}

/** Facade that creates a session per fetch and always disposes it. */
class WebFetcher {
  constructor(defaults = {}) {
    this.defaults = defaults;
  }

  async createSession(options) {
    return new FetchSession({ ...this.defaults, ...(options || {}) });
  }

  /**
   * `fetch(url, options)` or `fetch(options)` with `options.url`.
   * A `goto` for the URL is prepended unless the first action already
   * navigates to it.
   */
  async fetch(url, options) {
    let targetUrl = url;
    let fetchOptions = options;
    if (typeof targetUrl !== "string") {
      fetchOptions = targetUrl;
      targetUrl = fetchOptions.url;
    }
    const session = await this.createSession(fetchOptions);
    try {
      // FIX: work on a copy; unshift() used to mutate the caller's `actions` array.
      const actions = [...(fetchOptions?.actions || [])];
      const gotoIndex = actions.findIndex(
        (action) => action.id === "goto" && action.params?.url === targetUrl
      );
      if (targetUrl && gotoIndex !== 0) {
        actions.unshift({ id: "goto", params: { url: targetUrl } });
      }
      return await session.executeAll(actions);
    } finally {
      await session.dispose();
    }
  }
}

/* -------------------------------------------------------------------------- */
/* CheerioFetchEngine (http mode)                                              */
/* -------------------------------------------------------------------------- */

/** HTTP engine on Crawlee's CheerioCrawler; interactive actions are simulated. */
class CheerioFetchEngine extends FetchEngine {
  /** Builds a response from the crawling context; prefers Cheerio's serialized HTML. */
  async buildResponse(crawlingContext) {
    const { request, response, body, $ } = crawlingContext;
    const serialized = $?.html();
    let html = bodyToString(body);
    if (serialized && serialized !== html) html = serialized;
    return {
      url: request.url,
      finalUrl: request.loadedUrl || request.url,
      statusCode: response?.statusCode ?? 200,
      statusText: response?.statusMessage,
      headers: response?.headers,
      body,
      html,
      text: html,
    };
  }

  async _querySelectorAll(scope, selector) {
    const { $, el } = scope;
    return el
      .find(selector)
      .toArray()
      .map((node) => ({ $, el: $(node) }));
  }

  /** Reads an attribute, inner HTML or trimmed text and coerces it to `schema.type`. */
  async _extractValue(schema, scope) {
    const { el } = scope;
    const { attribute, type = "string" } = schema;
    if (el.length === 0) return null;

    let raw;
    if (attribute) raw = el.attr(attribute) ?? null;
    else if (type === "html") raw = el.html();
    else raw = el.text().trim();
    if (raw === null) return null;

    switch (type) {
      case "number":
        return parseNumericText(raw);
      case "boolean":
        return parseBooleanText(raw);
      default:
        return raw;
    }
  }

  /**
   * Executes one dispatched action against the current Cheerio document.
   * @throws {CommonError|NotFoundError} on missing context, selectors or unsupported actions
   */
  async executeAction(crawlingContext, action) {
    const { $ } = crawlingContext;
    const baseUrl = crawlingContext.request.loadedUrl || crawlingContext.request.url;

    switch (action.type) {
      case "dispose":
        return undefined;

      case "extract":
        if (!$) {
          throw new CommonError(
            `Cheerio context not available for action: ${action.type}`,
            "extract"
          );
        }
        return this._extract(action.schema, { $, el: $.root() });

      case "click": {
        if (!$) {
          throw new CommonError(
            `Cheerio context not available for action: ${action.type}`,
            "click"
          );
        }
        const selector = action.selector;
        const target = $(selector).first();
        let targetUrl;
        if (target.length === 0) {
          // Nothing matched: treat the selector itself as a (relative) URL.
          try {
            targetUrl = new URL(selector, baseUrl).href;
          } catch {
            throw new CommonError(
              `click: selector not found or invalid URL: ${selector}`,
              "click"
            );
          }
        } else if (target.is("a") && target.attr("href")) {
          targetUrl = new URL(target.attr("href"), baseUrl).href;
        } else if (target.is('input[type="submit"], button[type="submit"], button, input')) {
          const form = target.closest("form");
          if (form.length) {
            return this.executeAction(crawlingContext, { type: "submit", selector: form });
          }
          throw new CommonError("click: submit-like element without form", "click");
        } else {
          throw new CommonError(
            `click: unsupported element for http simulate. Selector: ${selector}`,
            "click"
          );
        }
        const navigation = await crawlingContext.sendRequest({ url: targetUrl });
        this._updateStateAfterNavigation(crawlingContext, navigation);
        return undefined;
      }

      case "fill": {
        if (!$) {
          // FIX: "fill" used to sit outside the constructor call (comma
          // operator), so the error name was never passed.
          throw new CommonError(
            `Cheerio context not available for action: ${action.type}`,
            "fill"
          );
        }
        const field = $(action.selector).first();
        if (field.length === 0) {
          throw new CommonError(`fill: selector not found: ${action.selector}`, "fill");
        }
        if (!field.is("input, textarea, select")) {
          throw new CommonError(`fill: not a form field: ${action.selector}`, "fill");
        }
        field.val(action.value);
        // FIX: buildResponse() is async; without await, lastResponse held a Promise.
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "waitFor":
        if (action.options?.ms) {
          await new Promise((resolve) => setTimeout(resolve, action.options.ms));
        }
        return undefined;

      case "pause":
        await runPauseHandler(this.ctx?.onPause, action.message);
        return undefined;

      case "submit": {
        if (!$) {
          throw new CommonError(
            `Cheerio context not available for action: ${action.type}`,
            "submit"
          );
        }
        let form;
        if (typeof action.selector === "string") form = $(action.selector).first();
        else if (action.selector != null) form = action.selector; // already a Cheerio selection
        else form = $("form").first();
        if (form.length === 0) throw new NotFoundError(action.selector, "submit");

        const actionAttr = form.attr("action") || baseUrl;
        const method = (form.attr("method") || "GET").toUpperCase();
        const submitUrl = new URL(actionAttr, baseUrl).href;

        const fields = {};
        form.find("input, select, textarea").each((index, node) => {
          const field = $(node);
          const name = field.attr("name");
          if (!name) return;
          const fieldValue = field.val();
          if (fieldValue != null) fields[name] = String(fieldValue);
        });

        let navigation;
        if (method === "GET") {
          const getUrl = new URL(submitUrl);
          Object.entries(fields).forEach(([name, fieldValue]) =>
            getUrl.searchParams.set(name, fieldValue)
          );
          navigation = await crawlingContext.sendRequest({ url: getUrl.href, method: "GET" });
        } else {
          const enctype =
            action.options?.enctype ||
            form.attr("enctype") ||
            "application/x-www-form-urlencoded";
          const headers = {};
          let payload;
          if (enctype === "application/json") {
            payload = JSON.stringify(fields);
            headers["Content-Type"] = "application/json";
          } else {
            payload = new URLSearchParams(fields).toString();
            headers["Content-Type"] = "application/x-www-form-urlencoded";
          }
          // Every non-GET form method is sent as POST.
          navigation = await crawlingContext.sendRequest({
            url: submitUrl,
            method: "POST",
            body: payload,
            headers,
          });
        }
        this._updateStateAfterNavigation(crawlingContext, navigation);
        return undefined;
      }

      case "getContent":
        return this.buildResponse(crawlingContext);

      default:
        throw new CommonError(
          `Unknown action type: ${action.type}`,
          "CheerioFetchEngine.executeAction",
          ErrorCode.NotSupported
        );
    }
  }

  /** Stores the follow-up response and reloads Cheerio when the body is HTML. */
  _updateStateAfterNavigation(crawlingContext, navigation) {
    const source = navigation.response || navigation;
    const { body, headers, statusCode, statusMessage } = source;
    const { url, loadedUrl } = navigation;
    const html = bodyToString(body);
    if (headers && headers["content-type"]?.includes("html")) {
      crawlingContext.$ = cheerio.load(html);
    }
    this.lastResponse = {
      url,
      finalUrl: loadedUrl || url,
      statusCode,
      statusText: statusMessage,
      headers: headers || {},
      body,
      html,
      text: html,
    };
  }

  _createCrawler(options) {
    return new CheerioCrawler(options);
  }

  _getSpecificCrawlerOptions(context) {
    const proxy = this.opts?.proxy;
    let proxyUrls;
    if (proxy) proxyUrls = typeof proxy === "string" ? [proxy] : proxy;
    const proxyConfiguration = proxyUrls?.length
      ? new ProxyConfiguration({ proxyUrls })
      : undefined;
    return {
      additionalMimeTypes: ["text/plain"],
      maxRequestRetries: 1,
      requestHandlerTimeoutSecs: Math.max(
        5,
        Math.floor((this.opts?.timeoutMs || 30000) / 1000)
      ),
      proxyConfiguration,
      preNavigationHooks: [
        (crawlingContext, requestOptions) => {
          requestOptions.throwHttpErrors = context.throwHttpErrors;
          if (this.opts?.timeoutMs) {
            requestOptions.timeout = { request: this.opts.timeoutMs };
          }
        },
      ],
    };
  }

  /**
   * Navigates to `url`: disposes the current page, enqueues a request and
   * resolves with the response, or rejects on error/timeout.
   */
  async goto(url, options) {
    if (this.isPageActive) {
      // Fire and forget: the previous handler releases the navigation lock on exit.
      this.dispatchAction({ type: "dispose" }).catch(() => {});
    }
    const requestId = "req-" + ++this.requestCounter;
    const response = new Promise((resolve, reject) => {
      const timeoutMs = options?.timeoutMs || this.opts?.timeoutMs || 30000;
      const timer = setTimeout(() => {
        this.pendingRequests.delete(requestId);
        this.navigationLock.release();
        reject(
          new CommonError(
            `goto timed out after ${timeoutMs}ms.`,
            "gotoTimeout",
            ErrorCode.RequestTimeout
          )
        );
      }, timeoutMs);
      this.pendingRequests.set(requestId, {
        resolve: (value) => {
          clearTimeout(timer);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timer);
          reject(error);
        },
      });
    });

    this.requestQueue
      .addRequest({
        ...options,
        url,
        headers: { ...this.hdrs, ...options?.headers },
        userData: { requestId },
        uniqueKey: `${url}-${requestId}`,
      })
      .catch((error) => {
        const pending = this.pendingRequests.get(requestId);
        if (pending) {
          this.pendingRequests.delete(requestId);
          this.navigationLock.release();
          pending.reject(error);
        }
      });

    await this.navigationLock;
    this.navigationLock = createLock();
    return response;
  }
}

CheerioFetchEngine.id = "cheerio";
CheerioFetchEngine.mode = "http";
FetchEngine.register(CheerioFetchEngine);

/* -------------------------------------------------------------------------- */
/* PlaywrightFetchEngine (browser mode)                                        */
/* -------------------------------------------------------------------------- */

/** Browser engine on Crawlee's PlaywrightCrawler. */
class PlaywrightFetchEngine extends FetchEngine {
  /** Builds a response from the live page; returns empty bodies when the page is gone. */
  async buildResponse(crawlingContext) {
    const { page, response, request } = crawlingContext;
    if (!page || page.isClosed()) {
      return {
        url: request.url,
        finalUrl: request.loadedUrl || request.url,
        statusCode: response?.status(),
        statusText: response?.statusText(),
        headers: (await response?.allHeaders()) || {},
        body: "",
        html: "",
        text: "",
      };
    }
    const html = await page.content();
    const text = await page.textContent("body");
    return {
      url: page.url(),
      finalUrl: page.url(),
      statusCode: response?.status(),
      statusText: response?.statusText(),
      headers: (await response?.allHeaders()) || {},
      body: html,
      html,
      text: text || "",
    };
  }

  async _querySelectorAll(scope, selector) {
    return scope.locator(selector).all();
  }

  /** Reads an attribute, inner HTML or text from a locator and coerces it to `schema.type`. */
  async _extractValue(schema, locator) {
    const { attribute, type = "string" } = schema;
    if ((await locator.count()) === 0) return null;

    let raw;
    if (attribute) raw = await locator.getAttribute(attribute);
    else if (type === "html") raw = await locator.innerHTML();
    else raw = await locator.textContent();
    if (raw === null) return null;

    raw = raw.trim();
    switch (type) {
      case "number":
        return parseNumericText(raw);
      case "boolean":
        return parseBooleanText(raw);
      default:
        return raw;
    }
  }

  /**
   * Executes one dispatched action against the live page.
   * @throws {CommonError|NotFoundError} on missing forms or unsupported actions
   */
  async executeAction(crawlingContext, action) {
    const { page } = crawlingContext;
    const timeout = this.opts?.timeoutMs || 30000;

    switch (action.type) {
      case "navigate": {
        const navigation = await page.goto(action.url, {
          waitUntil: action.opts?.waitUntil || "domcontentloaded",
          timeout: this.opts?.timeoutMs || 30000,
        });
        const scoped = navigation
          ? { ...crawlingContext, response: navigation }
          : crawlingContext;
        const response = await this.buildResponse(scoped);
        this.lastResponse = response;
        return response;
      }

      case "extract":
        return this._extract(action.schema, page.locator("body"));

      case "click": {
        await page.click(action.selector, { timeout });
        await page.waitForLoadState("networkidle", { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "fill": {
        await page.fill(action.selector, action.value, { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "waitFor":
        if (action.options?.selector) {
          await page.waitForSelector(action.options.selector, { timeout });
        }
        if (action.options?.networkIdle) {
          await page.waitForLoadState("networkidle", { timeout });
        }
        if (action.options?.ms) await page.waitForTimeout(action.options.ms);
        return undefined;

      case "submit": {
        const selector = action.selector || "form";
        const form = page.locator(selector).first();
        if ((await form.count()) === 0) throw new NotFoundError(selector, "submit");

        const enctype = action.options?.enctype || "application/x-www-form-urlencoded";
        if (enctype === "application/json") {
          const handle = await form.elementHandle();
          if (!handle) {
            throw new CommonError(
              `submit: could not get form handle for ${selector}`,
              "submit"
            );
          }
          // Runs inside the page, so it must not reference anything from this module.
          const response = await handle.evaluate(async (formElement) => {
            const formData = new FormData(formElement);
            const fields = {};
            formData.forEach((fieldValue, name) => {
              fields[name] = fieldValue.toString();
            });
            const reply = await fetch(formElement.action, {
              method: formElement.method,
              headers: { "Content-Type": "application/json" },
              body: JSON.stringify(fields),
            });
            const replyText = await reply.text();
            return {
              status: reply.status,
              // Added alongside `status` so this matches the `statusCode`
              // field every other response carries.
              statusCode: reply.status,
              statusText: reply.statusText,
              headers: Object.fromEntries(reply.headers.entries()),
              body: replyText,
              html: replyText,
              text: replyText,
              url: formElement.action,
              finalUrl: reply.url,
            };
          });
          await handle.dispose();
          await page.setContent(response.html);
          this.lastResponse = response;
          return undefined;
        }

        await form.evaluate((formElement) => formElement.submit());
        await page.waitForLoadState("networkidle", { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "pause":
        await runPauseHandler(this.ctx?.onPause, action.message);
        return undefined;

      case "getContent":
        return this.buildResponse(crawlingContext);

      default:
        throw new CommonError(
          `Unknown action type: ${action.type}`,
          "PlaywrightFetchEngine.executeAction",
          ErrorCode.NotSupported
        );
    }
  }

  _createCrawler(options) {
    return new PlaywrightCrawler(options);
  }

  /**
   * Builds PlaywrightCrawler options: cookie and resource-blocking
   * pre-navigation hooks, plus the Camoufox/Firefox launch setup when
   * `antibot` is enabled.
   */
  async _getSpecificCrawlerOptions(context) {
    const headless = context.browser?.headless ?? true;
    const options = {
      // NOTE(review): `||` means an explicit `retries: 0` (the package default)
      // still becomes 3. Kept as-is; switching to `??` would change behavior
      // for every default configuration.
      maxRequestRetries: context.retries || 3,
      headless,
      preNavigationHooks: [
        async ({ page, request }, gotoOptions) => {
          gotoOptions.throwHttpErrors = context.throwHttpErrors;
          if (this.jar.length > 0) {
            // FIX: Playwright's addCookies() accepts either `url` or a
            // `domain`/`path` pair, never both. The previous code always sent
            // `url` and `domain` together. Now always send a domain/path pair,
            // defaulting the domain to the request's hostname.
            const { hostname } = new URL(request.url);
            const cookies = this.jar.map(({ url: ignoredUrl, ...cookie }) => ({
              ...cookie,
              domain: cookie.domain || hostname,
              path: cookie.path || "/",
            }));
            await page.context().addCookies(cookies);
          }
          const blockedTypes = this.blockedTypes;
          if (blockedTypes.size > 0) {
            await page.route("**/*", (route) => {
              if (blockedTypes.has(route.request().resourceType())) route.abort();
              else route.continue();
            });
          }
        },
      ],
    };

    if (this.opts?.antibot) {
      console.log("[DEBUG] antibot enabled, configuring camoufox...");
      options.browserPoolOptions = { useFingerprints: false };
      console.log("[DEBUG] Calling launchOptions...");
      const camoufoxOptions = await launchOptions({ headless });
      console.log("[DEBUG] launchOptions returned.");
      options.launchContext = { launcher: firefox, launchOptions: camoufoxOptions };
      options.postNavigationHooks = [
        async ({ page, handleCloudflareChallenge }) => {
          console.log(
            `[DEBUG] In postNavigationHook for ${page.url()}. Calling handleCloudflareChallenge...`
          );
          await handleCloudflareChallenge();
          console.log("[DEBUG] handleCloudflareChallenge returned.");
        },
      ];
      console.log("[DEBUG] camoufox configuration complete.");
    }
    return options;
  }

  /**
   * Navigates to `url`. With an active page the navigation is dispatched to
   * it; otherwise a request is enqueued and the promise settles from the
   * request handler.
   */
  async goto(url, options) {
    if (this.isPageActive) {
      return this.dispatchAction({ type: "navigate", url, opts: options });
    }
    if (!this.requestQueue) {
      throw new CommonError("RequestQueue not initialized", "goto");
    }
    const requestId = "req-" + ++this.requestCounter;
    const response = new Promise((resolve, reject) => {
      this.pendingRequests.set(requestId, { resolve, reject });
    });
    try {
      await this.requestQueue.addRequest({
        url,
        headers: this.hdrs,
        userData: { requestId, waitUntil: options?.waitUntil || "domcontentloaded" },
        uniqueKey: `${url}-${requestId}`,
      });
    } catch (error) {
      // FIX: drop the pending entry so a failed enqueue does not leak it.
      this.pendingRequests.delete(requestId);
      throw error;
    }
    return response;
  }
}

PlaywrightFetchEngine.id = "playwright";
PlaywrightFetchEngine.mode = "browser";
FetchEngine.register(PlaywrightFetchEngine);

/* -------------------------------------------------------------------------- */
/* Built-in actions (registration order determines FetchAction.list())         */
/* -------------------------------------------------------------------------- */

/** `click`: requires `params.selector`. */
class ClickAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, ...rest } = options?.params || {};
    if (!selector) throw new Error("Selector is required for click action");
    await this.delegateToEngine(context, "click", selector, rest);
  }
}
ClickAction.id = "click";
ClickAction.returnType = "none";
ClickAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(ClickAction);

/** `fill`: requires `params.selector` and a defined `params.value`. */
class FillAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, value, ...rest } = options?.params || {};
    if (!selector) throw new Error("Selector is required for fill action");
    if (value === undefined) throw new Error("Value is required for fill action");
    await this.delegateToEngine(context, "fill", selector, value, rest);
  }
}
FillAction.id = "fill";
FillAction.returnType = "none";
FillAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(FillAction);

/** `getContent`: returns the engine's last response. */
class GetContentAction extends FetchAction {
  async onExecute(context, options) {
    return await this.delegateToEngine(context, "getContent", options?.params);
  }
}
GetContentAction.id = "getContent";
GetContentAction.returnType = "response";
GetContentAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(GetContentAction);

/** `goto`: navigates to `params.url` (or `context.url`) and records it on the context. */
class GotoAction extends FetchAction {
  async onExecute(context, options) {
    const params = options?.params;
    const url = params?.url || context.url;
    if (!url) throw new Error("URL is required for goto action");
    const engine = context.internal.engine;
    if (!engine) throw new Error("No engine available");
    context.url = url;
    return await engine.goto(url, params);
  }
}
GotoAction.id = "goto";
GotoAction.returnType = "response";
GotoAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(GotoAction);

/** `submit`: submits the form at `params.selector` (engine default when omitted). */
class SubmitAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, ...rest } = options?.params || {};
    await this.delegateToEngine(context, "submit", selector, rest);
  }
}
SubmitAction.id = "submit";
SubmitAction.returnType = "none";
SubmitAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(SubmitAction);

/** `waitFor`: forwards `params` to the engine's `waitFor`. */
class WaitForAction extends FetchAction {
  async onExecute(context, options) {
    const engine = context.internal.engine;
    if (!engine) throw new Error("No engine available");
    await engine.waitFor(options?.params);
  }
}
WaitForAction.id = "waitFor";
WaitForAction.returnType = "none";
WaitForAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(WaitForAction);

/** `extract`: `params` is the extraction schema. */
class ExtractAction extends FetchAction {
  async onExecute(context, options) {
    const schema = options?.params;
    if (!schema) throw new Error("Schema is required for extract action");
    return this.delegateToEngine(context, "extract", schema);
  }
}
ExtractAction.id = "extract";
ExtractAction.returnType = "any";
ExtractAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(ExtractAction);

/**
 * `pause`: browser engines only. When `params.selector` is given, the pause
 * is skipped unless extracting that selector yields a truthy value.
 */
class PauseAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, message, attribute } = options?.params || {};
    const engine = context.internal.engine;
    if (engine?.mode !== "browser") {
      console.warn("[PauseAction] can only run in browser engine. Skipped.");
      return;
    }
    if (selector) {
      const matched = await engine?.extract({ selector, attribute });
      if (!matched) return;
    }
    if (engine && "pause" in engine) {
      await engine.pause(message);
    } else {
      console.warn(
        "[PauseAction] was called, but the current engine does not support `pause`. Skipped."
      );
    }
  }
}
PauseAction.id = "pause";
PauseAction.capabilities = { http: "native", browser: "native" };
PauseAction.returnType = "none";
FetchAction.register(PauseAction);

/** Convenience wrapper: `new WebFetcher().fetch(url, options)`. */
async function fetchWeb(url, options) {
  return new WebFetcher().fetch(url, options);
}

/* -------------------------------------------------------------------------- */
/* Public API (enumerable getters plus a non-enumerable `__esModule` marker)   */
/* -------------------------------------------------------------------------- */

const publicApi = {
  CheerioFetchEngine: () => CheerioFetchEngine,
  ClickAction: () => ClickAction,
  DefaultFetcherProperties: () => DefaultFetcherProperties,
  ExtractAction: () => ExtractAction,
  FetchAction: () => FetchAction,
  FetchActionResultStatus: () => FetchActionResultStatus,
  FetchEngine: () => FetchEngine,
  FetchSession: () => FetchSession,
  FillAction: () => FillAction,
  GetContentAction: () => GetContentAction,
  GotoAction: () => GotoAction,
  PauseAction: () => PauseAction,
  PlaywrightFetchEngine: () => PlaywrightFetchEngine,
  SubmitAction: () => SubmitAction,
  WaitForAction: () => WaitForAction,
  WebFetcher: () => WebFetcher,
  fetchWeb: () => fetchWeb,
};

const moduleExports = Object.defineProperty({}, "__esModule", { value: true });
for (const [exportName, get] of Object.entries(publicApi)) {
  Object.defineProperty(moduleExports, exportName, { get, enumerable: true });
}
module.exports = moduleExports;
package/dist/index.mjs ADDED
@@ -0,0 +1 @@
1
+ var t={enableSmart:!0,useSiteRegistry:!0,antibot:!1,headers:{},cookies:[],reuseCookies:!0,proxy:[],blockResources:[],ignoreSslErrors:!0,browser:{engine:"playwright",headless:!0,waitUntil:"domcontentloaded"},http:{method:"GET"},timeoutMs:6e4,maxConcurrency:1,maxRequestsPerMinute:1e3,delayBetweenRequestsMs:0,retries:0,sites:[]},e=(t=>(t[t.Failed=0]="Failed",t[t.Success=1]="Success",t[t.Skipped=2]="Skipped",t))(e||{}),s=class t{static register(t){const e=t.id;if(!e)throw new Error("FetchAction.register: actionClass.id is required");this.registry.set(e,t)}static get(t){return this.registry.get(t)}static create(t){const e="string"==typeof t?t:t.id||t.name;if(!e)throw new Error("Action must have id or name");const s=this.registry.get(e);return s?new s:void 0}static has(t){return this.registry.has(t)}static list(){return Array.from(this.registry.keys())}static getCapability(t){return this.capabilities[t]??"noop"}getCapability(t){return this.constructor.getCapability(t)}get id(){return this.constructor.id}get returnType(){return this.constructor.returnType}get capabilities(){return this.constructor.capabilities}async delegateToEngine(t,e,...s){const i=t.internal.engine;if(!i)throw new Error("No engine available");if("function"!=typeof i[e])throw new Error(`Engine does not have a method named '${String(e)}'`);return await i[e](...s)}installCollectors(e,s){const i=s?.collectors;if(!i?.length)return;const a=[],c=new Set;for(const s of i){const i=r(s.activateOn),l=r(s.collectOn),u=r(s.deactivateOn),h=!(s.background??!0),w=t.create(s);if(!w)continue;let f=!1,d=!1,p=0;const y=async t=>{if(!f&&!d){f=!0;try{await(w.onBeforeExec?.(e,s))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,phase:"before",error:t})}}},m=async(t,i)=>{if(!d){f||await y(i);try{const r=Promise.resolve(w.onExecute?.(e,s,i)).then(i=>{var r,n;if(s.storeAs){((r=e.outputs)[n=s.storeAs]||(r[n]=[])).push(i)}return 
e.eventBus.emit("collector:result",{action:this.id,collector:s.id||s.name,event:t,result:i}),i}).catch(i=>{e.eventBus.emit("collector:error",{action:this.id,collector:s.id||s.name,event:t,phase:"exec",error:i})}).finally(()=>{p++});h&&(c.add(r),r.finally(()=>c.delete(r)))}catch(s){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,event:t,phase:"exec",error:s})}}},g=async()=>{if(!d){0===p&&m("collector:after"),d=!0;try{await(w.onAfterExec?.(e,s))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:s.id||s.name,phase:"after",error:t})}finally{e.eventBus.emit("collector:end",{action:this.id,collector:s.id||s.name}),v.forEach(t=>t())}}},x=n(e,i,y),v=o(e,l,m),b=n(e,u,g);if(a.push(...x,...v,...b),!i.length&&!l.length&&!u.length){const t=()=>{g()};e.eventBus.once(`action:${this.id}.end`,t),a.push(()=>e.eventBus.off("fetcher:action:end",t))}}return a.length||c.size>0?{cleanup:()=>a.forEach(t=>t()),awaitExecPendings:async()=>{c.size>0&&await Promise.allSettled(Array.from(c))}}:void 0}async beforeExec(t,e){t.internal.actionStack||(t.internal.actionStack=[]);const s=t.internal.actionStack,i=s.length,r=s.length>0?s[s.length-1].id:void 0,n={...e,id:this.id,depth:i,parent:r};s.push(n),t.currentAction=n;const o={action:this,context:t,options:e,depth:i,stack:[...s]};t.eventBus.emit(`action:${this.id}.start`,o),t.eventBus.emit("action:start",o),await(this.onBeforeExec?.(t,e));return{entry:o,collectors:this.installCollectors(t,e)}}async afterExec(t,e,s,i){const r=t.internal.actionStack,n=r.length-1,o=i?.collectors;try{await(o?.awaitExecPendings()),t.lastResult=s,"response"!==s?.returnType||s.error||(t.lastResponse=s.result),e?.storeAs&&(t.outputs[e.storeAs]=s?.result),s?.error&&(t.currentAction.error=s.error),await(this.onAfterExec?.(t,e));const 
i={action:this,context:t,options:e,result:s,depth:n,stack:[...r]};s?.error&&(i.error=s.error);try{t.eventBus.emit(`action:${this.id}.end`,i)}catch(t){}try{t.eventBus.emit("action:end",i)}catch(t){}}finally{try{o?.cleanup()}finally{r.pop();const e=r.length;t.currentAction=e>0?r[e-1]:void 0}}}async execute(t,e){const s=await this.beforeExec(t,e);let i;try{const s=e?.failOnError??!0;return t.throwHttpErrors=s,i=await this.onExecute(t,e),i&&i.returnType||(i={status:1,returnType:this.returnType??"any",result:i}),i}catch(s){if(i={status:0,error:s,meta:{id:this.id,engineType:t.engine,capability:this.getCapability(t.engine)}},e?.failOnError)throw s;return i}finally{await this.afterExec(t,e,i,s)}}};s.registry=new Map,s.returnType="any",s.capabilities={http:"noop",browser:"noop"};var i=s;function r(t){return t?Array.isArray(t)?t:[t]:[]}function n(t,e,s){const i=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=(...t)=>{s(t[0])};t.eventBus.once(r,e),i.push(()=>t.eventBus.off(r,e))}return i}function o(t,e,s){const i=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=t=>s(r,t);t.eventBus.on(r,e),i.push(()=>t.eventBus.off(r,e))}return i}import{EventEmitter as a}from"events-ex";import{defaultsDeep as c}from"lodash-es";import l from"crypto";var u,h,w=t=>((t=>{!u||u.length<t?(u=Buffer.allocUnsafe(128*t),l.randomFillSync(u),h=0):h+t>u.length&&(l.randomFillSync(u),h=0),h+=t})(t|=0),u.subarray(h-t,h)),f=((t,e=21)=>((t,e,s)=>{let i=(2<<31-Math.clz32(t.length-1|1))-1,r=Math.ceil(1.6*i*e/t.length);return(n=e)=>{let o="";for(;;){let e=s(r),a=r;for(;a--;)if(o+=t[e[a]&i]||"",o.length===n)return o}}})(t,e,w))("0123456789abcdefghijklmnopqrstuvwxyz",12);import{defaultsDeep as d,merge as p}from"lodash-es";import{EventEmitter as y}from"events-ex";import{CommonError as m}from"@isdk/common-error";import{Configuration as g,RequestQueue as x}from"crawlee";function v(){let t=()=>{};const e=new Promise(e=>{t=e});return 
e.release=t,e}g.getGlobalConfig().set("persistStorage",!1);var b=class{constructor(){this.hdrs={},this.jar=[],this.pendingRequests=new Map,this.requestCounter=0,this.actionEmitter=new y,this.isPageActive=!1,this.navigationLock=function(){const t=v();return t.release(),t}(),this.blockedTypes=new Set}static register(t){const e=t.id;if(!e)throw new Error("Engine must define static id");if(this.registry.has(e))throw new Error(`Engine id duplicated: ${e}`);this.registry.set(e,t)}static get(t){return this.registry.get(t)}static getByMode(t){for(const[e,s]of this.registry.entries())if(s.mode===t)return s}static async create(e,s){const i=d(s,e,t),r=i.engine??e.engine,n=r?this.get(r)??this.getByMode(r):null;if(n){const t=new n;return await t.initialize(e,i),t}}async _extract(t,e){const s=t.type;if(!e)return"array"===s?[]:null;if("object"===s){const{selector:s,properties:i}=t;let r=e;if(s){const t=await this._querySelectorAll(e,s);r=t.length>0?t[0]:null}if(!r)return null;const n={};for(const t in i)n[t]=await this._extract(i[t],r);return n}if("array"===s){const{selector:s,items:i}=t,r=s?await this._querySelectorAll(e,s):[e],n=[];for(const t of r)n.push(await this._extract(i,t));return n}const{selector:i}=t;let r=e;if(i){const t=await this._querySelectorAll(e,i);r=t.length>0?t[0]:null}return r?this._extractValue(t,r):null}waitFor(t){return this.dispatchAction({type:"waitFor",options:t})}click(t){return this.dispatchAction({type:"click",selector:t})}fill(t,e){return this.dispatchAction({type:"fill",selector:t,value:e})}submit(t,e){return this.dispatchAction({type:"submit",selector:t,options:e})}pause(t){return this.dispatchAction({type:"pause",message:t})}extract(t){const e=this._normalizeSchema(t);return this.dispatchAction({type:"extract",schema:e})}_normalizeSchema(t){const e=JSON.parse(JSON.stringify(t));if(e.properties)for(const t in 
e.properties)e.properties[t]=this._normalizeSchema(e.properties[t]);if(e.items&&(e.items=this._normalizeSchema(e.items)),"array"===e.type&&(e.attribute&&!e.items&&(e.items={attribute:e.attribute},delete e.attribute),e.items||(e.items={type:"string"})),e.selector&&(e.has||e.exclude)){const{selector:t,has:s,exclude:i}=e,r=t.split(",").map(t=>{let e=t.trim();return s&&(e=`${e}:has(${s})`),i&&(e=`${e}:not(${i})`),e}).join(", ");e.selector=r,delete e.has,delete e.exclude}return e}get id(){return this.constructor.id}get mode(){return this.constructor.mode}get context(){return this.ctx}async initialize(t,e){if(this.ctx)return;p(t,e),this.ctx=t,this.opts=t,this.hdrs=function(t){const e={};if(t&&"object"==typeof t)for(const[s,i]of Object.entries(t))e[s.toLowerCase()]=i;return e}(t.headers),this.jar=[...t.cookies??[]],t.internal||(t.internal={}),t.internal.engine=this,t.engine=this.mode,this.requestQueue=await x.open();const s=await this._getSpecificCrawlerOptions(t),i={...d(s,{requestQueue:this.requestQueue,maxConcurrency:1,minConcurrency:1,useSessionPool:!0,persistCookiesPerSession:!0,sessionPoolOptions:{maxPoolSize:1,persistenceOptions:{enable:!1},sessionOptions:{maxUsageCount:1e3,maxErrorScore:3}}}),requestHandler:this._requestHandler.bind(this),errorHandler:this._failedRequestHandler.bind(this),failedRequestHandler:this._failedRequestHandler.bind(this)};this.crawler=this._createCrawler(i),this.crawler.run().then(()=>{this.isCrawlerReady=!0}).catch(t=>{this.isCrawlerReady=!1,console.error("Crawler background error:",t)})}async cleanup(){await(this._cleanup?.()),await this._commonCleanup();const t=this.ctx;t&&t.internal?.engine===this&&(t.internal.engine=void 0),this.ctx=void 0,this.opts=void 0}async _executePendingActions(t){await new Promise(e=>{const s=async({action:e,resolve:s,reject:i})=>{try{if("dispose"===e.type)return this.actionEmitter.emit("dispose"),void s();s(await 
this.executeAction(t,e))}catch(t){i(t)}};this.actionEmitter.on("dispatch",s),this.actionEmitter.once("dispose",()=>{this.actionEmitter.removeListener("dispatch",s),e()})})}async _sharedRequestHandler(t){try{const{request:e}=t;this.isPageActive=!0;const s=this.pendingRequests.get(e.userData.requestId);if(s){const i=await this.buildResponse(t),r=!i.statusCode||i.statusCode>=400;if(this.ctx?.throwHttpErrors&&r){const t=new m(`Request for ${i.finalUrl} failed with status ${i.statusCode||"N/A"}`,"request",i.statusCode);s.reject(t)}else this.lastResponse=i,s.resolve(i);this.pendingRequests.delete(e.userData.requestId)}await this._executePendingActions(t)}finally{this.isPageActive=!1,this.navigationLock.release()}}async _sharedFailedRequestHandler(t,e){const{request:s}=t,i=this.pendingRequests.get(s.userData.requestId);if(i&&e&&this.ctx?.throwHttpErrors){this.pendingRequests.delete(s.userData.requestId);const t=e.response,r=t?.statusCode||500,n=t?.url?t.url:s.url,o=new m(`Request${n?" for "+n:""} failed: ${e.message}`,"request",r);i.reject(o)}return this._sharedRequestHandler(t)}async dispatchAction(t){if(!this.isPageActive)throw new Error("No active page. 
Call goto() before performing actions.");return new Promise((e,s)=>{this.actionEmitter.emit("dispatch",{action:t,resolve:e,reject:s})})}async _requestHandler(t){await this._sharedRequestHandler(t)}async _failedRequestHandler(t,e){await this._sharedFailedRequestHandler(t,e)}async _commonCleanup(){if(this.isPageActive&&await this.dispatchAction({type:"dispose"}).catch(()=>{}),this.pendingRequests.size>0)for(const[,t]of this.pendingRequests)t.reject(new Error("Cleanup:Request cancelled"));if(this.actionEmitter.removeAllListeners(),this.crawler){try{await(this.crawler.teardown?.())}catch(t){console.error("ccrawler teardown error:",t)}this.crawler=void 0}this.isCrawlerReady=void 0,this.requestQueue&&(await this.requestQueue.drop(),this.requestQueue=void 0),this.pendingRequests.clear()}async blockResources(t,e){return e&&this.blockedTypes.clear(),t.forEach(t=>this.blockedTypes.add(t)),t.length}getContent(){return this.lastResponse?Promise.resolve(this.lastResponse):Promise.reject(new Error("No content fetched yet. 
Call goto() first."))}async headers(t,e){if(void 0===t)return{...this.hdrs};if("string"==typeof t&&void 0===e)return this.hdrs[t.toLowerCase()]||"";if(null!==t&&"object"==typeof t){const s={};for(const[e,i]of Object.entries(t))s[e.toLowerCase()]=String(i);return this.hdrs=!0===e?s:{...this.hdrs,...s},!0}return"string"==typeof t&&("string"==typeof e?this.hdrs[t.toLowerCase()]=e:null===e&&delete this.hdrs[t.toLowerCase()],!0)}async cookies(t){return Array.isArray(t)?(this.jar=[...t],!0):null===t?(this.jar=[],!0):[...this.jar]}async dispose(){await this.cleanup()}};async function E(t,e){const s=function(t,e){if(!t||!e?.length)return null;const s=new URL(t);let i=e.find(t=>t.domain===s.hostname);i||(i=e.find(t=>s.hostname.endsWith(t.domain)));if(!i)return null;if(i.pathScope?.length){if(!i.pathScope.some(t=>s.pathname.startsWith(t)))return null}return i}(e?.url||t.url,t.sites),i=t.engine||s?.engine||"auto";let r=await b.create(t,{engine:i});return r||(r=await b.create(t,{engine:"http"})),r}b.registry=new Map;var C=class{constructor(t={}){this.options=t,this.closed=!1,this.id=f(),this.context=this.createContext(t)}async execute(t){await this.ensureEngine(t);const e=i.create(t);if(!e)throw new Error(`Unknown action: ${t.id||t.name}`);let s,r;this.context.internal.actionIndex=(this.context.internal.actionIndex||0)+1,this.context.currentAction={...t,index:this.context.internal.actionIndex,startedAt:Date.now()};try{return s=await e.execute(this.context,t),s}catch(t){throw r=t,r}finally{this.context.currentAction=void 0}}async executeAll(t){try{for(let e=0;e<t.length;e++){const s=t[e];await this.execute(s)}const e=await this.execute({id:"getContent"});return{result:e?.result,outputs:this.getOutputs()}}catch(t){throw t}}getOutputs(){return this.context.outputs}async dispose(){if(this.closed)return;const 
t=this.context.eventBus;t.emit("session:closing",{sessionId:this.id});try{await(this.context.internal.engine?.dispose())}finally{this.closed=!0}t.emit("session:closed",{sessionId:this.id})}async ensureEngine(t){if(this.closed)throw new Error("Session is closed");if(!this.context.internal.engine){const e=t?.params?.url??this.context.url;if(!await E(this.context,{url:e}))throw new Error("No engine found")}}createContext(e=this.options){const s=new a;return c({...e,id:this.id,eventBus:s,outputs:{},internal:{},execute:async t=>this.execute(t),action:async function(t,e,s){return this.execute({name:t,params:e,...s})}},t)}},k=class{constructor(t={}){this.defaults=t}async createSession(t){const e={...this.defaults,...t||{}};return new C(e)}async fetch(t,e){"string"!=typeof t&&(t=(e=t).url);const s=await this.createSession(e);try{const i=e?.actions||[];t&&0!==i.findIndex(e=>"goto"===e.id&&e.params?.url===t)&&i.unshift({id:"goto",params:{url:t}});return await s.executeAll(i)}finally{await s.dispose()}}};import{CheerioCrawler as S,ProxyConfiguration as q}from"crawlee";import*as $ from"cheerio";import{CommonError as R,ErrorCode as U,NotFoundError as P}from"@isdk/common-error";var A=class extends b{async buildResponse(t){const{request:e,response:s,body:i,$:r}=t,n=r?.html();let o="string"==typeof i?i:Buffer.isBuffer(i)?i.toString("utf-8"):String(i??"");return n&&n!==o&&(o=n),{url:e.url,finalUrl:e.loadedUrl||e.url,statusCode:s?.statusCode??200,statusText:s?.statusMessage,headers:s?.headers,body:i,html:o,text:o}}async _querySelectorAll(t,e){const{$:s,el:i}=t;return i.find(e).toArray().map(t=>({$:s,el:s(t)}))}async _extractValue(t,e){const{el:s}=e,{attribute:i,type:r="string"}=t;if(0===s.length)return null;let n="";if(n=i?s.attr(i)??null:"html"===r?s.html():s.text().trim(),null===n)return null;switch(r){case"number":return parseFloat(n.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=n.toLowerCase();return"true"===t||"1"===t;default:return n}}async 
executeAction(t,e){const{$:s}=t;switch(e.type){case"dispose":return;case"extract":if(!s)throw new R(`Cheerio context not available for action: ${e.type}`,"extract");return this._extract(e.schema,{$:s,el:s.root()});case"click":{if(!s)throw new R(`Cheerio context not available for action: ${e.type}`,"click");const i=e.selector,r=s(i).first();let n;if(0===r.length)try{n=new URL(i,t.request.loadedUrl||t.request.url).href}catch{throw new R(`click: selector not found or invalid URL: ${i}`,"click")}else{if(!r.is("a")||!r.attr("href")){if(r.is('input[type="submit"], button[type="submit"], button, input')){const e=r.closest("form");if(e.length)return this.executeAction(t,{type:"submit",selector:e});throw new R("click: submit-like element without form","click")}throw new R(`click: unsupported element for http simulate. Selector: ${i}`,"click")}{const e=r.attr("href");n=new URL(e,t.request.loadedUrl||t.request.url).href}}const o=await t.sendRequest({url:n});return void this._updateStateAfterNavigation(t,o)}case"fill":{if(!s)throw new R(`Cheerio context not available for action: ${e.type}`),"fill";const i=s(e.selector).first();if(0===i.length)throw new R(`fill: selector not found: ${e.selector}`);if(!i.is("input, textarea, select"))throw new R(`fill: not a form field: ${e.selector}`,"fill");{i.val(e.value);const s=this.buildResponse(t);this.lastResponse=s}return}case"waitFor":return void(e.options?.ms&&await new Promise(t=>setTimeout(t,e.options.ms)));case"pause":const i=this.ctx?.onPause;return void(i?(console.info(e.message||"Execution paused for manual intervention."),await i({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. 
Skipped."));case"submit":{if(!s)throw new R(`Cheerio context not available for action: ${e.type}`,"submit");const i="string"==typeof e.selector?s(e.selector).first():null!=e.selector?e.selector:s("form").first();if(0===i.length)throw new P(e.selector,"submit");const r=i.attr("action")||t.request.loadedUrl||t.request.url,n=(i.attr("method")||"GET").toUpperCase(),o=new URL(r,t.request.loadedUrl||t.request.url).href,a={};let c;if(i.find("input, select, textarea").each((t,e)=>{const i=s(e),r=i.attr("name");if(!r)return;const n=i.val();null!=n&&(a[r]=String(n))}),"GET"===n){const e=new URL(o);Object.entries(a).forEach(([t,s])=>e.searchParams.set(t,s)),c=await t.sendRequest({url:e.href,method:"GET"})}else{let s;const r={};"application/json"===(e.options?.enctype||i.attr("enctype")||"application/x-www-form-urlencoded")?(s=JSON.stringify(a),r["Content-Type"]="application/json"):(s=new URLSearchParams(a).toString(),r["Content-Type"]="application/x-www-form-urlencoded"),c=await t.sendRequest({url:o,method:"POST",body:s,headers:r})}return void this._updateStateAfterNavigation(t,c)}case"getContent":return this.buildResponse(t);default:throw new R(`Unknown action type: ${e.type}`,"CheerioFetchEngine.executeAction",U.NotSupported)}}_updateStateAfterNavigation(t,e){const s=e.response||e,{body:i,headers:r,statusCode:n,statusMessage:o}=s,{url:a,loadedUrl:c}=e,l="string"==typeof i?i:Buffer.isBuffer(i)?i.toString("utf-8"):String(i??"");r&&r["content-type"]?.includes("html")&&(t.$=$.load(l)),this.lastResponse={url:a,finalUrl:c||a,statusCode:n,statusText:o,headers:r||{},body:i,html:l,text:l}}_createCrawler(t){return new S(t)}_getSpecificCrawlerOptions(t){const e=this.opts?.proxy?"string"==typeof this.opts.proxy?[this.opts.proxy]:this.opts.proxy:void 0,s=e?.length?new q({proxyUrls:e}):void 
0;return{additionalMimeTypes:["text/plain"],maxRequestRetries:1,requestHandlerTimeoutSecs:Math.max(5,Math.floor((this.opts?.timeoutMs||3e4)/1e3)),proxyConfiguration:s,preNavigationHooks:[(e,s)=>{s.throwHttpErrors=t.throwHttpErrors,this.opts?.timeoutMs&&(s.timeout={request:this.opts.timeoutMs})}]}}async goto(t,e){this.isPageActive&&this.dispatchAction({type:"dispose"}).catch(()=>{});const s="req-"+ ++this.requestCounter,i=new Promise((t,i)=>{const r=e?.timeoutMs||this.opts?.timeoutMs||3e4,n=setTimeout(()=>{this.pendingRequests.delete(s),this.navigationLock.release(),i(new R(`goto timed out after ${r}ms.`,"gotoTimeout",U.RequestTimeout))},r);this.pendingRequests.set(s,{resolve:e=>{clearTimeout(n),t(e)},reject:t=>{clearTimeout(n),i(t)}})});return this.requestQueue.addRequest({...e,url:t,headers:{...this.hdrs,...e?.headers},userData:{requestId:s},uniqueKey:`${t}-${s}`}).catch(t=>{const e=this.pendingRequests.get(s);e&&(this.pendingRequests.delete(s),this.navigationLock.release(),e.reject(t))}),await this.navigationLock,this.navigationLock=v(),i}};A.id="cheerio",A.mode="http",b.register(A);import{PlaywrightCrawler as T}from"crawlee";import{firefox as O}from"playwright";import{launchOptions as j}from"camoufox-js";import{CommonError as _,ErrorCode as N,NotFoundError as F}from"@isdk/common-error";var M=class extends b{async buildResponse(t){const{page:e,response:s,request:i}=t;if(!e||e.isClosed())return{url:i.url,finalUrl:i.loadedUrl||i.url,statusCode:s?.status(),statusText:s?.statusText(),headers:await(s?.allHeaders())||{},body:"",html:"",text:""};const r=await e.content(),n=await e.textContent("body");return{url:e.url(),finalUrl:e.url(),statusCode:s?.status(),statusText:s?.statusText(),headers:await(s?.allHeaders())||{},body:r,html:r,text:n||""}}async _querySelectorAll(t,e){return t.locator(e).all()}async _extractValue(t,e){const{attribute:s,type:i="string"}=t;if(0===await e.count())return null;let r="";if(r=s?await e.getAttribute(s):"html"===i?await e.innerHTML():await 
e.textContent(),null===r)return null;switch(r=r.trim(),i){case"number":return parseFloat(r.replace(/[^0-9.-]+/g,""))||null;case"boolean":const t=r.toLowerCase();return"true"===t||"1"===t;default:return r}}async executeAction(t,e){const{page:s}=t,i=this.opts?.timeoutMs||3e4;switch(e.type){case"navigate":{const i=await s.goto(e.url,{waitUntil:e.opts?.waitUntil||"domcontentloaded",timeout:this.opts?.timeoutMs||3e4});i&&(t={...t,response:i});const r=await this.buildResponse(t);return this.lastResponse=r,r}case"extract":return this._extract(e.schema,s.locator("body"));case"click":{await s.click(e.selector,{timeout:i}),await s.waitForLoadState("networkidle",{timeout:i});const r=await this.buildResponse(t);return void(this.lastResponse=r)}case"fill":await s.fill(e.selector,e.value,{timeout:i});const r=await this.buildResponse(t);return void(this.lastResponse=r);case"waitFor":return e.options?.selector&&await s.waitForSelector(e.options.selector,{timeout:i}),e.options?.networkIdle&&await s.waitForLoadState("networkidle",{timeout:i}),void(e.options?.ms&&await s.waitForTimeout(e.options.ms));case"submit":{const r=e.selector||"form",n=s.locator(r).first();if(0===await n.count())throw new F(r,"submit");if("application/json"===(e.options?.enctype||"application/x-www-form-urlencoded")){const t=await n.elementHandle();if(!t)throw new _(`submit: could not get form handle for ${r}`,"submit");const e=await t.evaluate(async t=>{const e=new FormData(t),s={};e.forEach((t,e)=>{s[e]=t.toString()});const i=await fetch(t.action,{method:t.method,headers:{"Content-Type":"application/json"},body:JSON.stringify(s)}),r=await i.text();return{status:i.status,statusText:i.statusText,headers:Object.fromEntries(i.headers.entries()),body:r,html:r,text:r,url:t.action,finalUrl:i.url}});return await t.dispose(),await s.setContent(e.html),void(this.lastResponse=e)}return await n.evaluate(t=>t.submit()),await s.waitForLoadState("networkidle",{timeout:i}),void(this.lastResponse=await 
this.buildResponse(t))}case"pause":{const t=this.ctx?.onPause;return void(t?(console.info(e.message||"Execution paused for manual intervention."),await t({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."))}case"getContent":return this.buildResponse(t);default:throw new _(`Unknown action type: ${e.type}`,"PlaywrightFetchEngine.executeAction",N.NotSupported)}}_createCrawler(t){return new T(t)}async _getSpecificCrawlerOptions(t){const e=t.browser?.headless??!0,s={maxRequestRetries:t.retries||3,headless:e,preNavigationHooks:[async({page:e,request:s},i)=>{i.throwHttpErrors=t.throwHttpErrors,this.jar.length>0&&await e.context().addCookies(this.jar.map(t=>({...t,url:s.url,domain:t.domain||new URL(s.url).hostname})));const r=this.blockedTypes;r.size>0&&await e.route("**/*",t=>{r.has(t.request().resourceType())?t.abort():t.continue()})}]};if(this.opts?.antibot){console.log("[DEBUG] antibot enabled, configuring camoufox..."),s.browserPoolOptions={useFingerprints:!1},console.log("[DEBUG] Calling launchOptions...");const t=await j({headless:e});console.log("[DEBUG] launchOptions returned."),s.launchContext={launcher:O,launchOptions:t},s.postNavigationHooks=[async({page:t,handleCloudflareChallenge:e})=>{console.log(`[DEBUG] In postNavigationHook for ${t.url()}. 
Calling handleCloudflareChallenge...`),await e(),console.log("[DEBUG] handleCloudflareChallenge returned.")}],console.log("[DEBUG] camoufox configuration complete.")}return s}async goto(t,e){if(this.isPageActive)return this.dispatchAction({type:"navigate",url:t,opts:e});if(!this.requestQueue)throw new _("RequestQueue not initialized","goto");const s="req-"+ ++this.requestCounter,i=new Promise((t,e)=>{this.pendingRequests.set(s,{resolve:t,reject:e})});return await this.requestQueue.addRequest({url:t,headers:this.hdrs,userData:{requestId:s,waitUntil:e?.waitUntil||"domcontentloaded"},uniqueKey:`${t}-${s}`}),i}};M.id="playwright",M.mode="browser",b.register(M);var B=class extends i{async onExecute(t,e){const{selector:s,...i}=e?.params||{};if(!s)throw new Error("Selector is required for click action");await this.delegateToEngine(t,"click",s,i)}};B.id="click",B.returnType="none",B.capabilities={http:"simulate",browser:"native"},i.register(B);var D=class extends i{async onExecute(t,e){const{selector:s,value:i,...r}=e?.params||{};if(!s)throw new Error("Selector is required for fill action");if(void 0===i)throw new Error("Value is required for fill action");await this.delegateToEngine(t,"fill",s,i,r)}};D.id="fill",D.returnType="none",D.capabilities={http:"simulate",browser:"native"},i.register(D);var H=class extends i{async onExecute(t,e){return await this.delegateToEngine(t,"getContent",e?.params)}};H.id="getContent",H.returnType="response",H.capabilities={http:"native",browser:"native"},i.register(H);var G=class extends i{async onExecute(t,e,s){const i=e?.params,r=i?.url||t.url;if(!r)throw new Error("URL is required for goto action");const n=t.internal.engine;if(!n)throw new Error("No engine available");t.url=r;return await n.goto(r,i)}};G.id="goto",G.returnType="response",G.capabilities={http:"native",browser:"native"},i.register(G);var L=class extends i{async onExecute(t,e){const{selector:s,...i}=e?.params||{};await 
this.delegateToEngine(t,"submit",s,i)}};L.id="submit",L.returnType="none",L.capabilities={http:"simulate",browser:"native"},i.register(L);var z=class extends i{async onExecute(t,e){const s=t.internal.engine;if(!s)throw new Error("No engine available");await s.waitFor(e?.params)}};z.id="waitFor",z.returnType="none",z.capabilities={http:"native",browser:"native"},i.register(z);var I=class extends i{async onExecute(t,e){const s=e?.params;if(!s)throw new Error("Schema is required for extract action");return this.delegateToEngine(t,"extract",s)}};I.id="extract",I.returnType="any",I.capabilities={http:"native",browser:"native"},i.register(I);var J=class extends i{async onExecute(t,e){const{selector:s,message:i,attribute:r}=e?.params||{},n=t.internal.engine;if("browser"===n?.mode){if(s){if(!await(n?.extract({selector:s,attribute:r})))return}n&&"pause"in n?await n.pause(i):console.warn("[PauseAction] was called, but the current engine does not support `pause`. Skipped.")}else console.warn("[PauseAction] can only run in browser engine. Skipped.")}};async function V(t,e){return(new k).fetch(t,e)}J.id="pause",J.capabilities={http:"native",browser:"native"},J.returnType="none",i.register(J);export{A as CheerioFetchEngine,B as ClickAction,t as DefaultFetcherProperties,I as ExtractAction,i as FetchAction,e as FetchActionResultStatus,b as FetchEngine,C as FetchSession,D as FillAction,H as GetContentAction,G as GotoAction,J as PauseAction,M as PlaywrightFetchEngine,L as SubmitAction,z as WaitForAction,k as WebFetcher,V as fetchWeb};
package/docs/README.md ADDED
@@ -0,0 +1,151 @@
1
+ **@isdk/web-fetcher**
2
+
3
+ ***
4
+
5
+ # 🕸️ @isdk/web-fetcher
6
+
7
+ English | [简体中文](_media/README.cn.md)
8
+
9
+ > A powerful and flexible web fetching and browser automation library.
10
+ > It features a dual-engine architecture (HTTP and Browser) and a declarative action system, making it perfect for AI agents and complex data scraping tasks.
11
+
12
+ ---
13
+
14
+ ## ✨ Core Features
15
+
16
+ * **⚙️ Dual-Engine Architecture**: Choose between **`http`** mode (powered by Cheerio) for speed on static sites, or **`browser`** mode (powered by Playwright) for full JavaScript execution on dynamic sites.
17
+ * **📜 Declarative Action Scripts**: Define multi-step workflows (like logging in, filling forms, and clicking buttons) in a simple, readable JSON format.
18
+ * **📊 Powerful and Flexible Data Extraction**: Easily extract all kinds of structured data, from simple text to complex nested objects, through an intuitive and powerful declarative Schema.
19
+ * **🧠 Smart Engine Selection**: Automatically detects dynamic sites and can upgrade the engine from `http` to `browser` on the fly.
20
+ * **🧩 Extensible**: Easily create custom, high-level "composite" actions to encapsulate reusable business logic (e.g., a `login` action).
21
+ * **🧲 Advanced Collectors**: Asynchronously collect data in the background, triggered by events during the execution of a main action.
22
+ * **🛡️ Anti-Bot Evasion**: In `browser` mode, an optional `antibot` flag helps to bypass common anti-bot measures like Cloudflare challenges.
23
+
24
+ ---
25
+
26
+ ## 📦 Installation
27
+
28
+ 1. **Install the Package:**
29
+
30
+ ```bash
31
+ npm install @isdk/web-fetcher
32
+ ```
33
+
34
+ 2. **Install Browsers (For `browser` mode):**
35
+
36
+ The `browser` engine is powered by Playwright, which requires separate browser binaries to be downloaded. If you plan to use the `browser` engine for interacting with dynamic websites, run the following command:
37
+
38
+ ```bash
39
+ npx playwright install
40
+ ```
41
+
42
+ > ℹ️ **Note:** This step is only required for `browser` mode. The lightweight `http` mode works out of the box without this installation.
43
+
44
+ ---
45
+
46
+ ## 🚀 Quick Start
47
+
48
+ The following example fetches a web page and extracts its title.
49
+
50
+ ```typescript
51
+ import { fetchWeb } from '@isdk/web-fetcher';
52
+
53
+ async function getTitle(url: string) {
54
+ const { outputs } = await fetchWeb({
55
+ url,
56
+ actions: [
57
+ {
58
+ id: 'extract',
59
+ params: {
60
+ // Extracts the text content of the <title> tag
61
+ selector: 'title',
62
+ },
63
+ // Stores the result in the `outputs` object under the key 'pageTitle'
64
+ storeAs: 'pageTitle',
65
+ },
66
+ ],
67
+ });
68
+
69
+ console.log('Page Title:', outputs.pageTitle);
70
+ }
71
+
72
+ getTitle('https://www.google.com');
73
+ ```
74
+
75
+ ---
76
+
77
+ ## 🤖 Advanced Usage: Multi-Step Form Submission
78
+
79
+ This example demonstrates how to use the `browser` engine to perform a search on Google.
80
+
81
+ ```typescript
82
+ import { fetchWeb } from '@isdk/web-fetcher';
83
+
84
+ async function searchGoogle(query: string) {
85
+ // Search for the query on Google
86
+ const { result, outputs } = await fetchWeb({
87
+ url: 'https://www.google.com',
88
+ engine: 'browser', // Use the full browser engine for interaction
89
+ actions: [
90
+ // The initial navigation to google.com is handled by the `url` option
91
+ { id: 'fill', params: { selector: 'textarea[name=q]', value: query } },
92
+ { id: 'submit', params: { selector: 'form' } },
93
+ { id: 'waitFor', params: { selector: '#search' } }, // Wait for the search results container to appear
94
+ { id: 'getContent', storeAs: 'searchResultsPage' },
95
+ ]
96
+ });
97
+
98
+ console.log('Search Results URL:', result?.finalUrl);
99
+ console.log('Outputs contains the full page content:', outputs.searchResultsPage.html.substring(0, 100));
100
+ }
101
+
102
+ searchGoogle('gemini');
103
+ ```
104
+
105
+ ---
106
+
107
+ ## 🏗️ Architecture
108
+
109
+ This library is built on two core concepts: **Engines** and **Actions**.
110
+
111
+ * ### Engine Architecture
112
+
113
+ The library's core is its dual-engine design. It abstracts away the complexities of web interaction behind a unified API. For detailed information on the `http` (Cheerio) and `browser` (Playwright) engines, how they manage state, and how to extend them, please see the [**Fetch Engine Architecture**](_media/README.engine.md) document.
114
+
115
+ * ### Action Architecture
116
+
117
+ All workflows are defined as a series of "Actions". The library provides a set of built-in atomic actions and a powerful composition model for creating your own semantic actions. For a deep dive into creating and using actions, see the [**Action Script Architecture**](_media/README.action.md) document.
118
+
119
+ ---
120
+
121
+ ## 📚 API Reference
122
+
123
+ ### `fetchWeb(options)` or `fetchWeb(url, options)`
124
+
125
+ This is the main entry point for the library.
126
+
127
+ **Key `FetcherOptions`**:
128
+
129
+ * `url` (string): The initial URL to navigate to.
130
+ * `engine` ('http' | 'browser' | 'auto'): The engine to use. Defaults to `auto`.
131
+ * `actions` (FetchActionOptions[]): An array of action objects to execute.
132
+ * `headers` (Record<string, string>): Headers to use for all requests.
133
+ * ...and many other options for proxy, cookies, retries, etc.
134
+
135
+ ### Built-in Actions
136
+
137
+ Here are the essential built-in actions:
138
+
139
+ * `goto`: Navigates to a new URL.
140
+ * `click`: Clicks on an element specified by a selector.
141
+ * `fill`: Fills an input field with a specified value.
142
+ * `submit`: Submits a form.
143
+ * `waitFor`: Pauses execution to wait for a specific condition (e.g., a timeout, a selector to appear, or network to be idle).
144
+ * `getContent`: Retrieves the full content (HTML, text, etc.) of the current page state.
145
+ * `extract`: Extracts any structured data from the page with ease using an expressive, declarative schema.
146
+
147
+ ---
148
+
149
+ ## 📜 License
150
+
151
+ [MIT](_media/LICENSE-MIT)
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2025 Riceball LEE
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.