@isdk/web-fetcher 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.action.cn.md +32 -24
- package/README.action.md +14 -4
- package/README.cn.md +10 -2
- package/README.hackernews.md +52 -0
- package/README.md +10 -2
- package/dist/index.d.mts +7 -4
- package/dist/index.d.ts +7 -4
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/README.md +10 -2
- package/docs/_media/README.action.md +14 -4
- package/docs/_media/README.cn.md +10 -2
- package/docs/classes/CheerioFetchEngine.md +91 -69
- package/docs/classes/ClickAction.md +23 -23
- package/docs/classes/ExtractAction.md +23 -23
- package/docs/classes/FetchAction.md +23 -23
- package/docs/classes/FetchEngine.md +87 -69
- package/docs/classes/FetchSession.md +8 -8
- package/docs/classes/FillAction.md +23 -23
- package/docs/classes/GetContentAction.md +23 -23
- package/docs/classes/GotoAction.md +23 -23
- package/docs/classes/PauseAction.md +23 -23
- package/docs/classes/PlaywrightFetchEngine.md +91 -69
- package/docs/classes/SubmitAction.md +23 -23
- package/docs/classes/WaitForAction.md +23 -23
- package/docs/classes/WebFetcher.md +5 -5
- package/docs/enumerations/FetchActionResultStatus.md +4 -4
- package/docs/functions/fetchWeb.md +2 -2
- package/docs/globals.md +1 -0
- package/docs/interfaces/BaseFetchActionProperties.md +9 -9
- package/docs/interfaces/BaseFetchCollectorActionProperties.md +13 -13
- package/docs/interfaces/BaseFetcherProperties.md +29 -21
- package/docs/interfaces/DispatchedEngineAction.md +4 -4
- package/docs/interfaces/ExtractActionProperties.md +9 -9
- package/docs/interfaces/FetchActionInContext.md +13 -13
- package/docs/interfaces/FetchActionProperties.md +10 -10
- package/docs/interfaces/FetchActionResult.md +6 -6
- package/docs/interfaces/FetchContext.md +43 -31
- package/docs/interfaces/FetchEngineContext.md +38 -26
- package/docs/interfaces/FetchMetadata.md +5 -5
- package/docs/interfaces/FetchResponse.md +13 -13
- package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
- package/docs/interfaces/FetchSite.md +36 -24
- package/docs/interfaces/FetcherOptions.md +35 -23
- package/docs/interfaces/GotoActionOptions.md +6 -6
- package/docs/interfaces/PendingEngineRequest.md +3 -3
- package/docs/interfaces/SubmitActionOptions.md +2 -2
- package/docs/interfaces/WaitForActionOptions.md +4 -4
- package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
- package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
- package/docs/type-aliases/BrowserEngine.md +1 -1
- package/docs/type-aliases/FetchActionCapabilities.md +1 -1
- package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
- package/docs/type-aliases/FetchActionOptions.md +1 -1
- package/docs/type-aliases/FetchEngineAction.md +1 -1
- package/docs/type-aliases/FetchEngineType.md +1 -1
- package/docs/type-aliases/FetchReturnType.md +1 -1
- package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
- package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
- package/docs/type-aliases/ResourceType.md +1 -1
- package/docs/variables/DefaultFetcherProperties.md +1 -1
- package/docs/variables/FetcherOptionKeys.md +11 -0
- package/package.json +3 -1
package/dist/index.mjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
/**
 * @isdk/web-fetcher — ES module bundle (de-minified).
 *
 * Contents, in order:
 *   - default options and the action result status enum
 *   - FetchAction: base class + registry for actions (goto, click, fill, ...)
 *   - FetchEngine: base class + registry for crawler-backed engines
 *   - FetchSession / WebFetcher / fetchWeb: the public entry points
 *   - CheerioFetchEngine (mode "http") and PlaywrightFetchEngine (mode "browser")
 *   - the built-in action classes
 */
import { EventEmitter } from "events-ex";
import { defaultsDeep, merge } from "lodash-es";
import crypto from "crypto";
import { CommonError, ErrorCode, NotFoundError } from "@isdk/common-error";
import {
  CheerioCrawler,
  Configuration,
  PlaywrightCrawler,
  ProxyConfiguration,
  RequestQueue,
} from "crawlee";
import * as cheerio from "cheerio";
import { firefox } from "playwright";
import { launchOptions } from "camoufox-js";

/** Default option values; used as the lowest-priority layer when building contexts. */
const DefaultFetcherProperties = {
  enableSmart: true,
  useSiteRegistry: true,
  antibot: false,
  headers: {},
  cookies: [],
  reuseCookies: true,
  proxy: [],
  blockResources: [],
  ignoreSslErrors: true,
  browser: { engine: "playwright", headless: true, waitUntil: "domcontentloaded" },
  http: { method: "GET" },
  timeoutMs: 60000,
  maxConcurrency: 1,
  maxRequestsPerMinute: 1000,
  delayBetweenRequestsMs: 0,
  retries: 0,
  sites: [],
};

/** Two-way (name <-> number) status enum: Failed = 0, Success = 1, Skipped = 2. */
const FetchActionResultStatus = {};
for (const [name, value] of [
  ["Failed", 0],
  ["Success", 1],
  ["Skipped", 2],
]) {
  FetchActionResultStatus[name] = value;
  FetchActionResultStatus[value] = name;
}

/** Fallback timeout (ms) used wherever neither the call nor the options specify one. */
const DEFAULT_TIMEOUT_MS = 30000;

/** Normalizes "one value, many values, or nothing" into an array. */
function toArray(value) {
  if (!value) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Subscribes `handler` once per event in `events` (strings or RegExps only).
 * The handler receives only the first event argument.
 * @returns {Array<() => void>} functions that remove the subscriptions
 */
function subscribeOnce(context, events, handler) {
  const unsubscribers = [];
  for (const event of events) {
    if (typeof event !== "string" && !(event instanceof RegExp)) continue;
    const listener = (...args) => {
      handler(args[0]);
    };
    context.eventBus.once(event, listener);
    unsubscribers.push(() => context.eventBus.off(event, listener));
  }
  return unsubscribers;
}

/**
 * Subscribes `handler` to every event in `events` (strings or RegExps only).
 * The handler is called as `handler(event, payload)`.
 * @returns {Array<() => void>} functions that remove the subscriptions
 */
function subscribe(context, events, handler) {
  const unsubscribers = [];
  for (const event of events) {
    if (typeof event !== "string" && !(event instanceof RegExp)) continue;
    const listener = (payload) => handler(event, payload);
    context.eventBus.on(event, listener);
    unsubscribers.push(() => context.eventBus.off(event, listener));
  }
  return unsubscribers;
}

/**
 * Base class for all actions. Subclasses set static `id`, `returnType` and
 * `capabilities`, implement `onExecute`, and register via `FetchAction.register`.
 */
class FetchAction {
  /**
   * Registers an action class under its static `id`.
   * @throws {Error} when the class has no `id`
   */
  static register(actionClass) {
    const id = actionClass.id;
    if (!id) {
      throw new Error("FetchAction.register: actionClass.id is required");
    }
    this.registry.set(id, actionClass);
  }

  static get(id) {
    return this.registry.get(id);
  }

  /**
   * Instantiates a registered action.
   * @param {string|{id?: string, name?: string}} idOrOptions
   * @returns {FetchAction|undefined} undefined when nothing is registered under that id
   * @throws {Error} when no id or name can be determined
   */
  static create(idOrOptions) {
    const id =
      typeof idOrOptions === "string" ? idOrOptions : idOrOptions.id || idOrOptions.name;
    if (!id) {
      throw new Error("Action must have id or name");
    }
    const ActionClass = this.registry.get(id);
    return ActionClass ? new ActionClass() : undefined;
  }

  static has(id) {
    return this.registry.has(id);
  }

  static list() {
    return Array.from(this.registry.keys());
  }

  /** Capability of this action class for an engine mode; "noop" when undeclared. */
  static getCapability(mode) {
    return this.capabilities[mode] ?? "noop";
  }

  getCapability(mode) {
    return this.constructor.getCapability(mode);
  }

  get id() {
    return this.constructor.id;
  }

  get returnType() {
    return this.constructor.returnType;
  }

  get capabilities() {
    return this.constructor.capabilities;
  }

  /**
   * Calls `context.internal.engine[method](...args)`.
   * @throws {Error} when there is no engine or the engine lacks the method
   */
  async delegateToEngine(context, method, ...args) {
    const engine = context.internal.engine;
    if (!engine) {
      throw new Error("No engine available");
    }
    if (typeof engine[method] !== "function") {
      throw new Error(`Engine does not have a method named '${String(method)}'`);
    }
    return await engine[method](...args);
  }

  /**
   * Wires up the collectors listed in `options.collectors`. Each collector is an
   * action instance driven by event-bus events: `activateOn` triggers its
   * `onBeforeExec`, `collectOn` triggers `onExecute`, `deactivateOn` triggers
   * `onAfterExec`. A collector with none of those runs when this action ends.
   *
   * @returns {{cleanup: () => void, awaitExecPendings: () => Promise<void>}|undefined}
   */
  installCollectors(context, options) {
    const collectorConfigs = options?.collectors;
    if (!collectorConfigs?.length) return undefined;

    const unsubscribers = [];
    // Executions of collectors declared with `background: false`; awaited in afterExec.
    const pendingExecutions = new Set();

    for (const config of collectorConfigs) {
      const activateOn = toArray(config.activateOn);
      const collectOn = toArray(config.collectOn);
      const deactivateOn = toArray(config.deactivateOn);
      const isForeground = !(config.background ?? true);
      const collector = FetchAction.create(config);
      if (!collector) continue;

      const collectorName = () => config.id || config.name;
      let activated = false;
      let finished = false;
      let executionCount = 0;
      let collectUnsubscribers = [];

      const activate = async () => {
        if (activated || finished) return;
        activated = true;
        try {
          await collector.onBeforeExec?.(context, config);
        } catch (error) {
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: collector.id,
            phase: "before",
            error,
          });
        }
      };

      const collect = async (event, payload) => {
        if (finished) return;
        if (!activated) await activate();
        try {
          const pending = Promise.resolve(collector.onExecute?.(context, config, payload))
            .then((result) => {
              if (config.storeAs) {
                const outputs = context.outputs;
                (outputs[config.storeAs] || (outputs[config.storeAs] = [])).push(result);
              }
              context.eventBus.emit("collector:result", {
                action: this.id,
                collector: collectorName(),
                event,
                result,
              });
              return result;
            })
            .catch((error) => {
              context.eventBus.emit("collector:error", {
                action: this.id,
                collector: collectorName(),
                event,
                phase: "exec",
                error,
              });
            })
            .finally(() => {
              executionCount++;
            });
          if (isForeground) {
            pendingExecutions.add(pending);
            pending.finally(() => pendingExecutions.delete(pending));
          }
        } catch (error) {
          // Reached when onExecute throws synchronously.
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: collector.id,
            event,
            phase: "exec",
            error,
          });
        }
      };

      const deactivate = async () => {
        if (finished) return;
        // A collector that never ran gets one run before shutting down. Deliberately
        // not awaited (as in the original); `collect` reports its own errors.
        if (executionCount === 0) void collect("collector:after");
        finished = true;
        try {
          await collector.onAfterExec?.(context, config);
        } catch (error) {
          context.eventBus.emit("collector:error", {
            action: this.id,
            collector: collectorName(),
            phase: "after",
            error,
          });
        } finally {
          context.eventBus.emit("collector:end", {
            action: this.id,
            collector: collectorName(),
          });
          collectUnsubscribers.forEach((unsubscribe) => unsubscribe());
        }
      };

      const activateUnsubscribers = subscribeOnce(context, activateOn, activate);
      collectUnsubscribers = subscribe(context, collectOn, collect);
      const deactivateUnsubscribers = subscribeOnce(context, deactivateOn, deactivate);
      unsubscribers.push(
        ...activateUnsubscribers,
        ...collectUnsubscribers,
        ...deactivateUnsubscribers,
      );

      if (!activateOn.length && !collectOn.length && !deactivateOn.length) {
        const endEvent = `action:${this.id}.end`;
        const onActionEnd = () => {
          deactivate();
        };
        context.eventBus.once(endEvent, onActionEnd);
        // Fix: the original unsubscribed from "fetcher:action:end", an event this
        // listener was never attached to, so the listener was never removed.
        unsubscribers.push(() => context.eventBus.off(endEvent, onActionEnd));
      }
    }

    if (!unsubscribers.length && pendingExecutions.size === 0) return undefined;
    return {
      cleanup: () => unsubscribers.forEach((unsubscribe) => unsubscribe()),
      awaitExecPendings: async () => {
        if (pendingExecutions.size > 0) {
          await Promise.allSettled(Array.from(pendingExecutions));
        }
      },
    };
  }

  /**
   * Pushes this action onto `context.internal.actionStack`, emits the start
   * events, runs the optional `onBeforeExec` hook and installs collectors.
   * @returns {Promise<{entry: object, collectors: object|undefined}>}
   */
  async beforeExec(context, options) {
    if (!context.internal.actionStack) {
      context.internal.actionStack = [];
    }
    const stack = context.internal.actionStack;
    const depth = stack.length;
    const parent = stack.length > 0 ? stack[stack.length - 1].id : undefined;
    const frame = { ...options, id: this.id, depth, parent };
    stack.push(frame);
    context.currentAction = frame;

    const entry = { action: this, context, options, depth, stack: [...stack] };
    context.eventBus.emit(`action:${this.id}.start`, entry);
    context.eventBus.emit("action:start", entry);
    await this.onBeforeExec?.(context, options);
    return { entry, collectors: this.installCollectors(context, options) };
  }

  /**
   * Records the result on the context, runs `onAfterExec`, emits the end events,
   * then always cleans up collectors and pops the action stack.
   */
  async afterExec(context, options, result, scope) {
    const stack = context.internal.actionStack;
    const depth = stack.length - 1;
    const collectors = scope?.collectors;
    try {
      await collectors?.awaitExecPendings();
      context.lastResult = result;
      if (result?.returnType === "response" && !result.error) {
        context.lastResponse = result.result;
      }
      if (options?.storeAs) {
        context.outputs[options.storeAs] = result?.result;
      }
      if (result?.error) {
        context.currentAction.error = result.error;
      }
      await this.onAfterExec?.(context, options);

      const payload = { action: this, context, options, result, depth, stack: [...stack] };
      if (result?.error) {
        payload.error = result.error;
      }
      // Listener failures are deliberately ignored so they cannot break the action.
      try {
        context.eventBus.emit(`action:${this.id}.end`, payload);
      } catch {
        /* best effort */
      }
      try {
        context.eventBus.emit("action:end", payload);
      } catch {
        /* best effort */
      }
    } finally {
      try {
        collectors?.cleanup();
      } finally {
        stack.pop();
        const remaining = stack.length;
        context.currentAction = remaining > 0 ? stack[remaining - 1] : undefined;
      }
    }
  }

  /**
   * Runs the action: beforeExec -> onExecute -> afterExec.
   * A bare return value from `onExecute` is wrapped into a result object.
   *
   * NOTE(review): `context.throwHttpErrors` defaults to true when `failOnError` is
   * unset, yet a caught error is rethrown only when `failOnError` is explicitly
   * truthy; otherwise it is returned as a Failed result. Confirm this is intended.
   */
  async execute(context, options) {
    const scope = await this.beforeExec(context, options);
    let result;
    try {
      context.throwHttpErrors = options?.failOnError ?? true;
      result = await this.onExecute(context, options);
      if (!(result && result.returnType)) {
        result = {
          status: FetchActionResultStatus.Success,
          returnType: this.returnType ?? "any",
          result,
        };
      }
      return result;
    } catch (error) {
      result = {
        status: FetchActionResultStatus.Failed,
        error,
        meta: {
          id: this.id,
          engineType: context.engine,
          capability: this.getCapability(context.engine),
        },
      };
      if (options?.failOnError) throw error;
      return result;
    } finally {
      await this.afterExec(context, options, result, scope);
    }
  }
}
FetchAction.registry = new Map();
FetchAction.returnType = "any";
FetchAction.capabilities = { http: "noop", browser: "noop" };

// --- Session id generation (buffered random bytes mapped onto an alphabet) ---

const SESSION_ID_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz";
const SESSION_ID_LENGTH = 12;
const RANDOM_POOL_MULTIPLIER = 128;
let randomPool;
let randomPoolOffset;

/** Returns `byteCount` random bytes sliced from a pool that is refilled on demand. */
function takeRandomBytes(byteCount) {
  const count = byteCount | 0;
  if (!randomPool || randomPool.length < count) {
    randomPool = Buffer.allocUnsafe(count * RANDOM_POOL_MULTIPLIER);
    crypto.randomFillSync(randomPool);
    randomPoolOffset = 0;
  } else if (randomPoolOffset + count > randomPool.length) {
    crypto.randomFillSync(randomPool);
    randomPoolOffset = 0;
  }
  randomPoolOffset += count;
  return randomPool.subarray(randomPoolOffset - count, randomPoolOffset);
}

/**
 * Builds an id generator over `alphabet`. Random bytes are masked to the smallest
 * power-of-two range covering the alphabet; out-of-range values are skipped.
 */
function createIdGenerator(alphabet, defaultSize = 21, getRandom = takeRandomBytes) {
  const mask = (2 << (31 - Math.clz32((alphabet.length - 1) | 1))) - 1;
  const step = Math.ceil((1.6 * mask * defaultSize) / alphabet.length);
  return (size = defaultSize) => {
    let id = "";
    for (;;) {
      const bytes = getRandom(step);
      let index = step;
      while (index--) {
        id += alphabet[bytes[index] & mask] || "";
        if (id.length === size) return id;
      }
    }
  };
}

const generateSessionId = createIdGenerator(SESSION_ID_ALPHABET, SESSION_ID_LENGTH);

/** Creates a promise carrying a `release()` method that resolves it. */
function createLock() {
  let release = () => {};
  const lock = new Promise((resolve) => {
    release = resolve;
  });
  lock.release = release;
  return lock;
}

/** Copies a headers object with every key lower-cased; non-objects yield `{}`. */
function lowercaseHeaderKeys(headers) {
  const normalized = {};
  if (headers && typeof headers === "object") {
    for (const [key, value] of Object.entries(headers)) {
      normalized[key.toLowerCase()] = value;
    }
  }
  return normalized;
}

/** Converts a response body (string, Buffer, or anything else) to a string. */
function bodyToString(body) {
  if (typeof body === "string") return body;
  if (Buffer.isBuffer(body)) return body.toString("utf-8");
  return String(body ?? "");
}

/**
 * Parses the numeric part of `text`.
 * Fix: the original `parseFloat(...) || null` turned a legitimate 0 into null.
 * @returns {number|null} null only when no number can be parsed
 */
function parseNumber(text) {
  const value = parseFloat(text.replace(/[^0-9.-]+/g, ""));
  return Number.isNaN(value) ? null : value;
}

/** "true" and "1" (case-insensitive) are true; everything else is false. */
function parseBoolean(text) {
  const lowered = text.toLowerCase();
  return lowered === "true" || lowered === "1";
}

Configuration.getGlobalConfig().set("persistStorage", false);

/**
 * Base class for engines. An engine owns one crawler and one request queue;
 * `goto` enqueues a request, and while that request's handler is running the
 * engine executes actions dispatched through `actionEmitter` against the page.
 *
 * Subclasses provide: static `id` / `mode`, `buildResponse`, `executeAction`,
 * `_querySelectorAll`, `_extractValue`, `_createCrawler`,
 * `_getSpecificCrawlerOptions` and `goto`.
 */
class FetchEngine {
  constructor() {
    this.hdrs = {};
    this.jar = [];
    this.pendingRequests = new Map();
    this.requestCounter = 0;
    this.actionEmitter = new EventEmitter();
    this.isPageActive = false;
    // Starts out released so the first navigation does not wait.
    this.navigationLock = createLock();
    this.navigationLock.release();
    this.blockedTypes = new Set();
  }

  /**
   * Registers an engine class under its static `id`.
   * @throws {Error} when the id is missing or already registered
   */
  static register(engineClass) {
    const id = engineClass.id;
    if (!id) {
      throw new Error("Engine must define static id");
    }
    if (this.registry.has(id)) {
      throw new Error(`Engine id duplicated: ${id}`);
    }
    this.registry.set(id, engineClass);
  }

  static get(id) {
    return this.registry.get(id);
  }

  /** Returns the first registered engine class whose static `mode` equals `mode`. */
  static getByMode(mode) {
    for (const engineClass of this.registry.values()) {
      if (engineClass.mode === mode) return engineClass;
    }
    return undefined;
  }

  /**
   * Creates and initializes the engine named by `options.engine` (falling back to
   * `context.engine`), looked up by id first and by mode second.
   * Note: `options` is filled in place with defaults from `context` and the
   * package defaults.
   * @returns {Promise<FetchEngine|undefined>} undefined when no engine matches
   */
  static async create(context, options) {
    const resolved = defaultsDeep(options, context, DefaultFetcherProperties);
    const engineName = resolved.engine ?? context.engine;
    const EngineClass = engineName
      ? this.get(engineName) ?? this.getByMode(engineName)
      : null;
    if (!EngineClass) return undefined;
    const engine = new EngineClass();
    await engine.initialize(context, resolved);
    return engine;
  }

  /**
   * Recursively extracts data described by a normalized schema from `scope`.
   * "object" schemas yield an object (or null), "array" schemas an array, and
   * anything else a single value via `_extractValue`.
   */
  async _extract(schema, scope) {
    const type = schema.type;
    if (!scope) return type === "array" ? [] : null;

    if (type === "object") {
      const { selector, properties } = schema;
      let target = scope;
      if (selector) {
        const matches = await this._querySelectorAll(scope, selector);
        target = matches.length > 0 ? matches[0] : null;
      }
      if (!target) return null;
      const extracted = {};
      for (const key in properties) {
        extracted[key] = await this._extract(properties[key], target);
      }
      return extracted;
    }

    if (type === "array") {
      const { selector, items } = schema;
      const elements = selector ? await this._querySelectorAll(scope, selector) : [scope];
      const extracted = [];
      for (const element of elements) {
        extracted.push(await this._extract(items, element));
      }
      return extracted;
    }

    const { selector } = schema;
    let target = scope;
    if (selector) {
      const matches = await this._querySelectorAll(scope, selector);
      target = matches.length > 0 ? matches[0] : null;
    }
    return target ? this._extractValue(schema, target) : null;
  }

  waitFor(options) {
    return this.dispatchAction({ type: "waitFor", options });
  }

  click(selector) {
    return this.dispatchAction({ type: "click", selector });
  }

  fill(selector, value) {
    return this.dispatchAction({ type: "fill", selector, value });
  }

  submit(selector, options) {
    return this.dispatchAction({ type: "submit", selector, options });
  }

  pause(message) {
    return this.dispatchAction({ type: "pause", message });
  }

  extract(schema) {
    const normalized = this._normalizeSchema(schema);
    return this.dispatchAction({ type: "extract", schema: normalized });
  }

  /**
   * Returns a normalized JSON copy of `schema`:
   *  - array schemas always get `items` (from `attribute`, or `{type: "string"}`)
   *  - `has` / `exclude` are folded into the selector as `:has()` / `:not()`
   */
  _normalizeSchema(schema) {
    const copy = JSON.parse(JSON.stringify(schema));
    if (copy.properties) {
      for (const key in copy.properties) {
        copy.properties[key] = this._normalizeSchema(copy.properties[key]);
      }
    }
    if (copy.items) {
      copy.items = this._normalizeSchema(copy.items);
    }
    if (copy.type === "array") {
      if (copy.attribute && !copy.items) {
        copy.items = { attribute: copy.attribute };
        delete copy.attribute;
      }
      if (!copy.items) {
        copy.items = { type: "string" };
      }
    }
    if (copy.selector && (copy.has || copy.exclude)) {
      const { selector, has, exclude } = copy;
      copy.selector = selector
        .split(",")
        .map((part) => {
          let refined = part.trim();
          if (has) refined = `${refined}:has(${has})`;
          if (exclude) refined = `${refined}:not(${exclude})`;
          return refined;
        })
        .join(", ");
      delete copy.has;
      delete copy.exclude;
    }
    return copy;
  }

  get id() {
    return this.constructor.id;
  }

  get mode() {
    return this.constructor.mode;
  }

  get context() {
    return this.ctx;
  }

  /**
   * Binds the engine to `context` (merging `options` into it), opens a request
   * queue, builds the crawler and starts it in the background. No-op when the
   * engine is already initialized.
   */
  async initialize(context, options) {
    if (this.ctx) return;
    merge(context, options);
    this.ctx = context;
    this.opts = context;
    this.hdrs = lowercaseHeaderKeys(context.headers);
    this.jar = [...(context.cookies ?? [])];
    if (!context.internal) {
      context.internal = {};
    }
    context.internal.engine = this;
    context.engine = this.mode;
    this.requestQueue = await RequestQueue.open();

    const specificOptions = await this._getSpecificCrawlerOptions(context);
    const crawlerOptions = {
      ...defaultsDeep(specificOptions, {
        requestQueue: this.requestQueue,
        maxConcurrency: 1,
        minConcurrency: 1,
        useSessionPool: true,
        persistCookiesPerSession: true,
        sessionPoolOptions: {
          maxPoolSize: 1,
          persistenceOptions: { enable: false },
          sessionOptions: { maxUsageCount: 1000, maxErrorScore: 3 },
        },
      }),
      requestHandler: this._requestHandler.bind(this),
      errorHandler: this._failedRequestHandler.bind(this),
      failedRequestHandler: this._failedRequestHandler.bind(this),
    };
    this.crawler = this._createCrawler(crawlerOptions);
    // Runs in the background; the outcome is only recorded and logged.
    this.crawler
      .run()
      .then(() => {
        this.isCrawlerReady = true;
      })
      .catch((error) => {
        this.isCrawlerReady = false;
        console.error("Crawler background error:", error);
      });
  }

  /** Runs the optional subclass `_cleanup`, the shared cleanup, then detaches from the context. */
  async cleanup() {
    await this._cleanup?.();
    await this._commonCleanup();
    const context = this.ctx;
    if (context && context.internal?.engine === this) {
      context.internal.engine = undefined;
    }
    this.ctx = undefined;
    this.opts = undefined;
  }

  /**
   * Keeps the crawler's request handler alive: executes every dispatched action
   * against `crawlingContext` until a "dispose" action arrives.
   */
  async _executePendingActions(crawlingContext) {
    await new Promise((done) => {
      const onDispatch = async ({ action, resolve, reject }) => {
        try {
          if (action.type === "dispose") {
            this.actionEmitter.emit("dispose");
            resolve();
            return;
          }
          resolve(await this.executeAction(crawlingContext, action));
        } catch (error) {
          reject(error);
        }
      };
      this.actionEmitter.on("dispatch", onDispatch);
      this.actionEmitter.once("dispose", () => {
        this.actionEmitter.removeListener("dispatch", onDispatch);
        done();
      });
    });
  }

  /**
   * Settles the pending `goto` for this request (rejecting on a missing or >= 400
   * status when `throwHttpErrors` is set), then serves dispatched actions until
   * disposed.
   */
  async _sharedRequestHandler(crawlingContext) {
    try {
      const { request } = crawlingContext;
      this.isPageActive = true;
      const requestId = request.userData.requestId;
      const pending = this.pendingRequests.get(requestId);
      if (pending) {
        const response = await this.buildResponse(crawlingContext);
        const isHttpError = !response.statusCode || response.statusCode >= 400;
        if (this.ctx?.throwHttpErrors && isHttpError) {
          pending.reject(
            new CommonError(
              `Request for ${response.finalUrl} failed with status ${response.statusCode || "N/A"}`,
              "request",
              response.statusCode,
            ),
          );
        } else {
          this.lastResponse = response;
          pending.resolve(response);
        }
        this.pendingRequests.delete(requestId);
      }
      await this._executePendingActions(crawlingContext);
    } finally {
      this.isPageActive = false;
      this.navigationLock.release();
    }
  }

  /**
   * Crawler error path. When `throwHttpErrors` is set the pending `goto` is
   * rejected; the request is then processed like a normal one.
   */
  async _sharedFailedRequestHandler(crawlingContext, error) {
    const { request } = crawlingContext;
    const requestId = request.userData.requestId;
    const pending = this.pendingRequests.get(requestId);
    if (pending && error && this.ctx?.throwHttpErrors) {
      this.pendingRequests.delete(requestId);
      const response = error.response;
      const statusCode = response?.statusCode || 500;
      const url = response?.url ? response.url : request.url;
      pending.reject(
        new CommonError(
          `Request${url ? " for " + url : ""} failed: ${error.message}`,
          "request",
          statusCode,
        ),
      );
    }
    return this._sharedRequestHandler(crawlingContext);
  }

  /**
   * Sends an action to the active request handler.
   * @throws {Error} when no page is active
   */
  async dispatchAction(action) {
    if (!this.isPageActive) {
      throw new Error("No active page. Call goto() before performing actions.");
    }
    return new Promise((resolve, reject) => {
      this.actionEmitter.emit("dispatch", { action, resolve, reject });
    });
  }

  async _requestHandler(crawlingContext) {
    await this._sharedRequestHandler(crawlingContext);
  }

  async _failedRequestHandler(crawlingContext, error) {
    await this._sharedFailedRequestHandler(crawlingContext, error);
  }

  /** Disposes the active page, rejects outstanding requests, tears down crawler and queue. */
  async _commonCleanup() {
    if (this.isPageActive) {
      // Best effort: a failed dispose must not block the rest of the cleanup.
      await this.dispatchAction({ type: "dispose" }).catch(() => {});
    }
    for (const [, pending] of this.pendingRequests) {
      pending.reject(new Error("Cleanup:Request cancelled"));
    }
    this.actionEmitter.removeAllListeners();
    if (this.crawler) {
      try {
        await this.crawler.teardown?.();
      } catch (error) {
        // Fix: log message typo ("ccrawler").
        console.error("crawler teardown error:", error);
      }
      this.crawler = undefined;
    }
    this.isCrawlerReady = undefined;
    if (this.requestQueue) {
      await this.requestQueue.drop();
      this.requestQueue = undefined;
    }
    this.pendingRequests.clear();
  }

  /**
   * Adds resource types to the blocked set (clearing it first when `overwrite`).
   * @returns {Promise<number>} number of types passed in
   */
  async blockResources(types, overwrite) {
    if (overwrite) {
      this.blockedTypes.clear();
    }
    types.forEach((type) => this.blockedTypes.add(type));
    return types.length;
  }

  getContent() {
    return this.lastResponse
      ? Promise.resolve(this.lastResponse)
      : Promise.reject(new Error("No content fetched yet. Call goto() first."));
  }

  /**
   * Header accessor/mutator (names are stored lower-cased):
   *  - `headers()`               -> copy of all headers
   *  - `headers(name)`           -> value, or "" when unset
   *  - `headers(obj, replace?)`  -> merge (or replace when `replace === true`); returns true
   *  - `headers(name, value)`    -> set a string value; `null` deletes; returns true
   * Any other first argument returns false.
   */
  async headers(nameOrHeaders, value) {
    if (nameOrHeaders === undefined) {
      return { ...this.hdrs };
    }
    if (typeof nameOrHeaders === "string" && value === undefined) {
      return this.hdrs[nameOrHeaders.toLowerCase()] || "";
    }
    if (nameOrHeaders !== null && typeof nameOrHeaders === "object") {
      const incoming = {};
      for (const [key, headerValue] of Object.entries(nameOrHeaders)) {
        incoming[key.toLowerCase()] = String(headerValue);
      }
      this.hdrs = value === true ? incoming : { ...this.hdrs, ...incoming };
      return true;
    }
    if (typeof nameOrHeaders !== "string") {
      return false;
    }
    if (typeof value === "string") {
      this.hdrs[nameOrHeaders.toLowerCase()] = value;
    } else if (value === null) {
      delete this.hdrs[nameOrHeaders.toLowerCase()];
    }
    return true;
  }

  /** `cookies(array)` replaces the jar, `cookies(null)` clears it, otherwise returns a copy. */
  async cookies(cookies) {
    if (Array.isArray(cookies)) {
      this.jar = [...cookies];
      return true;
    }
    if (cookies === null) {
      this.jar = [];
      return true;
    }
    return [...this.jar];
  }

  async dispose() {
    await this.cleanup();
  }
}
FetchEngine.registry = new Map();

/**
 * Finds the site entry for `url`: an exact hostname match wins, otherwise the
 * first entry whose domain is a parent domain of the hostname. An entry with a
 * non-empty `pathScope` only applies when the path starts with one of its prefixes.
 *
 * Fix: the original used a bare `hostname.endsWith(domain)`, so "notexample.com"
 * matched a site registered for "example.com"; the match now requires a dot
 * boundary.
 */
function findSiteForUrl(url, sites) {
  if (!url || !sites?.length) return null;
  const parsed = new URL(url);
  const hostname = parsed.hostname;
  const isParentDomain = (domain) =>
    typeof domain === "string" &&
    domain.length > 0 &&
    hostname.endsWith(domain.startsWith(".") ? domain : `.${domain}`);

  const site =
    sites.find((candidate) => candidate.domain === hostname) ||
    sites.find((candidate) => isParentDomain(candidate.domain));
  if (!site) return null;
  if (site.pathScope?.length) {
    const inScope = site.pathScope.some((prefix) => parsed.pathname.startsWith(prefix));
    if (!inScope) return null;
  }
  return site;
}

/**
 * Creates the engine for a context: `context.engine`, else the matching site's
 * engine, else "auto"; falls back to the "http" engine when that yields nothing.
 */
async function createEngineForContext(context, options) {
  const site = findSiteForUrl(options?.url || context.url, context.sites);
  const engineName = context.engine || site?.engine || "auto";
  let engine = await FetchEngine.create(context, { engine: engineName });
  if (!engine) {
    engine = await FetchEngine.create(context, { engine: "http" });
  }
  return engine;
}

/** One fetch session: a context, a lazily created engine, and sequential action execution. */
class FetchSession {
  constructor(options = {}) {
    this.options = options;
    this.closed = false;
    this.id = generateSessionId();
    this.context = this.createContext(options);
  }

  /**
   * Executes one action, creating the engine first if needed.
   * @throws {Error} when the session is closed or the action is unknown
   */
  async execute(actionOptions) {
    await this.ensureEngine(actionOptions);
    const action = FetchAction.create(actionOptions);
    if (!action) {
      throw new Error(`Unknown action: ${actionOptions.id || actionOptions.name}`);
    }
    const internal = this.context.internal;
    internal.actionIndex = (internal.actionIndex || 0) + 1;
    this.context.currentAction = {
      ...actionOptions,
      index: internal.actionIndex,
      startedAt: Date.now(),
    };
    try {
      return await action.execute(this.context, actionOptions);
    } finally {
      this.context.currentAction = undefined;
    }
  }

  /**
   * Executes the actions in order, then a final `getContent`.
   * @returns {Promise<{result: any, outputs: object}>}
   */
  async executeAll(actions) {
    for (const actionOptions of actions) {
      await this.execute(actionOptions);
    }
    const content = await this.execute({ id: "getContent" });
    return { result: content?.result, outputs: this.getOutputs() };
  }

  getOutputs() {
    return this.context.outputs;
  }

  /** Disposes the engine (if any) and marks the session closed; safe to call twice. */
  async dispose() {
    if (this.closed) return;
    const eventBus = this.context.eventBus;
    eventBus.emit("session:closing", { sessionId: this.id });
    try {
      await this.context.internal.engine?.dispose();
    } finally {
      this.closed = true;
    }
    eventBus.emit("session:closed", { sessionId: this.id });
  }

  async ensureEngine(actionOptions) {
    if (this.closed) {
      throw new Error("Session is closed");
    }
    if (this.context.internal.engine) return;
    const url = actionOptions?.params?.url ?? this.context.url;
    const engine = await createEngineForContext(this.context, { url });
    if (!engine) {
      throw new Error("No engine found");
    }
  }

  /** Builds the session context: options + runtime fields, completed with package defaults. */
  createContext(options = this.options) {
    const eventBus = new EventEmitter();
    return defaultsDeep(
      {
        ...options,
        id: this.id,
        eventBus,
        outputs: {},
        internal: {},
        execute: async (actionOptions) => this.execute(actionOptions),
        // A regular function on purpose: `this` is the context it is called on.
        action: async function (name, params, extra) {
          return this.execute({ name, params, ...extra });
        },
      },
      DefaultFetcherProperties,
    );
  }
}

/** High-level API: creates a session per fetch and disposes it afterwards. */
class WebFetcher {
  constructor(defaults = {}) {
    this.defaults = defaults;
  }

  async createSession(options) {
    return new FetchSession({ ...this.defaults, ...(options || {}) });
  }

  /**
   * `fetch(url, options)` or `fetch(options)` (url taken from `options.url`).
   * A `goto` for the url is prepended unless the first action already is one.
   *
   * Fix: the original prepended to `options.actions` in place, mutating the
   * caller's array on every call; a copy is used now.
   */
  async fetch(urlOrOptions, options) {
    let url = urlOrOptions;
    let fetchOptions = options;
    if (typeof urlOrOptions !== "string") {
      fetchOptions = urlOrOptions;
      url = fetchOptions.url;
    }
    const session = await this.createSession(fetchOptions);
    try {
      const actions = [...(fetchOptions?.actions || [])];
      const gotoIndex = actions.findIndex(
        (action) => action.id === "goto" && action.params?.url === url,
      );
      if (url && gotoIndex !== 0) {
        actions.unshift({ id: "goto", params: { url } });
      }
      return await session.executeAll(actions);
    } finally {
      await session.dispose();
    }
  }
}

/** HTTP engine: fetches through Crawlee's CheerioCrawler and simulates page actions. */
class CheerioFetchEngine extends FetchEngine {
  /** Builds the response object; the Cheerio-serialized HTML wins over the raw body when they differ. */
  async buildResponse(crawlingContext) {
    const { request, response, body, $ } = crawlingContext;
    const serialized = $?.html();
    let html = bodyToString(body);
    if (serialized && serialized !== html) {
      html = serialized;
    }
    return {
      url: request.url,
      finalUrl: request.loadedUrl || request.url,
      statusCode: response?.statusCode ?? 200,
      statusText: response?.statusMessage,
      headers: response?.headers,
      body,
      html,
      text: html,
    };
  }

  async _querySelectorAll(scope, selector) {
    const { $, el } = scope;
    return el
      .find(selector)
      .toArray()
      .map((node) => ({ $, el: $(node) }));
  }

  /** Reads an attribute, inner HTML (`type: "html"`) or trimmed text, then coerces by `type`. */
  async _extractValue(schema, scope) {
    const { el } = scope;
    const { attribute, type = "string" } = schema;
    if (el.length === 0) return null;

    let raw;
    if (attribute) {
      raw = el.attr(attribute) ?? null;
    } else if (type === "html") {
      raw = el.html();
    } else {
      raw = el.text().trim();
    }
    if (raw === null) return null;

    switch (type) {
      case "number":
        return parseNumber(raw);
      case "boolean":
        return parseBoolean(raw);
      default:
        return raw;
    }
  }

  /** Executes one dispatched action against the Cheerio crawling context. */
  async executeAction(crawlingContext, action) {
    const { $ } = crawlingContext;
    const requireCheerio = (name) => {
      if (!$) {
        throw new CommonError(`Cheerio context not available for action: ${action.type}`, name);
      }
    };
    const baseUrl = () => crawlingContext.request.loadedUrl || crawlingContext.request.url;

    switch (action.type) {
      case "dispose":
        return undefined;

      case "extract":
        requireCheerio("extract");
        return this._extract(action.schema, { $, el: $.root() });

      case "click": {
        requireCheerio("click");
        const selector = action.selector;
        const element = $(selector).first();
        let targetUrl;
        if (element.length === 0) {
          // Nothing matched: treat the "selector" as a URL to navigate to.
          try {
            targetUrl = new URL(selector, baseUrl()).href;
          } catch {
            throw new CommonError(
              `click: selector not found or invalid URL: ${selector}`,
              "click",
            );
          }
        } else if (element.is("a") && element.attr("href")) {
          targetUrl = new URL(element.attr("href"), baseUrl()).href;
        } else if (
          element.is('input[type="submit"], button[type="submit"], button, input')
        ) {
          const form = element.closest("form");
          if (form.length) {
            return this.executeAction(crawlingContext, { type: "submit", selector: form });
          }
          throw new CommonError("click: submit-like element without form", "click");
        } else {
          throw new CommonError(
            `click: unsupported element for http simulate. Selector: ${selector}`,
            "click",
          );
        }
        const navigation = await crawlingContext.sendRequest({ url: targetUrl });
        this._updateStateAfterNavigation(crawlingContext, navigation);
        return undefined;
      }

      case "fill": {
        // Fix: the original wrote `throw new CommonError(msg),"fill"`, leaving the
        // error name outside the constructor call.
        requireCheerio("fill");
        const field = $(action.selector).first();
        if (field.length === 0) {
          throw new CommonError(`fill: selector not found: ${action.selector}`, "fill");
        }
        if (!field.is("input, textarea, select")) {
          throw new CommonError(`fill: not a form field: ${action.selector}`, "fill");
        }
        field.val(action.value);
        // Fix: buildResponse is async; the original stored the un-awaited Promise.
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "waitFor":
        if (action.options?.ms) {
          await new Promise((resolve) => setTimeout(resolve, action.options.ms));
        }
        return undefined;

      case "pause": {
        const onPause = this.ctx?.onPause;
        if (onPause) {
          console.info(action.message || "Execution paused for manual intervention.");
          await onPause({ message: action.message });
          console.info("Resuming execution...");
        } else {
          console.warn(
            "[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped.",
          );
        }
        return undefined;
      }

      case "submit": {
        requireCheerio("submit");
        let form;
        if (typeof action.selector === "string") {
          form = $(action.selector).first();
        } else if (action.selector != null) {
          // Already a Cheerio selection (passed by the click simulation).
          form = action.selector;
        } else {
          form = $("form").first();
        }
        if (form.length === 0) {
          throw new NotFoundError(action.selector, "submit");
        }
        const formAction = form.attr("action") || baseUrl();
        const method = (form.attr("method") || "GET").toUpperCase();
        const submitUrl = new URL(formAction, baseUrl()).href;

        const fields = {};
        form.find("input, select, textarea").each((_index, node) => {
          const field = $(node);
          const name = field.attr("name");
          if (!name) return;
          const value = field.val();
          if (value != null) {
            fields[name] = String(value);
          }
        });

        let navigation;
        if (method === "GET") {
          const url = new URL(submitUrl);
          Object.entries(fields).forEach(([name, value]) => url.searchParams.set(name, value));
          navigation = await crawlingContext.sendRequest({ url: url.href, method: "GET" });
        } else {
          const enctype =
            action.options?.enctype ||
            form.attr("enctype") ||
            "application/x-www-form-urlencoded";
          const headers = {};
          let body;
          if (enctype === "application/json") {
            body = JSON.stringify(fields);
            headers["Content-Type"] = "application/json";
          } else {
            body = new URLSearchParams(fields).toString();
            headers["Content-Type"] = "application/x-www-form-urlencoded";
          }
          // Every non-GET form method is sent as POST.
          navigation = await crawlingContext.sendRequest({
            url: submitUrl,
            method: "POST",
            body,
            headers,
          });
        }
        this._updateStateAfterNavigation(crawlingContext, navigation);
        return undefined;
      }

      case "getContent":
        return this.buildResponse(crawlingContext);

      default:
        throw new CommonError(
          `Unknown action type: ${action.type}`,
          "CheerioFetchEngine.executeAction",
          ErrorCode.NotSupported,
        );
    }
  }

  /** Stores the result of a simulated navigation; reloads `$` when the content type is HTML. */
  _updateStateAfterNavigation(crawlingContext, navigation) {
    const response = navigation.response || navigation;
    const { body, headers, statusCode, statusMessage } = response;
    const { url, loadedUrl } = navigation;
    const html = bodyToString(body);
    if (headers && headers["content-type"]?.includes("html")) {
      crawlingContext.$ = cheerio.load(html);
    }
    this.lastResponse = {
      url,
      finalUrl: loadedUrl || url,
      statusCode,
      statusText: statusMessage,
      headers: headers || {},
      body,
      html,
      text: html,
    };
  }

  _createCrawler(options) {
    return new CheerioCrawler(options);
  }

  _getSpecificCrawlerOptions(context) {
    const proxy = this.opts?.proxy;
    let proxyUrls;
    if (proxy) {
      proxyUrls = typeof proxy === "string" ? [proxy] : proxy;
    }
    const proxyConfiguration = proxyUrls?.length
      ? new ProxyConfiguration({ proxyUrls })
      : undefined;
    return {
      additionalMimeTypes: ["text/plain"],
      maxRequestRetries: 1,
      requestHandlerTimeoutSecs: Math.max(
        5,
        Math.floor((this.opts?.timeoutMs || DEFAULT_TIMEOUT_MS) / 1000),
      ),
      proxyConfiguration,
      preNavigationHooks: [
        (_crawlingContext, gotOptions) => {
          gotOptions.throwHttpErrors = context.throwHttpErrors;
          if (this.opts?.timeoutMs) {
            gotOptions.timeout = { request: this.opts.timeoutMs };
          }
        },
      ],
    };
  }

  /**
   * Enqueues a request for `url` and resolves with its response. Any active page
   * is disposed first, and the call waits for the previous handler to finish.
   * @throws {CommonError} `gotoTimeout` when no response arrives in time
   */
  async goto(url, options) {
    if (this.isPageActive) {
      this.dispatchAction({ type: "dispose" }).catch(() => {});
    }
    const requestId = "req-" + ++this.requestCounter;
    const timeoutMs = options?.timeoutMs || this.opts?.timeoutMs || DEFAULT_TIMEOUT_MS;

    const responsePromise = new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        this.pendingRequests.delete(requestId);
        this.navigationLock.release();
        reject(
          new CommonError(
            `goto timed out after ${timeoutMs}ms.`,
            "gotoTimeout",
            ErrorCode.RequestTimeout,
          ),
        );
      }, timeoutMs);
      this.pendingRequests.set(requestId, {
        resolve: (response) => {
          clearTimeout(timer);
          resolve(response);
        },
        reject: (error) => {
          clearTimeout(timer);
          reject(error);
        },
      });
    });

    // Not awaited: an enqueue failure is routed to the pending request instead.
    this.requestQueue
      .addRequest({
        ...options,
        url,
        headers: { ...this.hdrs, ...options?.headers },
        userData: { requestId },
        uniqueKey: `${url}-${requestId}`,
      })
      .catch((error) => {
        const pending = this.pendingRequests.get(requestId);
        if (pending) {
          this.pendingRequests.delete(requestId);
          this.navigationLock.release();
          pending.reject(error);
        }
      });

    await this.navigationLock;
    this.navigationLock = createLock();
    return responsePromise;
  }
}
CheerioFetchEngine.id = "cheerio";
CheerioFetchEngine.mode = "http";
FetchEngine.register(CheerioFetchEngine);

/**
 * Shapes a jar cookie for `BrowserContext.addCookies`.
 *
 * Fix: the original always sent both `url` and `domain`. As far as I recall,
 * Playwright accepts either a `url` or a `domain`/`path` pair and rejects a
 * cookie carrying both — confirm against the addCookies documentation. Either
 * shape produced here is valid input regardless.
 */
function toPlaywrightCookie(cookie, requestUrl) {
  const { url, ...rest } = cookie;
  if (rest.domain || rest.path) {
    return {
      ...rest,
      domain: rest.domain || new URL(url || requestUrl).hostname,
      path: rest.path || "/",
    };
  }
  return { ...rest, url: url || requestUrl };
}

/** Browser engine: drives pages through Crawlee's PlaywrightCrawler. */
class PlaywrightFetchEngine extends FetchEngine {
  async buildResponse(crawlingContext) {
    const { page, response, request } = crawlingContext;
    if (!page || page.isClosed()) {
      return {
        url: request.url,
        finalUrl: request.loadedUrl || request.url,
        statusCode: response?.status(),
        statusText: response?.statusText(),
        headers: (await response?.allHeaders()) || {},
        body: "",
        html: "",
        text: "",
      };
    }
    const html = await page.content();
    const text = await page.textContent("body");
    return {
      url: page.url(),
      finalUrl: page.url(),
      statusCode: response?.status(),
      statusText: response?.statusText(),
      headers: (await response?.allHeaders()) || {},
      body: html,
      html,
      text: text || "",
    };
  }

  async _querySelectorAll(scope, selector) {
    return scope.locator(selector).all();
  }

  /** Reads an attribute, inner HTML (`type: "html"`) or text content, trims it, then coerces by `type`. */
  async _extractValue(schema, locator) {
    const { attribute, type = "string" } = schema;
    if ((await locator.count()) === 0) return null;

    let raw;
    if (attribute) {
      raw = await locator.getAttribute(attribute);
    } else if (type === "html") {
      raw = await locator.innerHTML();
    } else {
      raw = await locator.textContent();
    }
    if (raw === null) return null;
    raw = raw.trim();

    switch (type) {
      case "number":
        return parseNumber(raw);
      case "boolean":
        return parseBoolean(raw);
      default:
        return raw;
    }
  }

  /** Executes one dispatched action against the Playwright page. */
  async executeAction(crawlingContext, action) {
    const { page } = crawlingContext;
    const timeout = this.opts?.timeoutMs || DEFAULT_TIMEOUT_MS;

    switch (action.type) {
      case "navigate": {
        const navigationResponse = await page.goto(action.url, {
          waitUntil: action.opts?.waitUntil || "domcontentloaded",
          timeout,
        });
        const responseContext = navigationResponse
          ? { ...crawlingContext, response: navigationResponse }
          : crawlingContext;
        const response = await this.buildResponse(responseContext);
        this.lastResponse = response;
        return response;
      }

      case "extract":
        return this._extract(action.schema, page.locator("body"));

      case "click": {
        await page.click(action.selector, { timeout });
        await page.waitForLoadState("networkidle", { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "fill": {
        await page.fill(action.selector, action.value, { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "waitFor":
        if (action.options?.selector) {
          await page.waitForSelector(action.options.selector, { timeout });
        }
        if (action.options?.networkIdle) {
          await page.waitForLoadState("networkidle", { timeout });
        }
        if (action.options?.ms) {
          await page.waitForTimeout(action.options.ms);
        }
        return undefined;

      case "submit": {
        const selector = action.selector || "form";
        const form = page.locator(selector).first();
        if ((await form.count()) === 0) {
          throw new NotFoundError(selector, "submit");
        }
        const enctype = action.options?.enctype || "application/x-www-form-urlencoded";
        if (enctype === "application/json") {
          const handle = await form.elementHandle();
          if (!handle) {
            throw new CommonError(
              `submit: could not get form handle for ${selector}`,
              "submit",
            );
          }
          // Runs inside the page: posts the form fields as JSON via fetch().
          const response = await handle.evaluate(async (formElement) => {
            const formData = new FormData(formElement);
            const payload = {};
            formData.forEach((value, key) => {
              payload[key] = value.toString();
            });
            const fetched = await fetch(formElement.action, {
              method: formElement.method,
              headers: { "Content-Type": "application/json" },
              body: JSON.stringify(payload),
            });
            const text = await fetched.text();
            return {
              status: fetched.status,
              statusText: fetched.statusText,
              headers: Object.fromEntries(fetched.headers.entries()),
              body: text,
              html: text,
              text,
              url: formElement.action,
              finalUrl: fetched.url,
            };
          });
          await handle.dispose();
          await page.setContent(response.html);
          this.lastResponse = response;
          return undefined;
        }
        await form.evaluate((formElement) => formElement.submit());
        await page.waitForLoadState("networkidle", { timeout });
        this.lastResponse = await this.buildResponse(crawlingContext);
        return undefined;
      }

      case "pause": {
        const onPause = this.ctx?.onPause;
        if (onPause) {
          console.info(action.message || "Execution paused for manual intervention.");
          await onPause({ message: action.message });
          console.info("Resuming execution...");
        } else {
          console.warn(
            "[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped.",
          );
        }
        return undefined;
      }

      case "getContent":
        return this.buildResponse(crawlingContext);

      default:
        throw new CommonError(
          `Unknown action type: ${action.type}`,
          "PlaywrightFetchEngine.executeAction",
          ErrorCode.NotSupported,
        );
    }
  }

  _createCrawler(options) {
    return new PlaywrightCrawler(options);
  }

  async _getSpecificCrawlerOptions(context) {
    const headless = context.browser?.headless ?? true;
    const crawlerOptions = {
      maxRequestRetries: context.retries || 3,
      headless,
      preNavigationHooks: [
        async ({ page, request }, gotoOptions) => {
          gotoOptions.throwHttpErrors = context.throwHttpErrors;
          if (this.jar.length > 0) {
            await page
              .context()
              .addCookies(this.jar.map((cookie) => toPlaywrightCookie(cookie, request.url)));
          }
          const blockedTypes = this.blockedTypes;
          if (blockedTypes.size > 0) {
            await page.route("**/*", (route) => {
              if (blockedTypes.has(route.request().resourceType())) {
                route.abort();
              } else {
                route.continue();
              }
            });
          }
        },
      ],
    };
    if (this.opts?.antibot) {
      // Antibot mode: launch Firefox with camoufox launch options, fingerprints off.
      crawlerOptions.browserPoolOptions = { useFingerprints: false };
      const camoufoxLaunchOptions = await launchOptions({ headless });
      crawlerOptions.launchContext = {
        launcher: firefox,
        launchOptions: camoufoxLaunchOptions,
      };
      crawlerOptions.postNavigationHooks = [
        async ({ handleCloudflareChallenge }) => {
          await handleCloudflareChallenge();
        },
      ];
    }
    return crawlerOptions;
  }

  /**
   * Navigates to `url`: inside the live page when one is active, otherwise by
   * enqueueing a new request.
   * @throws {CommonError} when the request queue is not initialized
   */
  async goto(url, options) {
    if (this.isPageActive) {
      return this.dispatchAction({ type: "navigate", url, opts: options });
    }
    if (!this.requestQueue) {
      throw new CommonError("RequestQueue not initialized", "goto");
    }
    const requestId = "req-" + ++this.requestCounter;
    const responsePromise = new Promise((resolve, reject) => {
      this.pendingRequests.set(requestId, { resolve, reject });
    });
    await this.requestQueue.addRequest({
      url,
      headers: this.hdrs,
      userData: { requestId, waitUntil: options?.waitUntil || "domcontentloaded" },
      uniqueKey: `${url}-${requestId}`,
    });
    return responsePromise;
  }
}
PlaywrightFetchEngine.id = "playwright";
PlaywrightFetchEngine.mode = "browser";
FetchEngine.register(PlaywrightFetchEngine);

// --- Built-in actions (registration order defines FetchAction.list() order) ---

/** `click`: requires `params.selector`. */
class ClickAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, ...rest } = options?.params || {};
    if (!selector) {
      throw new Error("Selector is required for click action");
    }
    await this.delegateToEngine(context, "click", selector, rest);
  }
}
ClickAction.id = "click";
ClickAction.returnType = "none";
ClickAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(ClickAction);

/** `fill`: requires `params.selector` and a defined `params.value`. */
class FillAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, value, ...rest } = options?.params || {};
    if (!selector) {
      throw new Error("Selector is required for fill action");
    }
    if (value === undefined) {
      throw new Error("Value is required for fill action");
    }
    await this.delegateToEngine(context, "fill", selector, value, rest);
  }
}
FillAction.id = "fill";
FillAction.returnType = "none";
FillAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(FillAction);

/** `getContent`: returns the engine's current response. */
class GetContentAction extends FetchAction {
  async onExecute(context, options) {
    return await this.delegateToEngine(context, "getContent", options?.params);
  }
}
GetContentAction.id = "getContent";
GetContentAction.returnType = "response";
GetContentAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(GetContentAction);

/** `goto`: navigates to `params.url` (or `context.url`) and records it on the context. */
class GotoAction extends FetchAction {
  async onExecute(context, options) {
    const params = options?.params;
    const url = params?.url || context.url;
    if (!url) {
      throw new Error("URL is required for goto action");
    }
    const engine = context.internal.engine;
    if (!engine) {
      throw new Error("No engine available");
    }
    context.url = url;
    return await engine.goto(url, params);
  }
}
GotoAction.id = "goto";
GotoAction.returnType = "response";
GotoAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(GotoAction);

/** `submit`: submits the form at `params.selector` (engine default when omitted). */
class SubmitAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, ...rest } = options?.params || {};
    await this.delegateToEngine(context, "submit", selector, rest);
  }
}
SubmitAction.id = "submit";
SubmitAction.returnType = "none";
SubmitAction.capabilities = { http: "simulate", browser: "native" };
FetchAction.register(SubmitAction);

/** `waitFor`: forwards `params` to the engine's `waitFor`. */
class WaitForAction extends FetchAction {
  async onExecute(context, options) {
    const engine = context.internal.engine;
    if (!engine) {
      throw new Error("No engine available");
    }
    await engine.waitFor(options?.params);
  }
}
WaitForAction.id = "waitFor";
WaitForAction.returnType = "none";
WaitForAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(WaitForAction);

/** `extract`: `params` is the extraction schema. */
class ExtractAction extends FetchAction {
  async onExecute(context, options) {
    const schema = options?.params;
    if (!schema) {
      throw new Error("Schema is required for extract action");
    }
    return this.delegateToEngine(context, "extract", schema);
  }
}
ExtractAction.id = "extract";
ExtractAction.returnType = "any";
ExtractAction.capabilities = { http: "native", browser: "native" };
FetchAction.register(ExtractAction);

/**
 * `pause`: browser engines only. When `params.selector` is given, the pause
 * happens only if extracting that selector yields a truthy value.
 */
class PauseAction extends FetchAction {
  async onExecute(context, options) {
    const { selector, message, attribute } = options?.params || {};
    const engine = context.internal.engine;
    if (engine?.mode !== "browser") {
      console.warn("[PauseAction] can only run in browser engine. Skipped.");
      return;
    }
    if (selector) {
      const matched = await engine.extract({ selector, attribute });
      if (!matched) return;
    }
    if ("pause" in engine) {
      await engine.pause(message);
    } else {
      console.warn(
        "[PauseAction] was called, but the current engine does not support `pause`. Skipped.",
      );
    }
  }
}
PauseAction.id = "pause";
PauseAction.capabilities = { http: "native", browser: "native" };
PauseAction.returnType = "none";
FetchAction.register(PauseAction);

/** Convenience wrapper: `new WebFetcher().fetch(url, options)`. */
async function fetchWeb(url, options) {
  return new WebFetcher().fetch(url, options);
}

export {
  CheerioFetchEngine,
  ClickAction,
  DefaultFetcherProperties,
  ExtractAction,
  FetchAction,
  FetchActionResultStatus,
  FetchEngine,
  FetchSession,
  FillAction,
  GetContentAction,
  GotoAction,
  PauseAction,
  PlaywrightFetchEngine,
  SubmitAction,
  WaitForAction,
  WebFetcher,
  fetchWeb,
};
|
|
1
|
+
/**
 * @isdk/web-fetcher — bundled ESM build.
 *
 * Layout of this module, top to bottom:
 *   - default option table, option-key list and result-status enum
 *   - FetchAction: base class + registry for declarative actions
 *   - FetchEngine: base class + registry for crawler-backed engines
 *   - FetchSession / WebFetcher: session lifecycle and the one-shot fetch API
 *   - CheerioFetchEngine ("http" mode) and PlaywrightFetchEngine ("browser" mode)
 *   - built-in actions: click, fill, getContent, goto, submit, waitFor, extract, pause
 *
 * Identifiers are the minifier's short names; the export list at the bottom
 * maps each of them to its public name.
 */

/** Default fetcher options (exported as DefaultFetcherProperties). */
var t={engine:"auto",enableSmart:!0,useSiteRegistry:!0,antibot:!1,headers:{},cookies:[],reuseCookies:!0,throwHttpErrors:void 0,proxy:[],blockResources:[],ignoreSslErrors:!0,browser:{engine:"playwright",headless:!0,waitUntil:"domcontentloaded"},http:{method:"GET"},timeoutMs:6e4,requestHandlerTimeoutSecs:void 0,maxConcurrency:1,maxRequestsPerMinute:1e3,delayBetweenRequestsMs:0,retries:0,sites:[]};

/** Every default option key plus "actions" and "onPause" (exported as FetcherOptionKeys). */
var e=Object.keys(t).concat(["actions","onPause"]);

/** Two-way enum: Failed=0, Success=1, Skipped=2 (exported as FetchActionResultStatus). */
var i=(t=>(t[t.Failed=0]="Failed",t[t.Success=1]="Success",t[t.Skipped=2]="Skipped",t))(i||{});

/**
 * FetchAction: base class of all actions and the static registry that maps an
 * action id to its class. Inside the class body `t` is the class itself.
 */
var s=class t{
  /** Registers an action class under its static `id`; throws when the id is missing. */
  static register(t){const e=t.id;if(!e)throw new Error("FetchAction.register: actionClass.id is required");this.registry.set(e,t)}
  /** Returns the registered class for an id, or undefined. */
  static get(t){return this.registry.get(t)}
  /** Instantiates an action from an id string or from an options object carrying `id` or `name`; undefined when unregistered. */
  static create(t){const e="string"==typeof t?t:t.id||t.name;if(!e)throw new Error("Action must have id or name");const i=this.registry.get(e);return i?new i:void 0}
  static has(t){return this.registry.has(t)}
  static list(){return Array.from(this.registry.keys())}
  /** Capability of this action class for an engine mode; "noop" when not declared. */
  static getCapability(t){return this.capabilities[t]??"noop"}
  getCapability(t){return this.constructor.getCapability(t)}
  get id(){return this.constructor.id}
  get returnType(){return this.constructor.returnType}
  get capabilities(){return this.constructor.capabilities}
  /** Calls method `e` on the context's engine with the remaining arguments; throws when there is no engine or no such method. */
  async delegateToEngine(t,e,...i){const s=t.internal.engine;if(!s)throw new Error("No engine available");if("function"!=typeof s[e])throw new Error(`Engine does not have a method named '${String(e)}'`);return await s[e](...i)}
  /**
   * Wires the `collectors` listed in the action options to the context event bus.
   * Returns undefined when there is nothing to install, otherwise
   * `{cleanup, awaitExecPendings}`: `cleanup` removes every listener that was
   * added, `awaitExecPendings` waits for foreground (background === false)
   * collector executions that are still running.
   */
  installCollectors(e,i){
    const s=i?.collectors;
    if(!s?.length)return;
    const r=[],c=new Set;
    for(const i of s){
      const s=n(i.activateOn),l=n(i.collectOn),u=n(i.deactivateOn),h=!(i.background??!0),w=t.create(i);
      if(!w)continue;
      // f: activated, d: deactivated, p: number of finished executions
      let f=!1,d=!1,p=0;
      const y=async t=>{if(!f&&!d){f=!0;try{await(w.onBeforeExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,phase:"before",error:t})}}},
      m=async(t,s)=>{if(!d){f||await y(s);try{const r=Promise.resolve(w.onExecute?.(e,i,s)).then(s=>{var r,n;if(i.storeAs){((r=e.outputs)[n=i.storeAs]||(r[n]=[])).push(s)}return e.eventBus.emit("collector:result",{action:this.id,collector:i.id||i.name,event:t,result:s}),s}).catch(s=>{e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,event:t,phase:"exec",error:s})}).finally(()=>{p++});h&&(c.add(r),r.finally(()=>c.delete(r)))}catch(i){e.eventBus.emit("collector:error",{action:this.id,collector:w.id,event:t,phase:"exec",error:i})}}},
      // Deactivation runs the collector once if it never ran, then tears it down.
      g=async()=>{if(!d){0===p&&m("collector:after"),d=!0;try{await(w.onAfterExec?.(e,i))}catch(t){e.eventBus.emit("collector:error",{action:this.id,collector:i.id||i.name,phase:"after",error:t})}finally{e.eventBus.emit("collector:end",{action:this.id,collector:i.id||i.name}),x.forEach(t=>t())}}},
      v=o(e,s,y),x=a(e,l,m),b=o(e,u,g);
      r.push(...v,...x,...b);
      if(!s.length&&!l.length&&!u.length){
        // No explicit triggers: finish the collector when the owning action ends.
        // The listener is removed from the same event it was registered on
        // (the previous build called off() with a different event name).
        const endEvent=`action:${this.id}.end`,t=()=>{g()};
        e.eventBus.once(endEvent,t),r.push(()=>e.eventBus.off(endEvent,t))
      }
    }
    return r.length||c.size>0?{cleanup:()=>r.forEach(t=>t()),awaitExecPendings:async()=>{c.size>0&&await Promise.allSettled(Array.from(c))}}:void 0
  }
  /** Pushes this action on the context's action stack, emits the start events, runs onBeforeExec and installs collectors. */
  async beforeExec(t,e){t.internal.actionStack||(t.internal.actionStack=[]);const i=t.internal.actionStack,s=i.length,r=i.length>0?i[i.length-1].id:void 0,n={...e,id:this.id,depth:s,parent:r};i.push(n),t.currentAction=n;const o={action:this,context:t,options:e,depth:s,stack:[...i]};t.eventBus.emit(`action:${this.id}.start`,o),t.eventBus.emit("action:start",o),await(this.onBeforeExec?.(t,e));return{entry:o,collectors:this.installCollectors(t,e)}}
  /** Records the result on the context, emits the end events, then always cleans up collectors and pops the action stack. */
  async afterExec(t,e,i,s){const r=t.internal.actionStack,n=r.length-1,o=s?.collectors;try{await(o?.awaitExecPendings()),t.lastResult=i,"response"!==i?.returnType||i.error||(t.lastResponse=i.result),e?.storeAs&&(t.outputs[e.storeAs]=i?.result),i?.error&&(t.currentAction.error=i.error),await(this.onAfterExec?.(t,e));const s={action:this,context:t,options:e,result:i,depth:n,stack:[...r]};i?.error&&(s.error=i.error);try{t.eventBus.emit(`action:${this.id}.end`,s)}catch(t){}try{t.eventBus.emit("action:end",s)}catch(t){}}finally{try{o?.cleanup()}finally{r.pop();const e=r.length;t.currentAction=e>0?r[e-1]:void 0}}}
  /**
   * Runs the action: beforeExec -> onExecute -> afterExec. A raw return value
   * is wrapped as `{status:1, returnType, result}`. On error a `{status:0, error, meta}`
   * result is produced; the error is rethrown only when `failOnError` is truthy.
   */
  async execute(t,e){const i=await this.beforeExec(t,e);let s;try{const i=e?.failOnError??!0;return t.throwHttpErrors=i,s=await this.onExecute(t,e),s&&s.returnType||(s={status:1,returnType:this.returnType??"any",result:s}),s}catch(i){if(s={status:0,error:i,meta:{id:this.id,engineType:t.engine,capability:this.getCapability(t.engine)}},e?.failOnError)throw i;return s}finally{await this.afterExec(t,e,s,i)}}
};
s.registry=new Map,s.returnType="any",s.capabilities={http:"noop",browser:"noop"};
var r=s;

/** Normalizes a value to an array: falsy -> [], array -> itself, anything else -> [value]. */
function n(t){return t?Array.isArray(t)?t:[t]:[]}

/** Subscribes `i` once to each string/RegExp event in `e`; the handler receives the first event argument. Returns the unsubscribe callbacks. */
function o(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=(...t)=>{i(t[0])};t.eventBus.once(r,e),s.push(()=>t.eventBus.off(r,e))}return s}

/** Subscribes `i` to each string/RegExp event in `e`; the handler receives (event, payload). Returns the unsubscribe callbacks. */
function a(t,e,i){const s=[];for(const r of e)if("string"==typeof r||r instanceof RegExp){const e=t=>i(r,t);t.eventBus.on(r,e),s.push(()=>t.eventBus.off(r,e))}return s}

import{EventEmitter as c}from"events-ex";
import{defaultsDeep as l}from"lodash-es";
import{customAlphabet as u}from"nanoid";

/** Session id generator: 12 characters from [0-9a-z]. */
var h=u("0123456789abcdefghijklmnopqrstuvwxyz",12);

import{defaultsDeep as w,merge as f}from"lodash-es";
import{EventEmitter as d}from"events-ex";
import{CommonError as p}from"@isdk/common-error";
import{Configuration as y,RequestQueue as m}from"crawlee";

/** Creates a promise that stays pending until its own `release()` is called. */
function g(){let t=()=>{};const e=new Promise(e=>{t=e});return e.release=t,e}

y.getGlobalConfig().set("persistStorage",!1);

/**
 * FetchEngine: base class of the crawler-backed engines and the static engine
 * registry. Subclasses supply _buildResponse, _querySelectorAll, _extractValue,
 * executeAction, _createCrawler, _getSpecificCrawlerOptions and goto.
 */
var v=class{
  constructor(){this.hdrs={},this.jar=[],this.pendingRequests=new Map,this.requestCounter=0,this.actionEmitter=new d,this.isPageActive=!1,this.navigationLock=function(){const t=g();return t.release(),t}(),this.blockedTypes=new Set}
  /** Registers an engine class under its static `id`; rejects missing and duplicate ids. */
  static register(t){const e=t.id;if(!e)throw new Error("Engine must define static id");if(this.registry.has(e))throw new Error(`Engine id duplicated: ${e}`);this.registry.set(e,t)}
  static get(t){return this.registry.get(t)}
  /** First registered engine class whose static `mode` equals `t`. */
  static getByMode(t){for(const[e,i]of this.registry.entries())if(i.mode===t)return i}
  /** Resolves the engine by id, then by mode, and returns an initialized instance; undefined when nothing matches. */
  static async create(e,i){const s=w(i,e,t),r=s.engine??e.engine,n=r?this.get(r)??this.getByMode(r):null;if(n){const t=new n;return await t.initialize(e,s),t}}
  /** Recursively evaluates an extraction schema ("object", "array" or a value type) against an engine-specific element context. */
  async _extract(t,e){const i=t.type;if(!e)return"array"===i?[]:null;if("object"===i){const{selector:i,properties:s}=t;let r=e;if(i){const t=await this._querySelectorAll(e,i);r=t.length>0?t[0]:null}if(!r)return null;const n={};for(const t in s)n[t]=await this._extract(s[t],r);return n}if("array"===i){const{selector:i,items:s}=t,r=i?await this._querySelectorAll(e,i):[e],n=[];for(const t of r)n.push(await this._extract(s,t));return n}const{selector:s}=t;let r=e;if(s){const t=await this._querySelectorAll(e,s);r=t.length>0?t[0]:null}return r?this._extractValue(t,r):null}
  /** Builds the engine response and adds `contentType` (the content-type header without parameters). */
  async buildResponse(t){const e=await this._buildResponse(t),i=e.headers["content-type"]||"";return e.contentType=i.split(";")[0].trim(),e}
  waitFor(t){return this.dispatchAction({type:"waitFor",options:t})}
  click(t){return this.dispatchAction({type:"click",selector:t})}
  fill(t,e){return this.dispatchAction({type:"fill",selector:t,value:e})}
  submit(t,e){return this.dispatchAction({type:"submit",selector:t,options:e})}
  pause(t){return this.dispatchAction({type:"pause",message:t})}
  extract(t){const e=this._normalizeSchema(t);return this.dispatchAction({type:"extract",schema:e})}
  /**
   * Returns a normalized JSON copy of a schema: array schemas always get an
   * `items` schema (taken from `attribute`, else `{type:"string"}`), and
   * `has` / `exclude` are folded into each comma-separated selector as
   * `:has(...)` / `:not(...)`.
   */
  _normalizeSchema(t){const e=JSON.parse(JSON.stringify(t));if(e.properties)for(const t in e.properties)e.properties[t]=this._normalizeSchema(e.properties[t]);if(e.items&&(e.items=this._normalizeSchema(e.items)),"array"===e.type&&(e.attribute&&!e.items&&(e.items={attribute:e.attribute},delete e.attribute),e.items||(e.items={type:"string"})),e.selector&&(e.has||e.exclude)){const{selector:t,has:i,exclude:s}=e,r=t.split(",").map(t=>{let e=t.trim();return i&&(e=`${e}:has(${i})`),s&&(e=`${e}:not(${s})`),e}).join(", ");e.selector=r,delete e.has,delete e.exclude}return e}
  get id(){return this.constructor.id}
  get mode(){return this.constructor.mode}
  get context(){return this.ctx}
  /** Binds the engine to a context (no-op when already bound), opens a request queue, creates the crawler and starts it in the background. */
  async initialize(t,e){if(this.ctx)return;f(t,e),this.ctx=t,this.opts=t,this.hdrs=function(t){const e={};if(t&&"object"==typeof t)for(const[i,s]of Object.entries(t))e[i.toLowerCase()]=s;return e}(t.headers),this.jar=[...t.cookies??[]],t.internal||(t.internal={}),t.internal.engine=this,t.engine=this.mode,this.actionEmitter.setMaxListeners(100),this.requestQueue=await m.open();const i=await this._getSpecificCrawlerOptions(t),s={...w(i,{requestQueue:this.requestQueue,maxConcurrency:1,minConcurrency:1,useSessionPool:!0,persistCookiesPerSession:!0,sessionPoolOptions:{maxPoolSize:1,persistenceOptions:{enable:!1},sessionOptions:{maxUsageCount:1e3,maxErrorScore:3}}}),requestHandler:this._requestHandler.bind(this),errorHandler:this._failedRequestHandler.bind(this),failedRequestHandler:this._failedRequestHandler.bind(this)};this.crawler=this._createCrawler(s),this.crawler.run().then(()=>{this.isCrawlerReady=!0}).catch(t=>{this.isCrawlerReady=!1,console.error("Crawler background error:",t)})}
  /** Runs the subclass `_cleanup` hook (if any) and the common cleanup, then detaches the engine from its context. */
  async cleanup(){await(this._cleanup?.()),await this._commonCleanup();const t=this.ctx;t&&t.internal?.engine===this&&(t.internal.engine=void 0),this.ctx=void 0,this.opts=void 0}
  /** Keeps the request handler alive, executing each dispatched action against crawling context `t`, until a "dispose" action arrives. */
  async _executePendingActions(t){await new Promise(e=>{const i=async({action:e,resolve:i,reject:s})=>{try{if("dispose"===e.type)return this.actionEmitter.emit("dispose"),void i();i(await this.executeAction(t,e))}catch(t){s(t)}};this.actionEmitter.on("dispatch",i),this.actionEmitter.once("dispose",()=>{this.actionEmitter.removeListener("dispatch",i),e()})})}
  /** Settles the pending goto() for this request (rejecting on status >= 400 when throwHttpErrors is set), then serves actions until disposed. */
  async _sharedRequestHandler(t){try{const{request:e}=t;this.isPageActive=!0;const i=this.pendingRequests.get(e.userData.requestId);if(i){const s=await this._buildResponse(t),r=!s.statusCode||s.statusCode>=400;if(this.ctx?.throwHttpErrors&&r){const t=new p(`Request for ${s.finalUrl} failed with status ${s.statusCode||"N/A"}`,"request",s.statusCode);i.reject(t)}else this.lastResponse=s,i.resolve(s);this.pendingRequests.delete(e.userData.requestId)}await this._executePendingActions(t)}finally{this.isPageActive=!1,this.navigationLock.release()}}
  /** Rejects the pending goto() when throwHttpErrors is set, then continues through the shared request handler. */
  async _sharedFailedRequestHandler(t,e){const{request:i}=t,s=this.pendingRequests.get(i.userData.requestId);if(s&&e&&this.ctx?.throwHttpErrors){this.pendingRequests.delete(i.userData.requestId);const t=e.response,r=t?.statusCode||500,n=t?.url?t.url:i.url,o=new p(`Request${n?" for "+n:""} failed: ${e.message}`,"request",r);s.reject(o)}return this._sharedRequestHandler(t)}
  /** Sends an action to the active request handler; throws when no page is active. */
  async dispatchAction(t){if(!this.isPageActive)throw new Error("No active page. Call goto() before performing actions.");return new Promise((e,i)=>{this.actionEmitter.emit("dispatch",{action:t,resolve:e,reject:i})})}
  async _requestHandler(t){await this._sharedRequestHandler(t)}
  async _failedRequestHandler(t,e){await this._sharedFailedRequestHandler(t,e)}
  /** Disposes the active page, rejects pending requests, removes listeners, tears down the crawler and drops the request queue. */
  async _commonCleanup(){if(this.isPageActive&&await this.dispatchAction({type:"dispose"}).catch(()=>{}),this.pendingRequests.size>0)for(const[,t]of this.pendingRequests)t.reject(new Error("Cleanup:Request cancelled"));if(this.actionEmitter.removeAllListeners(),this.crawler){try{await(this.crawler.teardown?.())}catch(t){console.error("ccrawler teardown error:",t)}this.crawler=void 0}this.isCrawlerReady=void 0,this.requestQueue&&(await this.requestQueue.drop(),this.requestQueue=void 0),this.pendingRequests.clear()}
  /** Adds resource types to the blocked set (clearing it first when `e` is truthy); returns the number of types passed in. */
  async blockResources(t,e){return e&&this.blockedTypes.clear(),t.forEach(t=>this.blockedTypes.add(t)),t.length}
  getContent(){return this.lastResponse?Promise.resolve(this.lastResponse):Promise.reject(new Error("No content fetched yet. Call goto() first."))}
  /** Header accessor: no args -> copy of all; (name) -> value or ""; (object[, replace]) -> merge/replace; (name, string|null) -> set/delete. */
  async headers(t,e){if(void 0===t)return{...this.hdrs};if("string"==typeof t&&void 0===e)return this.hdrs[t.toLowerCase()]||"";if(null!==t&&"object"==typeof t){const i={};for(const[e,s]of Object.entries(t))i[e.toLowerCase()]=String(s);return this.hdrs=!0===e?i:{...this.hdrs,...i},!0}return"string"==typeof t&&("string"==typeof e?this.hdrs[t.toLowerCase()]=e:null===e&&delete this.hdrs[t.toLowerCase()],!0)}
  /** Cookie accessor: array -> replace jar; null -> clear jar; otherwise -> copy of the jar. */
  async cookies(t){return Array.isArray(t)?(this.jar=[...t],!0):null===t?(this.jar=[],!0):[...this.jar]}
  async dispose(){await this.cleanup()}
};

/**
 * Picks and creates the engine for a context: the context's `engine`, else the
 * engine of the matching `sites` entry (exact hostname first, then suffix
 * match, optionally limited by `pathScope`), else "auto"; falls back to "http".
 */
async function x(t,e){const i=function(t,e){if(!t||!e?.length)return null;const i=new URL(t);let s=e.find(t=>t.domain===i.hostname);s||(s=e.find(t=>i.hostname.endsWith(t.domain)));if(!s)return null;if(s.pathScope?.length){if(!s.pathScope.some(t=>i.pathname.startsWith(t)))return null}return s}(e?.url||t.url,t.sites),s=t.engine||i?.engine||"auto";let r=await v.create(t,{engine:s});return r||(r=await v.create(t,{engine:"http"})),r}

v.registry=new Map;

/** FetchSession: owns one context (and lazily one engine) and executes actions against it. */
var b=class{
  constructor(t={}){this.options=t,this.closed=!1,this.id=h(),this.context=this.createContext(t)}
  /** Ensures an engine exists, creates the action by id/name and executes it; `context.currentAction` is cleared afterwards. */
  async execute(t){await this.ensureEngine(t);const e=r.create(t);if(!e)throw new Error(`Unknown action: ${t.id||t.name}`);this.context.internal.actionIndex=(this.context.internal.actionIndex||0)+1,this.context.currentAction={...t,index:this.context.internal.actionIndex,startedAt:Date.now()};try{return await e.execute(this.context,t)}finally{this.context.currentAction=void 0}}
  /** Executes the actions in order, then a final `getContent`; returns `{result, outputs}`. */
  async executeAll(t){for(const i of t)await this.execute(i);const e=await this.execute({id:"getContent"});return{result:e?.result,outputs:this.getOutputs()}}
  getOutputs(){return this.context.outputs}
  /** Disposes the engine once; emits "session:closing" before and "session:closed" after. */
  async dispose(){if(this.closed)return;const t=this.context.eventBus;t.emit("session:closing",{sessionId:this.id});try{await(this.context.internal.engine?.dispose())}finally{this.closed=!0}t.emit("session:closed",{sessionId:this.id})}
  /** Creates the engine on first use, using the action's `params.url` (or the context url) for engine selection. */
  async ensureEngine(t){if(this.closed)throw new Error("Session is closed");if(!this.context.internal.engine){const e=t?.params?.url??this.context.url;if(!await x(this.context,{url:e}))throw new Error("No engine found")}}
  /** Builds the context: the given options plus id, event bus, outputs and helpers, deep-defaulted from the default option table. */
  createContext(e=this.options){const i=new c;return l({...e,id:this.id,eventBus:i,outputs:{},internal:{},execute:async t=>this.execute(t),action:async function(t,e,i){return this.execute({name:t,params:e,...i})}},t)}
};

/** WebFetcher: holds default options and creates sessions. */
var E=class{
  constructor(t={}){this.defaults=t}
  async createSession(t){const e={...this.defaults,...t||{}};return new b(e)}
  /**
   * One-shot fetch. Accepts `(url, options)` or `(options)` with `options.url`.
   * Prepends a `goto` for the url unless the first action already is that goto,
   * runs all actions and always disposes the session.
   */
  async fetch(t,e){
    "string"!=typeof t&&(t=(e=t).url);
    const i=await this.createSession(e);
    try{
      // Work on a copy so the caller's `actions` array is never mutated.
      const s=[...(e?.actions||[])];
      t&&0!==s.findIndex(e=>"goto"===e.id&&e.params?.url===t)&&s.unshift({id:"goto",params:{url:t}});
      return await i.executeAll(s)
    }finally{await i.dispose()}
  }
};

import{CheerioCrawler as C,ProxyConfiguration as k}from"crawlee";
import*as q from"cheerio";
import{CommonError as S,ErrorCode as $,NotFoundError as R}from"@isdk/common-error";

/** CheerioFetchEngine: "http" mode engine; interactions are simulated on the parsed document. */
var P=class extends v{
  /** Builds a response from the crawling context; prefers the serialized Cheerio document over the raw body for html/text. */
  async _buildResponse(t){const{request:e,response:i,body:s,$:r}=t,n=r?.html();let o="string"==typeof s?s:Buffer.isBuffer(s)?s.toString("utf-8"):String(s??"");n&&n!==o&&(o=n);let a=i?.headers;if(!a&&i?.rawHeaders){a={};const t=i.rawHeaders;for(let e=0;e<t.length;e+=2)a[t[e].toLowerCase()]=t[e+1]}return{url:e.url,finalUrl:e.loadedUrl||e.url,statusCode:i?.statusCode??200,statusText:i?.statusMessage,headers:a||{},body:s,html:o,text:o}}
  async _querySelectorAll(t,e){const{$:i,el:s}=t;return s.find(e).toArray().map(t=>({$:i,el:i(t)}))}
  /**
   * Reads an attribute, inner HTML or trimmed text from the element and
   * converts it to the schema type. "number" yields null only when no number
   * can be parsed, so a parsed 0 is returned as 0.
   */
  async _extractValue(t,e){
    const{el:i}=e,{attribute:s,type:r="string"}=t;
    if(0===i.length)return null;
    let n="";
    if(n=s?i.attr(s)??null:"html"===r?i.html():i.text().trim(),null===n)return null;
    switch(r){
      case"number":{const num=parseFloat(n.replace(/[^0-9.-]+/g,""));return Number.isNaN(num)?null:num}
      case"boolean":{const t=n.toLowerCase();return"true"===t||"1"===t}
      default:return n
    }
  }
  /** Executes one dispatched action against the Cheerio crawling context. */
  async executeAction(t,e){
    const{$:i}=t;
    switch(e.type){
      case"dispose":return;
      case"extract":if(!i)throw new S(`Cheerio context not available for action: ${e.type}`,"extract");return this._extract(e.schema,{$:i,el:i.root()});
      // click: follow a link's href, delegate submit-like elements to "submit", or treat an unmatched selector as a URL.
      case"click":{if(!i)throw new S(`Cheerio context not available for action: ${e.type}`,"click");const s=e.selector,r=i(s).first();let n;if(0===r.length)try{n=new URL(s,t.request.loadedUrl||t.request.url).href}catch{throw new S(`click: selector not found or invalid URL: ${s}`,"click")}else{if(!r.is("a")||!r.attr("href")){if(r.is('input[type="submit"], button[type="submit"], button, input')){const e=r.closest("form");if(e.length)return this.executeAction(t,{type:"submit",selector:e});throw new S("click: submit-like element without form","click")}throw new S(`click: unsupported element for http simulate. Selector: ${s}`,"click")}{const e=r.attr("href");n=new URL(e,t.request.loadedUrl||t.request.url).href}}const o=await t.sendRequest({url:n});return void this._updateStateAfterNavigation(t,o)}
      // fill: "fill" is passed to the error constructor; the previous build had it
      // outside the call, so the comma operator made the statement throw the string "fill".
      case"fill":{if(!i)throw new S(`Cheerio context not available for action: ${e.type}`,"fill");const s=i(e.selector).first();if(0===s.length)throw new S(`fill: selector not found: ${e.selector}`);if(!s.is("input, textarea, select"))throw new S(`fill: not a form field: ${e.selector}`);return s.val(e.value),void(this.lastResponse=await this.buildResponse(t))}
      case"waitFor":return void(e.options?.ms&&await new Promise(t=>setTimeout(t,e.options.ms)));
      case"pause":{const s=this.ctx?.onPause;return void(s?(console.info(e.message||"Execution paused for manual intervention."),await s({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."))}
      // submit: collect named form fields and send them as a GET query string or a POST body (urlencoded or JSON).
      case"submit":{if(!i)throw new S(`Cheerio context not available for action: ${e.type}`,"submit");const s="string"==typeof e.selector?i(e.selector).first():null!=e.selector?e.selector:i("form").first();if(0===s.length)throw new R(e.selector,"submit");const r=s.attr("action")||t.request.loadedUrl||t.request.url,n=(s.attr("method")||"GET").toUpperCase(),o=new URL(r,t.request.loadedUrl||t.request.url).href,a={};let c;if(s.find("input, select, textarea").each((t,e)=>{const s=i(e),r=s.attr("name");if(!r)return;const n=s.val();null!=n&&(a[r]=String(n))}),"GET"===n){const e=new URL(o);Object.entries(a).forEach(([t,i])=>e.searchParams.set(t,i)),c=await t.sendRequest({url:e.href,method:"GET"})}else{let i;const r={};"application/json"===(e.options?.enctype||s.attr("enctype")||"application/x-www-form-urlencoded")?(i=JSON.stringify(a),r["Content-Type"]="application/json"):(i=new URLSearchParams(a).toString(),r["Content-Type"]="application/x-www-form-urlencoded"),c=await t.sendRequest({url:o,method:"POST",body:i,headers:r})}return void this._updateStateAfterNavigation(t,c)}
      case"getContent":return this.buildResponse(t);
      default:throw new S(`Unknown action type: ${e.type}`,"CheerioFetchEngine.executeAction",$.NotSupported)
    }
  }
  /** Replaces the context's document, response and body with the result of a simulated navigation and updates `lastResponse`. */
  _updateStateAfterNavigation(t,e){const i=e;let s=i.headers;if(!s&&i.rawHeaders){s={};for(let t=0;t<i.rawHeaders.length;t+=2)s[i.rawHeaders[t].toLowerCase()]=i.rawHeaders[t+1]}s=s||{};const r=i.body,n=q.load(r??"");t.$=n,t.response=i,t.body=r;const o=n.html(),a=n.text(),c=(s["content-type"]||"").split(";")[0].trim();this.lastResponse={url:t.request.url,finalUrl:i.url,statusCode:i.statusCode,statusText:i.statusMessage,headers:s,contentType:c,body:r,html:o,text:a}}
  _createCrawler(t){return new C(t)}
  _getSpecificCrawlerOptions(t){const e=this.opts?.proxy?"string"==typeof this.opts.proxy?[this.opts.proxy]:this.opts.proxy:void 0,i=e?.length?new k({proxyUrls:e}):void 0;return{additionalMimeTypes:["text/plain"],maxRequestRetries:1,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,proxyConfiguration:i,preNavigationHooks:[(e,i)=>{i.throwHttpErrors=t.throwHttpErrors,this.opts?.timeoutMs&&(i.timeout={request:this.opts.timeoutMs})}]}}
  /** Disposes the active page (if any), queues the request and resolves with its response; rejects on timeout or queue failure. */
  async goto(t,e){this.isPageActive&&this.dispatchAction({type:"dispose"}).catch(()=>{});const i="req-"+ ++this.requestCounter,s=new Promise((t,s)=>{const r=e?.timeoutMs||this.opts?.timeoutMs||3e4,n=setTimeout(()=>{this.pendingRequests.delete(i),this.navigationLock.release(),s(new S(`goto timed out after ${r}ms.`,"gotoTimeout",$.RequestTimeout))},r);this.pendingRequests.set(i,{resolve:e=>{clearTimeout(n),t(e)},reject:t=>{clearTimeout(n),s(t)}})});return this.requestQueue.addRequest({...e,url:t,headers:{...this.hdrs,...e?.headers},userData:{requestId:i},uniqueKey:`${t}-${i}`}).catch(t=>{const e=this.pendingRequests.get(i);e&&(this.pendingRequests.delete(i),this.navigationLock.release(),e.reject(t))}),await this.navigationLock,this.navigationLock=g(),s}
};
P.id="cheerio",P.mode="http",v.register(P);

import{PlaywrightCrawler as T}from"crawlee";
import{firefox as A}from"playwright";
import{launchOptions as U}from"camoufox-js";
import{CommonError as _,ErrorCode as j,NotFoundError as O}from"@isdk/common-error";

/** PlaywrightFetchEngine: "browser" mode engine; interactions run on a Playwright page. */
var F=class extends v{
  /** Builds a response from the live page; returns empty body/html/text when the page is missing or closed. */
  async _buildResponse(t){const{page:e,response:i,request:s}=t;if(!e||e.isClosed())return{url:s.url,finalUrl:s.loadedUrl||s.url,statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},body:"",html:"",text:""};const r=await e.content(),n=await e.textContent("body");return{url:e.url(),finalUrl:e.url(),statusCode:i?.status(),statusText:i?.statusText(),headers:await(i?.allHeaders())||{},body:r,html:r,text:n||""}}
  async _querySelectorAll(t,e){return t.locator(e).all()}
  /**
   * Reads an attribute, inner HTML or text content from the locator and
   * converts it to the schema type. "number" yields null only when no number
   * can be parsed, so a parsed 0 is returned as 0.
   */
  async _extractValue(t,e){
    const{attribute:i,type:s="string"}=t;
    if(0===await e.count())return null;
    let r="";
    if(r=i?await e.getAttribute(i):"html"===s?await e.innerHTML():await e.textContent(),null===r)return null;
    switch(r=r.trim(),s){
      case"number":{const num=parseFloat(r.replace(/[^0-9.-]+/g,""));return Number.isNaN(num)?null:num}
      case"boolean":{const t=r.toLowerCase();return"true"===t||"1"===t}
      default:return r
    }
  }
  /** Executes one dispatched action against the Playwright page; `s` is the per-operation timeout in milliseconds. */
  async executeAction(t,e){
    const{page:i}=t,s=this.opts?.timeoutMs||3e4;
    switch(e.type){
      case"navigate":{const s=await i.goto(e.url,{waitUntil:e.opts?.waitUntil||"domcontentloaded",timeout:this.opts?.timeoutMs||3e4});s&&(t={...t,response:s});const r=await this.buildResponse(t);return this.lastResponse=r,r}
      case"extract":return this._extract(e.schema,i.locator("body"));
      case"click":{await i.click(e.selector,{timeout:s}),await i.waitForLoadState("networkidle",{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r)}
      case"fill":{await i.fill(e.selector,e.value,{timeout:s});const r=await this.buildResponse(t);return void(this.lastResponse=r)}
      // waitFor: the conditions are awaited in sequence — selector, then network idle, then the fixed delay.
      case"waitFor":return e.options?.selector&&await i.waitForSelector(e.options.selector,{timeout:s}),e.options?.networkIdle&&await i.waitForLoadState("networkidle",{timeout:s}),void(e.options?.ms&&await i.waitForTimeout(e.options.ms));
      // submit: JSON enctype posts the form data with fetch() inside the page; otherwise the form's own submit() is used.
      case"submit":{const r=e.selector||"form",n=i.locator(r).first();if(0===await n.count())throw new O(r,"submit");if("application/json"===(e.options?.enctype||"application/x-www-form-urlencoded")){const t=await n.elementHandle();if(!t)throw new _(`submit: could not get form handle for ${r}`,"submit");const e=await t.evaluate(async t=>{const e=new FormData(t),i={};e.forEach((t,e)=>{i[e]=t.toString()});const s=await fetch(t.action,{method:t.method,headers:{"Content-Type":"application/json"},body:JSON.stringify(i)}),r=await s.text();return{status:s.status,statusText:s.statusText,headers:Object.fromEntries(s.headers.entries()),body:r,html:r,text:r,url:t.action,finalUrl:s.url}});return await t.dispose(),await i.setContent(e.html),void(this.lastResponse=e)}return await n.evaluate(t=>t.submit()),await i.waitForLoadState("networkidle",{timeout:s}),void(this.lastResponse=await this.buildResponse(t))}
      case"pause":{const t=this.ctx?.onPause;return void(t?(console.info(e.message||"Execution paused for manual intervention."),await t({message:e.message}),console.info("Resuming execution...")):console.warn("[PauseAction] was called, but no `onPause` handler was provided in fetchWeb options. Skipped."))}
      case"getContent":return this.buildResponse(t);
      default:throw new _(`Unknown action type: ${e.type}`,"PlaywrightFetchEngine.executeAction",j.NotSupported)
    }
  }
  _createCrawler(t){return new T(t)}
  /** Crawler options: cookie injection and resource blocking before navigation; with `antibot`, a camoufox-configured Firefox launch and a Cloudflare challenge hook. */
  async _getSpecificCrawlerOptions(t){const e=t.browser?.headless??!0,i={maxRequestRetries:t.retries||3,headless:e,requestHandlerTimeoutSecs:t.requestHandlerTimeoutSecs,preNavigationHooks:[async({page:e,request:i},s)=>{s.throwHttpErrors=t.throwHttpErrors,this.jar.length>0&&await e.context().addCookies(this.jar.map(t=>({...t,url:i.url,domain:t.domain||new URL(i.url).hostname})));const r=this.blockedTypes;r.size>0&&await e.route("**/*",t=>{r.has(t.request().resourceType())?t.abort():t.continue()})}]};if(this.opts?.antibot){i.browserPoolOptions={useFingerprints:!1};const t=await U({headless:e});i.launchContext={launcher:A,launchOptions:t},i.postNavigationHooks=[async({page:t,handleCloudflareChallenge:e})=>{await e()}]}return i}
  /** Navigates the active page in place when there is one; otherwise queues a new request and resolves with its response. */
  async goto(t,e){if(this.isPageActive)return this.dispatchAction({type:"navigate",url:t,opts:e});if(!this.requestQueue)throw new _("RequestQueue not initialized","goto");const i="req-"+ ++this.requestCounter,s=new Promise((t,e)=>{this.pendingRequests.set(i,{resolve:t,reject:e})});return await this.requestQueue.addRequest({url:t,headers:this.hdrs,userData:{requestId:i,waitUntil:e?.waitUntil||"domcontentloaded"},uniqueKey:`${t}-${i}`}),s}
};
F.id="playwright",F.mode="browser",v.register(F);

/** ClickAction ("click"): requires `params.selector`. */
var N=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};if(!i)throw new Error("Selector is required for click action");await this.delegateToEngine(t,"click",i,s)}};
N.id="click",N.returnType="none",N.capabilities={http:"simulate",browser:"native"},r.register(N);

/** FillAction ("fill"): requires `params.selector` and a defined `params.value`. */
var H=class extends r{async onExecute(t,e){const{selector:i,value:s,...r}=e?.params||{};if(!i)throw new Error("Selector is required for fill action");if(void 0===s)throw new Error("Value is required for fill action");await this.delegateToEngine(t,"fill",i,s,r)}};
H.id="fill",H.returnType="none",H.capabilities={http:"simulate",browser:"native"},r.register(H);

/** GetContentAction ("getContent"): returns the engine's current response. */
var L=class extends r{async onExecute(t,e){return await this.delegateToEngine(t,"getContent",e?.params)}};
L.id="getContent",L.returnType="response",L.capabilities={http:"native",browser:"native"},r.register(L);

/** GotoAction ("goto"): navigates to `params.url` (or the context url) and stores it on the context. */
var M=class extends r{async onExecute(t,e,i){const s=e?.params,r=s?.url||t.url;if(!r)throw new Error("URL is required for goto action");const n=t.internal.engine;if(!n)throw new Error("No engine available");t.url=r;return await n.goto(r,s)}};
M.id="goto",M.returnType="response",M.capabilities={http:"native",browser:"native"},r.register(M);

/** SubmitAction ("submit"): `params.selector` is optional; the remaining params are passed as options. */
var z=class extends r{async onExecute(t,e){const{selector:i,...s}=e?.params||{};await this.delegateToEngine(t,"submit",i,s)}};
z.id="submit",z.returnType="none",z.capabilities={http:"simulate",browser:"native"},r.register(z);

/** WaitForAction ("waitFor"): forwards `params` to the engine's waitFor. */
var D=class extends r{async onExecute(t,e){const i=t.internal.engine;if(!i)throw new Error("No engine available");await i.waitFor(e?.params)}};
D.id="waitFor",D.returnType="none",D.capabilities={http:"native",browser:"native"},r.register(D);

/** ExtractAction ("extract"): `params` is the extraction schema and is required. */
var B=class extends r{async onExecute(t,e){const i=e?.params;if(!i)throw new Error("Schema is required for extract action");return this.delegateToEngine(t,"extract",i)}};
B.id="extract",B.returnType="any",B.capabilities={http:"native",browser:"native"},r.register(B);

/** PauseAction ("pause"): browser mode only; when `params.selector` is given, pauses only if extracting it yields a truthy value. */
var G=class extends r{async onExecute(t,e){const{selector:i,message:s,attribute:r}=e?.params||{},n=t.internal.engine;if("browser"===n?.mode){if(i){if(!await(n?.extract({selector:i,attribute:r})))return}n&&"pause"in n?await n.pause(s):console.warn("[PauseAction] was called, but the current engine does not support `pause`. Skipped.")}else console.warn("[PauseAction] can only run in browser engine. Skipped.")}};

/** fetchWeb: one-shot helper that runs `fetch` on a fresh WebFetcher. */
async function I(t,e){return(new E).fetch(t,e)}

G.id="pause",G.capabilities={http:"native",browser:"native"},G.returnType="none",r.register(G);

export{P as CheerioFetchEngine,N as ClickAction,t as DefaultFetcherProperties,B as ExtractAction,r as FetchAction,i as FetchActionResultStatus,v as FetchEngine,b as FetchSession,e as FetcherOptionKeys,H as FillAction,L as GetContentAction,M as GotoAction,G as PauseAction,F as PlaywrightFetchEngine,z as SubmitAction,D as WaitForAction,E as WebFetcher,I as fetchWeb};
|
package/docs/README.md
CHANGED
|
@@ -4,10 +4,17 @@
|
|
|
4
4
|
|
|
5
5
|
# 🕸️ @isdk/web-fetcher
|
|
6
6
|
|
|
7
|
+
[npm version](https://www.npmjs.com/package/@isdk/web-fetcher)
|
|
8
|
+
[npm downloads](https://www.npmjs.com/package/@isdk/web-fetcher)
|
|
9
|
+
[License](https://github.com/isdk/web-fetcher.js/blob/main/LICENSE)
|
|
10
|
+
[Node.js](https://nodejs.org/)
|
|
11
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
12
|
+
[GitHub repository](https://github.com/isdk/web-fetcher.js)
|
|
13
|
+

|
|
14
|
+
|
|
7
15
|
English | [简体中文](_media/README.cn.md)
|
|
8
16
|
|
|
9
|
-
>
|
|
10
|
-
> It features a dual-engine architecture (HTTP and Browser) and a declarative action system, making it perfect for AI agents and complex data scraping tasks.
|
|
17
|
+
> An AI-friendly web automation library that simplifies complex web interactions into a declarative JSON action script. Write your script once and run it in either a fast **`http`** mode for static content or a full **`browser`** mode for dynamic sites. An optional **`antibot`** flag helps bypass detection mechanisms. The library is designed for targeted, task-oriented data extraction (e.g., get X from page Y), not for building whole-site crawlers.
|
|
11
18
|
|
|
12
19
|
---
|
|
13
20
|
|
|
@@ -141,6 +148,7 @@ Here are the essential built-in actions:
|
|
|
141
148
|
* `fill`: Fills an input field with a specified value.
|
|
142
149
|
* `submit`: Submits a form.
|
|
143
150
|
* `waitFor`: Pauses execution to wait for a specific condition (e.g., a timeout, a selector to appear, or network to be idle).
|
|
151
|
+
* `pause`: Pauses execution for manual intervention (e.g., solving a CAPTCHA).
|
|
144
152
|
* `getContent`: Retrieves the full content (HTML, text, etc.) of the current page state.
|
|
145
153
|
* `extract`: Extracts any structured data from the page with ease using an expressive, declarative schema.
|
|
146
154
|
|
|
@@ -94,7 +94,7 @@ Clicks on an element specified by a selector.
|
|
|
94
94
|
|
|
95
95
|
* **`id`**: `click`
|
|
96
96
|
* **`params`**:
|
|
97
|
-
* `selector` (string): A CSS selector
|
|
97
|
+
* `selector` (string): A CSS selector to identify the element to click.
|
|
98
98
|
* **`returns`**: `none`
|
|
99
99
|
|
|
100
100
|
#### `fill`
|
|
@@ -105,7 +105,12 @@ Fills an input field with a specified value.
|
|
|
105
105
|
* **`params`**:
|
|
106
106
|
* `selector` (string): A selector for the input element.
|
|
107
107
|
* `value` (string): The text to fill into the element.
|
|
108
|
-
* **`returns`**: `
|
|
108
|
+
* **`returns`**: `response`
|
|
109
|
+
|
|
110
|
+
> **Note**: The behavior of the returned content differs between engines.
|
|
111
|
+
>
|
|
112
|
+
> * **`cheerio`**: This engine directly manipulates its internal HTML representation, so the returned content will include the filled value.
|
|
113
|
+
> * **`playwright`**: This engine returns the rendered HTML of the page (similar to `document.documentElement.outerHTML`). However, when `page.fill()` updates an input, it changes the input's internal `value` property. This property is not always serialized back to the `value` attribute in the HTML source. As a result, the filled value will **not** be visible in the HTML returned by `page.content()`.
|
|
109
114
|
|
|
110
115
|
#### `submit`
|
|
111
116
|
|
|
@@ -118,10 +123,15 @@ Submits a form.
|
|
|
118
123
|
|
|
119
124
|
#### `waitFor`
|
|
120
125
|
|
|
121
|
-
Pauses execution to wait for
|
|
126
|
+
Pauses execution to wait for one or more conditions to be met.
|
|
127
|
+
|
|
128
|
+
In `browser` mode, if multiple conditions are provided, they are awaited sequentially. For example, it will first wait for the selector to appear, then wait for the network to be idle, and finally wait for the specified duration.
|
|
122
129
|
|
|
123
130
|
* **`id`**: `waitFor`
|
|
124
|
-
* **`params`**: An object specifying the wait condition
|
|
131
|
+
* **`params`**: An object specifying the wait condition, which can contain one or more of the following keys:
|
|
132
|
+
* **`ms`** (number): Waits for the specified number of milliseconds. Supported by both engines.
|
|
133
|
+
* **`selector`** (string): Waits for an element matching the selector to appear on the page. Supported only in `browser` mode.
|
|
134
|
+
* **`networkIdle`** (boolean): Waits until the network is idle (i.e., no new network requests for a period of time). Supported only in `browser` mode.
|
|
125
135
|
* **`returns`**: `none`
|
|
126
136
|
|
|
127
137
|
#### `pause`
|
package/docs/_media/README.cn.md
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
# 🕸️ @isdk/web-fetcher
|
|
2
2
|
|
|
3
|
+
[npm version](https://www.npmjs.com/package/@isdk/web-fetcher)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/@isdk/web-fetcher)
|
|
5
|
+
[License](https://github.com/isdk/web-fetcher.js/blob/main/LICENSE)
|
|
6
|
+
[Node.js](https://nodejs.org/)
|
|
7
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
8
|
+
[GitHub repository](https://github.com/isdk/web-fetcher.js)
|
|
9
|
+

|
|
10
|
+
|
|
3
11
|
[English](./README.md) | 简体中文
|
|
4
12
|
|
|
5
|
-
>
|
|
6
|
-
> 它采用双引擎架构(HTTP 和浏览器)和声明式动作系统,是 AI 代理和复杂数据抓取任务的理想选择。
|
|
13
|
+
> 一个面向AI的网页自动化库,它将复杂的网页交互简化为声明式JSON动作脚本。一次编写,你的脚本即可在快速的 **`http`** 模式(用于静态内容)或完整的 **`browser`** 模式(用于动态站点)下运行。可选的 **`antibot`** 标志有助于绕过检测机制。该库专为有针对性的、面向任务的数据提取而设计(例如,从页面Y获取数据X),而非用于构建全站爬虫。
|
|
7
14
|
|
|
8
15
|
---
|
|
9
16
|
|
|
@@ -137,6 +144,7 @@ searchGoogle('gemini');
|
|
|
137
144
|
* `fill`: 用指定的值填充一个输入字段。
|
|
138
145
|
* `submit`: 提交一个表单。
|
|
139
146
|
* `waitFor`: 暂停执行以等待特定条件(例如,超时、选择器出现或网络空闲)。
|
|
147
|
+
* `pause`: 暂停执行以进行手动干预(例如,解决验证码)。
|
|
140
148
|
* `getContent`: 获取当前页面状态的完整内容(HTML、文本等)。
|
|
141
149
|
* `extract`: 使用富有表现力的声明式 Schema,可轻松提取页面中的任意结构化数据。
|
|
142
150
|
|