@cyia/crawl 0.0.19 → 0.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/index.mjs +3 -3
  2. package/init.d.ts +1 -1
  3. package/package.json +1 -1
package/index.mjs CHANGED
@@ -1,3 +1,3 @@
1
- import xe from"puppeteer-core";import{ElementHandle as oe}from"puppeteer-core";import{load as C}from"cheerio";import w from"turndown";import{Readability as A}from"@mozilla/readability";import{JSDOM as P}from"jsdom";function T(a,o){if(o.cleanContent){let t=new P(a),i=new A(t.window.document).parse();if(i)switch(o.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var r=new w;return r.turndown(i.content)}default:throw""}let n=C(a,void 0,!0),s=n("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),n("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),n("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,m)=>m.trim()),o.format==="html")return s.html();if(o.format==="text")return s.text();if(o.format==="markdown"){var r=new w;return r.turndown(s.html())}}else{if(o.format==="html")return a;if(o.format==="text")return C(a,void 0,!0)("body").text();if(o.format==="markdown"){var r=new w;return r.turndown(a)}}}function I(a){let o=new P(a);return new A(o.window.document,{charThreshold:100}).parse()}import{promise as ie}from"fastq";var d=class{page;browser;parent;#e={};constructor(o,r,t){this.page=o,this.browser=r,this.parent=t}ab;timeoutId;setMaxTimeout(o){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},o)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(o,r){this.#e[o]=r}getVariable(o){return this.#e[o]}#r(o,r){let t,i=!1;for(let n=0;n<r.length;n++){let s=r[n];if(s===".."){if(!o.parent)throw new Error("未找到父级");o=o.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${r}路径下未找到值`);t=t[s]}else t=o.#e[s],i=!0}return t}#o(o){if(typeof o=="string")return o;if(o.source==="variable")return typeof o.key=="string"?this.#e[o.key]:this.#r(this,o.key)}#t(o,r){if(r)if(typeof r=="string")this.#e[r]=o;else switch(r.method){case"push":{if(!Array.isArray(this.#e[r.key]))throw new Error(`${r.key}不是数组类型`);this.#e[r.key]||=[],this.#e[r.key].push(o);break}case"flat-push":{if(!Array.isArray(this.#e[r.key]))throw new Error(`${r.key}不是数组类型`);if(!Array.isArray(o))throw new Error(`${JSON.stringify(o)}不是数组类型`);this.#e[r.key]||=[],this.#e[r.key].push(...o);break}case"define":{this.#e[r.key]=o;break}case"merge":{if(typeof this.#e[r.key]!="object")throw new Error(`${r.key}不是对象类型`);this.#e[r.key]||={},this.#e[r.key]={...this.#e[r.key],...o};break}default:break}}async exeQueue(o){let r;for(let t of o)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{r=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{r=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{r=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;r=await this.page.waitForRequest(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.method&&i.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;r=await this.page.waitForResponse(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.status&&i.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{r=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{r=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;r=await this.page.waitForFunction(n=>document.body.querySelectorAll("*").length>=n,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}case"delay":r=await this.page.waitForFunction(i=>new Promise(n=>{setTimeout(()=>{n(void 0)},i)}),{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},t.config.value)}break}case"selector":{t.multi?this.#t(r=await this.page.$$(t.selector),t.output):this.#t(r=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(r=await Promise.all(i.map(n=>n.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof oe&&this.#t(r=i.getProperty(t.key).then(n=>n.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(r=T(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(r=i,t.output);break}case"page":{let i=this.#e[t.input],n=Array.isArray(i)?i:[i],s=ie(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",n[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===n.length-1),{result:await f.exeQueue(t.actions),page:f}),this);m.push(y)}catch(y){if(t.throwError)throw y;m.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let m=[];for(let l=0;l<n.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;r=m,this.#t(r,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")r=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);r=await i(t.config,this)}break}case"evaluate":{this.#t(r=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{r=this.#e[t.input];break}default:break}return r}dispose(){return this.page.close()}};import*as e from"valibot";var ne=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
- `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),v=e.optional(e.boolean()),ae=e.optional(e.string());var se=c,le=e.pipe(e.string(),e.transform(a=>new RegExp(a))),ce=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,o])=>new RegExp(a,o))),E=e.union([le,ce]),b=e.string(),D=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),O=e.object({timeout:se,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:D,type:e.literal("goto")}),R=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),S=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ue=e.object({selector:b,visible:v,hidden:v}),pe=e.object({mode:e.literal("selector"),...ue.entries}),fe=e.object({mode:e.literal("request"),urlRegexp:E,method:ae}),me=e.object({mode:e.literal("response"),urlRegexp:e.optional(E),status:c}),ge=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),he=e.object({mode:e.literal("waitBodyElements"),threshold:c}),ye=e.object({mode:e.literal("navigation")}),de=e.object({mode:e.literal("delay"),value:e.number()}),j=e.object({type:e.literal("wait"),config:e.variant("mode",[pe,fe,me,ge,ye,he,de])}),F=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),L=e.object({type:e.literal("type"),selector:b,text:D,delay:c}),U=e.object({type:e.literal("keypress"),key:ne,delay:c}),V=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),M=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),K=e.object({type:e.literal("rawContent"),output:h}),B=e.object({type:e.literal("evaluate"),fn:e.custom(a=>typeof a=="function"),args:e.optional(e.array(e.any())),output:h}),$=e.object({type:e.literal("close")}),W=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),G=e.object({type:e.literal("read-variable"),input:e.string()}),be=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(q))}),we=[...[O,R,S,j,F,L,U,V,M,N,K,$,W,G,B].map(a=>a.entries.type.literal),"page"],q=e.fallback(e.union([O,R,S,j,F,L,U,V,M,N,K,be,$,W,G,B]),a=>{if(we.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),H=e.array(q),Ke=e.object({maxTimeout:c,actionTimeout:c});import*as Y from"valibot";import{Browser as Ce,computeExecutablePath as Ae}from"@puppeteer/browsers";import*as Z from"fs";import{Browser as Q,computeExecutablePath as ve,install as ke}from"@puppeteer/browsers";async function J(a){let o=await ke({browser:Q.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function Ge(a,o){return ve({cacheDir:a,browser:Q.CHROME,buildId:o})}import{PUPPETEER_REVISIONS as Pe}from"puppeteer-core/internal/revisions.js";async function z(a){return k.init(a)}var _=Pe.chrome,k=class a{browser;static async init(o){let r=Ce.CHROME,t=Ae({cacheDir:o.cacheDir,browser:r,buildId:_});Z.existsSync(t)||(console.log("准备下载"),await J({cacheDir:o.cacheDir,buildId:_,browser:r}));let i=await xe.launch({...o,executablePath:t});return new a(i)}constructor(o){this.browser=o}#e;#r=new Map;setConfig(o){this.#e=o}getConfig(){return this.#e}registerCustom(o,r){this.#r.set(o,r)}clearCustom(){this.#r.clear()}getCustom(o){return this.#r.get(o)}async openPage(o,r){let t=new d(await this.browser.newPage(),this,r);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),o(t)}runQueue(o,r){let t=Y.safeParse(H,o);if(!t.success)throw new Error(`解析配置错误
3
- ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(r)for(let n in r)i.setVariable(n,r[n]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as Te}from"cheerio";function X(a,o){let r=Te(o,{baseURI:a}),t=r("a").map((i,n)=>r(n).attr("href")?r(n).prop("href"):"").get().filter(Boolean);return r("a").each((i,n)=>{r(n).attr("href")&&r(n).attr("href",r(n).prop("href"))}),{links:t,content:r.html()}}function ee(a){let o=new URL(a);return o.hash="",o.toString()}import{URL as Ie}from"url";function te(a){return new Ie(a).hash.startsWith("#/")}import{Subject as Ee}from"rxjs";import De from"turndown";var re=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ee;constructor(o){this.config=o}async start(o,r){return this.browser=await z({...o,cacheDir:process.cwd()}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4,...r}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(o,r){let t=await this.browser.runQueue([...await this.config.queueList(o),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",o);let i=t.page.getVariable("href"),n=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=I(s);this.#e.add(o),this.#e.add(i);let{links:m,content:l}=X(n,s);if(!await this.config.skipParseContent?.(o)){let u;if(p){var f=new De;u=f.turndown(p.content)}let g={requestUrl:o,parsedUrl:i,parentUrl:r?.from,metadata:p,raw:l,markdown:u};if(await t.page.dispose(),this.dataMap.set(o,g),this.data$.next(g),!p)return}let x=[];for(let u of m)await this.config.filterLink(u)&&x.push(u);for(let u of x){let g=u;if((!this.config.hashMode||!te(u))&&(g=ee(u)),this.#e.has(g)){console.log("已索引,跳过",u);continue}await this.searchWebOne(g,{from:o})}}};export{q as ActionDefine,H as ActionListDefine,re as FullWebRequest,Ke as GlobalConfig,k as WebBrowser,d as WebPage,J as download,T as format,I as formatDoc,Ge as getExecutablePath,z as init};
1
+ import xe from"puppeteer-core";import{ElementHandle as oe}from"puppeteer-core";import{load as C}from"cheerio";import w from"turndown";import{Readability as A}from"@mozilla/readability";import{JSDOM as P}from"jsdom";function T(a,r){if(r.cleanContent){let t=new P(a),i=new A(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new w;return o.turndown(i.content)}default:throw""}let n=C(a,void 0,!0),s=n("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),n("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),n("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,m)=>m.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new w;return o.turndown(s.html())}}else{if(r.format==="html")return a;if(r.format==="text")return C(a,void 0,!0)("body").text();if(r.format==="markdown"){var o=new w;return o.turndown(a)}}}function I(a){let r=new P(a);return new A(r.window.document,{charThreshold:100}).parse()}import{promise as ie}from"fastq";var d=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let n=0;n<o.length;n++){let s=o[n];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.method&&i.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.status&&i.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(n=>document.body.querySelectorAll("*").length>=n,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}case"delay":{let i=t.config;await new Promise(n=>{setTimeout(()=>{n(void 0)},i.value)})}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(n=>n.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof oe&&this.#t(o=i.getProperty(t.key).then(n=>n.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=T(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],n=Array.isArray(i)?i:[i],s=ie(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",n[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===n.length-1),{result:await f.exeQueue(t.actions),page:f}),this);m.push(y)}catch(y){if(t.throwError)throw y;m.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let m=[];for(let l=0;l<n.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=m,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ne=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
+ `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),u=e.optional(e.number()),v=e.optional(e.boolean()),ae=e.optional(e.string());var se=u,le=e.pipe(e.string(),e.transform(a=>new RegExp(a))),ce=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,r])=>new RegExp(a,r))),E=e.union([le,ce]),b=e.string(),D=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),O=e.object({timeout:se,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:D,type:e.literal("goto")}),R=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),S=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ue=e.object({selector:b,visible:v,hidden:v}),pe=e.object({mode:e.literal("selector"),...ue.entries}),fe=e.object({mode:e.literal("request"),urlRegexp:E,method:ae}),me=e.object({mode:e.literal("response"),urlRegexp:e.optional(E),status:u}),ge=e.object({mode:e.literal("networkIdle"),idleTime:u,concurrency:u}),he=e.object({mode:e.literal("waitBodyElements"),threshold:u}),ye=e.object({mode:e.literal("navigation")}),de=e.object({mode:e.literal("delay"),value:e.number()}),j=e.object({type:e.literal("wait"),config:e.variant("mode",[pe,fe,me,ge,ye,he,de])}),L=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:u,count:u}),U=e.object({type:e.literal("type"),selector:b,text:D,delay:u}),F=e.object({type:e.literal("keypress"),key:ne,delay:u}),V=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),M=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),K=e.object({type:e.literal("rawContent"),output:h}),B=e.object({type:e.literal("evaluate"),fn:e.custom(a=>typeof a=="function"),args:e.optional(e.array(e.any())),output:h}),$=e.object({type:e.literal("close")}),W=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),G=e.object({type:e.literal("read-variable"),input:e.string()}),be=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(q))}),we=[...[O,R,S,j,L,U,F,V,M,N,K,$,W,G,B].map(a=>a.entries.type.literal),"page"],q=e.fallback(e.union([O,R,S,j,L,U,F,V,M,N,K,be,$,W,G,B]),a=>{if(we.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),H=e.array(q),Ke=e.object({maxTimeout:u,actionTimeout:u});import*as Y from"valibot";import{Browser as Ce,computeExecutablePath as Ae}from"@puppeteer/browsers";import*as Z from"fs";import{Browser as Q,computeExecutablePath as ve,install as ke}from"@puppeteer/browsers";async function J(a){let r=await ke({browser:Q.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function Ge(a,r){return ve({cacheDir:a,browser:Q.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as Pe}from"puppeteer-core/internal/revisions.js";async function z(a){return k.init(a)}var _=Pe.chrome,k=class a{browser;static async init(r){let o=Ce.CHROME,t=Ae({cacheDir:r.cacheDir,browser:o,buildId:_});Z.existsSync(t)||(console.log("准备下载"),await J({cacheDir:r.cacheDir,buildId:_,browser:o}));let i=await xe.launch({...r,executablePath:t});return new a(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new d(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=Y.safeParse(H,r);if(!t.success)throw new Error(`解析配置错误
3
+ ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let n in o)i.setVariable(n,o[n]);return{result:i.exeQueue(t.output),page:i}})}};import{load as Te}from"cheerio";function X(a,r){let o=Te(r,{baseURI:a}),t=o("a").map((i,n)=>o(n).attr("href")?o(n).prop("href"):"").get().filter(Boolean);return o("a").each((i,n)=>{o(n).attr("href")&&o(n).attr("href",o(n).prop("href"))}),{links:t,content:o.html()}}function ee(a){let r=new URL(a);return r.hash="",r.toString()}import{URL as Ie}from"url";function te(a){return new Ie(a).hash.startsWith("#/")}import{Subject as Ee}from"rxjs";import De from"turndown";var re=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ee;constructor(r){this.config=r}async start(r,o){return this.browser=await z({...r,cacheDir:process.cwd()}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4,...o}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([...await this.config.queueList(r),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);try{console.log("解析完成",r),await t.result}catch(c){console.log("解析失败",r),console.error(c),await t.page.dispose();return}let i=t.page.getVariable("href"),n=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=I(s);this.#e.add(r),this.#e.add(i);let{links:m,content:l}=X(n,s);if(!await this.config.skipParseContent?.(r)){let c;if(p){var f=new De;c=f.turndown(p.content)}let g={requestUrl:r,parsedUrl:i,parentUrl:o?.from,metadata:p,raw:l,markdown:c};if(await t.page.dispose(),this.dataMap.set(r,g),this.data$.next(g),!p)return}let x=[];for(let c of m)await this.config.filterLink(c)&&x.push(c);for(let c of x){let g=c;if((!this.config.hashMode||!te(c))&&(g=ee(c)),this.#e.has(g)){console.log("已索引,跳过",c);continue}await this.searchWebOne(g,{from:r})}}};export{q as ActionDefine,H as ActionListDefine,re as FullWebRequest,Ke as GlobalConfig,k as WebBrowser,d as WebPage,J as download,T as format,I as formatDoc,Ge as getExecutablePath,z as init};
package/init.d.ts CHANGED
@@ -21,7 +21,7 @@ export declare class WebBrowser {
21
21
  getCustom(key: string): PluginFn | undefined;
22
22
  openPage<T>(fn: (page: WebPage) => Promise<T>, parent?: WebPage): Promise<T>;
23
23
  runQueue(list: QueueList, input?: Record<string, any>): Promise<{
24
- result: any;
24
+ result: Promise<any>;
25
25
  page: WebPage;
26
26
  }>;
27
27
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.19",
3
+ "version": "0.0.21",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {