@cyia/crawl 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/index.mjs +1 -1
  2. package/package.json +3 -2
package/index.mjs CHANGED
@@ -1,3 +1,3 @@
1
- import re from"puppeteer-core";import{ElementHandle as U}from"puppeteer-core";import{load as M}from"cheerio";import B from"turndown";function h(a,o){let i=M(a,void 0,!0),t=i("body");if(o.cleanContent&&(t.find("script,style,iframe,footer,br,hr,svg,header").remove(),t.find("*").removeAttr("class"),t.find("*").removeAttr("style"),i("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),i("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((n,l)=>l.trim())),o.format==="html")return t.html();if(o.format==="text")return t.text();if(o.format==="markdown"){var r=new B;return r.turndown(t.html())}}import{promise as W}from"fastq";var u=class{page;browser;parent;#e={};constructor(o,i,t){this.page=o,this.browser=i,this.parent=t}ab;timeoutId;setMaxTimeout(o){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},o)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(o,i){this.#e[o]=i}getVariable(o){return this.#e[o]}#t(o,i){let t,r=!1;for(let n=0;n<i.length;n++){let l=i[n];if(l===".."){if(!o.parent)throw new Error("未找到父级");o=o.parent}else if(r){if(!t||typeof t!="object")throw new Error(`${i}路径下未找到值`);t=t[l]}else t=o.#e[l],r=!0}return t}#o(o){if(typeof o=="string")return o;if(o.source==="variable")return typeof o.key=="string"?this.#e[o.key]:this.#t(this,o.key)}async exeQueue(o){let i;for(let t of o)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{i=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{i=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{i=await 
this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let r=t.config;i=await this.page.waitForRequest(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.method&&r.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let r=t.config;i=await this.page.waitForResponse(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.status&&r.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{i=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{i=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}}break}case"selector":{t.multi?i=this.#e[t.output]=await this.page.$$(t.selector):i=this.#e[t.output]=await this.page.$(t.selector);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let r=this.#e[t.input];Array.isArray(r)?t.kind==="property"&&(i=this.#e[t.output]=await Promise.all(r.map(n=>n.getProperty(t.key).then(l=>l.jsonValue())))):r instanceof U&&(i=this.#e[t.output]=r.getProperty(t.key).then(n=>n.jsonValue()));break}case"getContent":{let r=await this.page.content();i=this.#e[t.output]=h(r,{cleanContent:t.cleanContent,format:t.format});break}case"page":{let r=this.#e[t.input],n=Array.isArray(r)?r:[r],l=W(s=>(s.page.setVariable("$item",n[s.index]),s.page.setVariable("$index",s.index),s.page.setVariable("$first",s.index===0),s.page.setVariable("$last",s.index===n.length-1),s.page.exeQueue(t.actions)),t.concurrency),g;l.error(s=>{s&&(g=s)});let y=[];for(let s=0;s<n.length;s++)y.push(await this.browser.openPage(async L=>l.push({page:L,index:s}),this).catch(()=>{}));if(await 
l.drained(),t.throwError&&g)throw g;i=y,t.output&&(this.#e[t.output]=i);break}case"setUserAgent":{await this.page.setUserAgent(t.userAgent);break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")i=await t.fn(this);else{let r=this.browser.getCustom(t.config.type);if(!r)throw new Error(`自定义[${t.config.type}]未实现处理`);i=await r(t.config,this)}break}default:break}return i}};import*as e from"valibot";var G=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
1
+ import re from"puppeteer-core";import{ElementHandle as U}from"puppeteer-core";import{load as M}from"cheerio";import B from"turndown";function h(a,o){let i=M(a,void 0,!0),t=i("body");if(o.cleanContent&&(t.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),t.find("*").removeAttr("class"),t.find("*").removeAttr("style"),i("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),i("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((n,l)=>l.trim())),o.format==="html")return t.html();if(o.format==="text")return t.text();if(o.format==="markdown"){var r=new B;return r.turndown(t.html())}}import{promise as W}from"fastq";var u=class{page;browser;parent;#e={};constructor(o,i,t){this.page=o,this.browser=i,this.parent=t}ab;timeoutId;setMaxTimeout(o){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},o)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(o,i){this.#e[o]=i}getVariable(o){return this.#e[o]}#t(o,i){let t,r=!1;for(let n=0;n<i.length;n++){let l=i[n];if(l===".."){if(!o.parent)throw new Error("未找到父级");o=o.parent}else if(r){if(!t||typeof t!="object")throw new Error(`${i}路径下未找到值`);t=t[l]}else t=o.#e[l],r=!0}return t}#o(o){if(typeof o=="string")return o;if(o.source==="variable")return typeof o.key=="string"?this.#e[o.key]:this.#t(this,o.key)}async exeQueue(o){let i;for(let t of o)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{i=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{i=await 
this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{i=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let r=t.config;i=await this.page.waitForRequest(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.method&&r.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let r=t.config;i=await this.page.waitForResponse(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.status&&r.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{i=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{i=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}}break}case"selector":{t.multi?i=this.#e[t.output]=await this.page.$$(t.selector):i=this.#e[t.output]=await this.page.$(t.selector);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let r=this.#e[t.input];Array.isArray(r)?t.kind==="property"&&(i=this.#e[t.output]=await Promise.all(r.map(n=>n.getProperty(t.key).then(l=>l.jsonValue())))):r instanceof U&&(i=this.#e[t.output]=r.getProperty(t.key).then(n=>n.jsonValue()));break}case"getContent":{let r=await this.page.content();i=this.#e[t.output]=h(r,{cleanContent:t.cleanContent,format:t.format});break}case"page":{let 
r=this.#e[t.input],n=Array.isArray(r)?r:[r],l=W(s=>(s.page.setVariable("$item",n[s.index]),s.page.setVariable("$index",s.index),s.page.setVariable("$first",s.index===0),s.page.setVariable("$last",s.index===n.length-1),s.page.exeQueue(t.actions)),t.concurrency),g;l.error(s=>{s&&(g=s)});let y=[];for(let s=0;s<n.length;s++)y.push(await this.browser.openPage(async L=>l.push({page:L,index:s}),this).catch(()=>{}));if(await l.drained(),t.throwError&&g)throw g;i=y,t.output&&(this.#e[t.output]=i);break}case"setUserAgent":{await this.page.setUserAgent(t.userAgent);break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")i=await t.fn(this);else{let r=this.browser.getCustom(t.config.type);if(!r)throw new Error(`自定义[${t.config.type}]未实现处理`);i=await r(t.config,this)}break}default:break}return i}};import*as e from"valibot";var G=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
2
  `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),f=e.optional(e.boolean()),$=e.optional(e.string());var H=c,q=e.pipe(e.string(),e.transform(a=>new RegExp(a))),Q=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,o])=>new 
RegExp(a,o))),b=e.union([q,Q]),p=e.string(),v=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),d=e.object({timeout:H,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:v,type:e.literal("goto")}),w=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),k=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),J=e.object({selector:p,visible:f,hidden:f}),Y=e.object({mode:e.literal("selector"),...J.entries}),Z=e.object({mode:e.literal("request"),urlRegexp:b,method:$}),z=e.object({mode:e.literal("response"),urlRegexp:e.optional(b),status:c}),X=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),_=e.object({mode:e.literal("navigation")}),x=e.object({type:e.literal("wait"),config:e.variant("mode",[Y,Z,z,X,_])}),C=e.object({type:e.literal("click"),selector:p,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),A=e.object({type:e.literal("type"),selector:p,text:v,delay:c}),P=e.object({type:e.literal("keypress"),key:G,delay:c}),T=e.object({type:e.literal("selector"),selector:p,output:e.string(),multi:e.optional(e.boolean(),!1)}),E=e.object({type:e.literal("findData"),input:e.string(),output:e.string(),kind:e.picklist(["property"]),key:e.optional(e.string())}),D=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:f,output:e.string()}),I=e.object({type:e.literal("close")}),F=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),ee=e.object({type:e.literal("page"),input:e.string(),output:e.optional(e.string()),concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(S))}),te=[...[d,w,k,x,C,A,P,T,E,D,I,F].map(a=>a.entrie
s.type.literal),"page"],S=e.fallback(e.union([d,w,k,x,C,A,P,T,E,D,ee,I,F]),a=>{if(te.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),R=e.array(S),he=e.object({maxTimeout:c,actionTimeout:c});import*as j from"valibot";import{Browser as ne,computeExecutablePath as ae}from"@puppeteer/browsers";import*as O from"fs";import{Browser as N,computeExecutablePath as oe,install as ie}from"@puppeteer/browsers";async function V(a){let o=await ie({browser:N.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function we(a,o){return oe({cacheDir:a,browser:N.CHROME,buildId:o})}import{PUPPETEER_REVISIONS as se}from"puppeteer-core/internal/revisions.js";async function Re(a){return m.init(a)}var K=se.chrome,m=class a{browser;static async init(o){let i=ne.CHROME,t=ae({cacheDir:o.cacheDir,browser:i,buildId:K});O.existsSync(t)||(console.log("准备下载"),await V({cacheDir:o.cacheDir,buildId:K,browser:i}));let r=await re.launch({...o,executablePath:t});return new a(r)}constructor(o){this.browser=o}#e;#t=new Map;setConfig(o){this.#e=o}getConfig(){return this.#e}registerCustom(o,i){this.#t.set(o,i)}clearCustom(){this.#t.clear()}getCustom(o){return this.#t.get(o)}async openPage(o,i){let t=new u(await this.browser.newPage(),this,i);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),o(t)}runQueue(o,i){let t=j.safeParse(R,o);if(!t.success)throw new Error(`解析配置错误
3
3
  ${JSON.stringify(t.issues)}`);return this.openPage(async r=>{if(i)for(let n in i)r.setVariable(n,i[n]);return r.exeQueue(t.output)})}};export{S as ActionDefine,R as ActionListDefine,he as GlobalConfig,m as WebBrowser,u as WebPage,V as download,h as format,we as getExecutablePath,Re as init};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.6",
3
+ "version": "0.0.8",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {
@@ -8,7 +8,8 @@
8
8
  "fastq": "1.19.1",
9
9
  "html-entities": "2.6.0",
10
10
  "puppeteer-core": "24.6.0",
11
- "valibot": "1.0.0"
11
+ "valibot": "1.0.0",
12
+ "turndown": "^7.2.0"
12
13
  },
13
14
  "exports": {
14
15
  ".": {