@cyia/crawl 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/define.d.ts +4 -4
- package/format.d.ts +4 -1
- package/index.d.ts +1 -0
- package/index.mjs +3 -3
- package/package.json +1 -1
package/define.d.ts
CHANGED
|
@@ -78,7 +78,7 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
|
|
|
78
78
|
readonly key: v.OptionalSchema<v.StringSchema<undefined>, undefined>;
|
|
79
79
|
}, undefined>, v.ObjectSchema<{
|
|
80
80
|
readonly type: v.LiteralSchema<"getContent", undefined>;
|
|
81
|
-
readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text"], undefined>, "html">;
|
|
81
|
+
readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text", "markdown"], undefined>, "html">;
|
|
82
82
|
readonly cleanContent: v.OptionalSchema<v.BooleanSchema<undefined>, undefined>;
|
|
83
83
|
readonly output: v.StringSchema<undefined>;
|
|
84
84
|
}, undefined>, v.GenericSchema<{
|
|
@@ -182,7 +182,7 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
|
|
|
182
182
|
key?: string | undefined;
|
|
183
183
|
} | {
|
|
184
184
|
type: "getContent";
|
|
185
|
-
format: "text" | "html";
|
|
185
|
+
format: "html" | "text" | "markdown";
|
|
186
186
|
cleanContent?: boolean | undefined;
|
|
187
187
|
output: string;
|
|
188
188
|
} | {
|
|
@@ -279,7 +279,7 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
|
|
|
279
279
|
readonly key: v.OptionalSchema<v.StringSchema<undefined>, undefined>;
|
|
280
280
|
}, undefined>, v.ObjectSchema<{
|
|
281
281
|
readonly type: v.LiteralSchema<"getContent", undefined>;
|
|
282
|
-
readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text"], undefined>, "html">;
|
|
282
|
+
readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text", "markdown"], undefined>, "html">;
|
|
283
283
|
readonly cleanContent: v.OptionalSchema<v.BooleanSchema<undefined>, undefined>;
|
|
284
284
|
readonly output: v.StringSchema<undefined>;
|
|
285
285
|
}, undefined>, v.GenericSchema<{
|
|
@@ -383,7 +383,7 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
|
|
|
383
383
|
key?: string | undefined;
|
|
384
384
|
} | {
|
|
385
385
|
type: "getContent";
|
|
386
|
-
format: "text" | "html";
|
|
386
|
+
format: "html" | "text" | "markdown";
|
|
387
387
|
cleanContent?: boolean | undefined;
|
|
388
388
|
output: string;
|
|
389
389
|
} | {
|
package/format.d.ts
CHANGED
package/index.d.ts
CHANGED
package/index.mjs
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import
|
|
2
|
-
`,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),f=e.optional(e.boolean())
|
|
3
|
-
${JSON.stringify(t.issues)}`);return this.openPage(async r=>{if(i)for(let n in i)r.setVariable(n,i[n]);return r.exeQueue(t.output)})}};export{S as ActionDefine,R as ActionListDefine,
|
|
1
|
+
import re from"puppeteer-core";import{ElementHandle as U}from"puppeteer-core";import{load as M}from"cheerio";import B from"turndown";function h(a,o){let i=M(a,void 0,!0),t=i("body");if(o.cleanContent&&(t.find("script,style,iframe,footer,br,hr,svg,header").remove(),t.find("*").removeAttr("class"),t.find("*").removeAttr("style"),i("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),i("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((n,l)=>l.trim())),o.format==="html")return t.html();if(o.format==="text")return t.text();if(o.format==="markdown"){var r=new B;return r.turndown(t.html())}}import{promise as W}from"fastq";var u=class{page;browser;parent;#e={};constructor(o,i,t){this.page=o,this.browser=i,this.parent=t}ab;timeoutId;setMaxTimeout(o){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},o)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(o,i){this.#e[o]=i}getVariable(o){return this.#e[o]}#t(o,i){let t,r=!1;for(let n=0;n<i.length;n++){let l=i[n];if(l===".."){if(!o.parent)throw new Error("未找到父级");o=o.parent}else if(r){if(!t||typeof t!="object")throw new Error(`${i}路径下未找到值`);t=t[l]}else t=o.#e[l],r=!0}return t}#o(o){if(typeof o=="string")return o;if(o.source==="variable")return typeof o.key=="string"?this.#e[o.key]:this.#t(this,o.key)}async exeQueue(o){let i;for(let t of o)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{i=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{i=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{i=await 
this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let r=t.config;i=await this.page.waitForRequest(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.method&&r.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let r=t.config;i=await this.page.waitForResponse(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.status&&r.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{i=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{i=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}}break}case"selector":{t.multi?i=this.#e[t.output]=await this.page.$$(t.selector):i=this.#e[t.output]=await this.page.$(t.selector);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let r=this.#e[t.input];Array.isArray(r)?t.kind==="property"&&(i=this.#e[t.output]=await Promise.all(r.map(n=>n.getProperty(t.key).then(l=>l.jsonValue())))):r instanceof U&&(i=this.#e[t.output]=r.getProperty(t.key).then(n=>n.jsonValue()));break}case"getContent":{let r=await this.page.content();i=this.#e[t.output]=h(r,{cleanContent:t.cleanContent,format:t.format});break}case"page":{let r=this.#e[t.input],n=Array.isArray(r)?r:[r],l=W(s=>(s.page.setVariable("$item",n[s.index]),s.page.setVariable("$index",s.index),s.page.setVariable("$first",s.index===0),s.page.setVariable("$last",s.index===n.length-1),s.page.exeQueue(t.actions)),t.concurrency),g;l.error(s=>{s&&(g=s)});let y=[];for(let s=0;s<n.length;s++)y.push(await this.browser.openPage(async L=>l.push({page:L,index:s}),this).catch(()=>{}));if(await 
l.drained(),t.throwError&&g)throw g;i=y,t.output&&(this.#e[t.output]=i);break}case"setUserAgent":{await this.page.setUserAgent(t.userAgent);break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")i=await t.fn(this);else{let r=this.browser.getCustom(t.config.type);if(!r)throw new Error(`自定义[${t.config.type}]未实现处理`);i=await r(t.config,this)}break}default:break}return i}};import*as e from"valibot";var G=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
|
|
2
|
+
`,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),f=e.optional(e.boolean()),$=e.optional(e.string());var H=c,q=e.pipe(e.string(),e.transform(a=>new RegExp(a))),Q=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,o])=>new 
RegExp(a,o))),b=e.union([q,Q]),p=e.string(),v=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),d=e.object({timeout:H,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:v,type:e.literal("goto")}),w=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),k=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),J=e.object({selector:p,visible:f,hidden:f}),Y=e.object({mode:e.literal("selector"),...J.entries}),Z=e.object({mode:e.literal("request"),urlRegexp:b,method:$}),z=e.object({mode:e.literal("response"),urlRegexp:e.optional(b),status:c}),X=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),_=e.object({mode:e.literal("navigation")}),x=e.object({type:e.literal("wait"),config:e.variant("mode",[Y,Z,z,X,_])}),C=e.object({type:e.literal("click"),selector:p,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),A=e.object({type:e.literal("type"),selector:p,text:v,delay:c}),P=e.object({type:e.literal("keypress"),key:G,delay:c}),T=e.object({type:e.literal("selector"),selector:p,output:e.string(),multi:e.optional(e.boolean(),!1)}),E=e.object({type:e.literal("findData"),input:e.string(),output:e.string(),kind:e.picklist(["property"]),key:e.optional(e.string())}),D=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:f,output:e.string()}),I=e.object({type:e.literal("close")}),F=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),ee=e.object({type:e.literal("page"),input:e.string(),output:e.optional(e.string()),concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(S))}),te=[...[d,w,k,x,C,A,P,T,E,D,I,F].map(a=>a.entrie
s.type.literal),"page"],S=e.fallback(e.union([d,w,k,x,C,A,P,T,E,D,ee,I,F]),a=>{if(te.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),R=e.array(S),he=e.object({maxTimeout:c,actionTimeout:c});import*as j from"valibot";import{Browser as ne,computeExecutablePath as ae}from"@puppeteer/browsers";import*as O from"fs";import{Browser as N,computeExecutablePath as oe,install as ie}from"@puppeteer/browsers";async function V(a){let o=await ie({browser:N.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function we(a,o){return oe({cacheDir:a,browser:N.CHROME,buildId:o})}import{PUPPETEER_REVISIONS as se}from"puppeteer-core/internal/revisions.js";async function Re(a){return m.init(a)}var K=se.chrome,m=class a{browser;static async init(o){let i=ne.CHROME,t=ae({cacheDir:o.cacheDir,browser:i,buildId:K});O.existsSync(t)||(console.log("准备下载"),await V({cacheDir:o.cacheDir,buildId:K,browser:i}));let r=await re.launch({...o,executablePath:t});return new a(r)}constructor(o){this.browser=o}#e;#t=new Map;setConfig(o){this.#e=o}getConfig(){return this.#e}registerCustom(o,i){this.#t.set(o,i)}clearCustom(){this.#t.clear()}getCustom(o){return this.#t.get(o)}async openPage(o,i){let t=new u(await this.browser.newPage(),this,i);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),o(t)}runQueue(o,i){let t=j.safeParse(R,o);if(!t.success)throw new Error(`解析配置错误
|
|
3
|
+
${JSON.stringify(t.issues)}`);return this.openPage(async r=>{if(i)for(let n in i)r.setVariable(n,i[n]);return r.exeQueue(t.output)})}};export{S as ActionDefine,R as ActionListDefine,he as GlobalConfig,m as WebBrowser,u as WebPage,V as download,h as format,we as getExecutablePath,Re as init};
|