@cyia/crawl 0.0.18 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/define.d.ts CHANGED
@@ -52,6 +52,9 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
52
52
  }, undefined>, v.ObjectSchema<{
53
53
  readonly mode: v.LiteralSchema<"waitBodyElements", undefined>;
54
54
  readonly threshold: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
55
+ }, undefined>, v.ObjectSchema<{
56
+ readonly mode: v.LiteralSchema<"delay", undefined>;
57
+ readonly value: v.NumberSchema<undefined>;
55
58
  }, undefined>], undefined>;
56
59
  }, undefined>, v.ObjectSchema<{
57
60
  readonly type: v.LiteralSchema<"click", undefined>;
@@ -179,6 +182,9 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
179
182
  } | {
180
183
  mode: "waitBodyElements";
181
184
  threshold?: number | undefined;
185
+ } | {
186
+ mode: "delay";
187
+ value: number;
182
188
  };
183
189
  } | {
184
190
  type: "click";
@@ -305,6 +311,9 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
305
311
  }, undefined>, v.ObjectSchema<{
306
312
  readonly mode: v.LiteralSchema<"waitBodyElements", undefined>;
307
313
  readonly threshold: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
314
+ }, undefined>, v.ObjectSchema<{
315
+ readonly mode: v.LiteralSchema<"delay", undefined>;
316
+ readonly value: v.NumberSchema<undefined>;
308
317
  }, undefined>], undefined>;
309
318
  }, undefined>, v.ObjectSchema<{
310
319
  readonly type: v.LiteralSchema<"click", undefined>;
@@ -432,6 +441,9 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
432
441
  } | {
433
442
  mode: "waitBodyElements";
434
443
  threshold?: number | undefined;
444
+ } | {
445
+ mode: "delay";
446
+ value: number;
435
447
  };
436
448
  } | {
437
449
  type: "click";
@@ -1,5 +1,5 @@
1
1
  import { InitOptions, WebBrowser } from './init';
2
- import { QueueList } from './define';
2
+ import { GlobalConfigInputType, QueueList } from './define';
3
3
  import { Subject } from 'rxjs';
4
4
  import { Readability } from '@mozilla/readability';
5
5
  type UrlItemData = {
@@ -31,7 +31,7 @@ export declare class FullWebRequest {
31
31
  skipParseContent?: (url: string) => Promise<boolean>;
32
32
  queueList: (url: string) => Promise<QueueList>;
33
33
  });
34
- start(options?: Omit<InitOptions, 'cacheDir'>): Promise<Map<string, UrlItemData>>;
34
+ start(options?: Omit<InitOptions, 'cacheDir'>, globalConfig?: GlobalConfigInputType): Promise<Map<string, UrlItemData>>;
35
35
  searchWebOne(url: string, context?: {
36
36
  from: string;
37
37
  }): Promise<void>;
package/index.mjs CHANGED
@@ -1,3 +1,3 @@
1
- import ke from"puppeteer-core";import{ElementHandle as oe}from"puppeteer-core";import{load as C}from"cheerio";import w from"turndown";import{Readability as A}from"@mozilla/readability";import{JSDOM as P}from"jsdom";function E(n,r){if(r.cleanContent){let t=new P(n),i=new A(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new w;return o.turndown(i.content)}default:throw""}let a=C(n,void 0,!0),s=a("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),a("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),a("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,m)=>m.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new w;return o.turndown(s.html())}}else{if(r.format==="html")return n;if(r.format==="text")return C(n,void 0,!0)("body").text();if(r.format==="markdown"){var o=new w;return o.turndown(n)}}}function I(n){let r=new P(n);return new A(r.window.document,{charThreshold:100}).parse()}import{promise as ie}from"fastq";var d=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let a=0;a<o.length;a++){let s=o[a];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.method&&i.method!==a.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.status&&i.status!==a.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(a=>document.body.querySelectorAll("*").length>=a,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(a=>a.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof oe&&this.#t(o=i.getProperty(t.key).then(a=>a.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=E(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],a=Array.isArray(i)?i:[i],s=ie(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",a[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===a.length-1),{result:await f.exeQueue(t.actions),page:f}),this);m.push(y)}catch(y){if(t.throwError)throw y;m.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let m=[];for(let l=0;l<a.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=m,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ne=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
- `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),v=e.optional(e.boolean()),ae=e.optional(e.string());var se=c,le=e.pipe(e.string(),e.transform(n=>new RegExp(n))),ce=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([n,r])=>new RegExp(n,r))),T=e.union([le,ce]),b=e.string(),O=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),D=e.object({timeout:se,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:O,type:e.literal("goto")}),R=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),S=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ue=e.object({selector:b,visible:v,hidden:v}),pe=e.object({mode:e.literal("selector"),...ue.entries}),fe=e.object({mode:e.literal("request"),urlRegexp:T,method:ae}),me=e.object({mode:e.literal("response"),urlRegexp:e.optional(T),status:c}),ge=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),he=e.object({mode:e.literal("waitBodyElements"),threshold:c}),ye=e.object({mode:e.literal("navigation")}),L=e.object({type:e.literal("wait"),config:e.variant("mode",[pe,fe,me,ge,ye,he])}),j=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),U=e.object({type:e.literal("type"),selector:b,text:O,delay:c}),F=e.object({type:e.literal("keypress"),key:ne,delay:c}),V=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),M=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),K=e.object({type:e.literal("rawContent"),output:h}),B=e.object({type:e.literal("evaluate"),fn:e.custom(n=>typeof n=="function"),args:e.optional(e.array(e.any())),output:h}),$=e.object({type:e.literal("close")}),W=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),G=e.object({type:e.literal("read-variable"),input:e.string()}),de=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(q))}),be=[...[D,R,S,L,j,U,F,V,M,N,K,$,W,G,B].map(n=>n.entries.type.literal),"page"],q=e.fallback(e.union([D,R,S,L,j,U,F,V,M,N,K,de,$,W,G,B]),n=>{if(be.includes((n?.value).type))throw new Error(JSON.stringify(n?.issues));return{type:"custom",config:n?.value}}),H=e.array(q),Ne=e.object({maxTimeout:c,actionTimeout:c});import*as Y from"valibot";import{Browser as xe,computeExecutablePath as Ce}from"@puppeteer/browsers";import*as Z from"fs";import{Browser as Q,computeExecutablePath as we,install as ve}from"@puppeteer/browsers";async function J(n){let r=await ve({browser:Q.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...n,unpack:!0})}function We(n,r){return we({cacheDir:n,browser:Q.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as Ae}from"puppeteer-core/internal/revisions.js";async function z(n){return k.init(n)}var _=Ae.chrome,k=class n{browser;static async init(r){let o=xe.CHROME,t=Ce({cacheDir:r.cacheDir,browser:o,buildId:_});Z.existsSync(t)||(console.log("准备下载"),await J({cacheDir:r.cacheDir,buildId:_,browser:o}));let i=await ke.launch({...r,executablePath:t});return new n(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new d(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=Y.safeParse(H,r);if(!t.success)throw new Error(`解析配置错误
3
- ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as Pe}from"cheerio";function X(n,r){let o=Pe(r,{baseURI:n}),t=o("a").map((i,a)=>o(a).attr("href")?o(a).prop("href"):"").get().filter(Boolean);return o("a").each((i,a)=>{o(a).attr("href")&&o(a).attr("href",o(a).prop("href"))}),{links:t,content:o.html()}}function ee(n){let r=new URL(n);return r.hash="",r.toString()}import{URL as Ee}from"url";function te(n){return new Ee(n).hash.startsWith("#/")}import{Subject as Ie}from"rxjs";import Te from"turndown";var re=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ie;constructor(r){this.config=r}async start(r){return this.browser=await z({...r,cacheDir:process.cwd()}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([...await this.config.queueList(r),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",r);let i=t.page.getVariable("href"),a=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=I(s);this.#e.add(r),this.#e.add(i);let{links:m,content:l}=X(a,s);if(!await this.config.skipParseContent?.(r)){let u;if(p){var f=new Te;u=f.turndown(p.content)}let g={requestUrl:r,parsedUrl:i,parentUrl:o?.from,metadata:p,raw:l,markdown:u};if(await t.page.dispose(),this.dataMap.set(r,g),this.data$.next(g),!p)return}let x=[];for(let u of m)await this.config.filterLink(u)&&x.push(u);for(let u of x){let g=u;if((!this.config.hashMode||!te(u))&&(g=ee(u)),this.#e.has(g)){console.log("已索引,跳过",u);continue}await this.searchWebOne(g,{from:r})}}};export{q as ActionDefine,H as ActionListDefine,re as FullWebRequest,Ne as GlobalConfig,k as WebBrowser,d as WebPage,J as download,E as format,I as formatDoc,We as getExecutablePath,z as init};
1
+ import xe from"puppeteer-core";import{ElementHandle as oe}from"puppeteer-core";import{load as C}from"cheerio";import w from"turndown";import{Readability as A}from"@mozilla/readability";import{JSDOM as P}from"jsdom";function T(a,r){if(r.cleanContent){let t=new P(a),i=new A(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new w;return o.turndown(i.content)}default:throw""}let n=C(a,void 0,!0),s=n("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),n("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),n("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,m)=>m.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new w;return o.turndown(s.html())}}else{if(r.format==="html")return a;if(r.format==="text")return C(a,void 0,!0)("body").text();if(r.format==="markdown"){var o=new w;return o.turndown(a)}}}function I(a){let r=new P(a);return new A(r.window.document,{charThreshold:100}).parse()}import{promise as ie}from"fastq";var d=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let n=0;n<o.length;n++){let s=o[n];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.method&&i.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async n=>!(i.urlRegexp&&!i.urlRegexp.test(n.url())||i.status&&i.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(n=>document.body.querySelectorAll("*").length>=n,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}case"delay":o=await this.page.waitForFunction(i=>new Promise(n=>{setTimeout(()=>{n(void 0)},i)}),{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},t.config.value)}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(n=>n.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof oe&&this.#t(o=i.getProperty(t.key).then(n=>n.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=T(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],n=Array.isArray(i)?i:[i],s=ie(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",n[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===n.length-1),{result:await f.exeQueue(t.actions),page:f}),this);m.push(y)}catch(y){if(t.throwError)throw y;m.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let m=[];for(let l=0;l<n.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=m,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ne=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
+ `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),u=e.optional(e.number()),v=e.optional(e.boolean()),ae=e.optional(e.string());var se=u,le=e.pipe(e.string(),e.transform(a=>new RegExp(a))),ce=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,r])=>new RegExp(a,r))),E=e.union([le,ce]),b=e.string(),D=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),O=e.object({timeout:se,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:D,type:e.literal("goto")}),R=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),S=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ue=e.object({selector:b,visible:v,hidden:v}),pe=e.object({mode:e.literal("selector"),...ue.entries}),fe=e.object({mode:e.literal("request"),urlRegexp:E,method:ae}),me=e.object({mode:e.literal("response"),urlRegexp:e.optional(E),status:u}),ge=e.object({mode:e.literal("networkIdle"),idleTime:u,concurrency:u}),he=e.object({mode:e.literal("waitBodyElements"),threshold:u}),ye=e.object({mode:e.literal("navigation")}),de=e.object({mode:e.literal("delay"),value:e.number()}),j=e.object({type:e.literal("wait"),config:e.variant("mode",[pe,fe,me,ge,ye,he,de])}),F=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:u,count:u}),L=e.object({type:e.literal("type"),selector:b,text:D,delay:u}),U=e.object({type:e.literal("keypress"),key:ne,delay:u}),V=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),M=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),K=e.object({type:e.literal("rawContent"),output:h}),B=e.object({type:e.literal("evaluate"),fn:e.custom(a=>typeof a=="function"),args:e.optional(e.array(e.any())),output:h}),$=e.object({type:e.literal("close")}),W=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),G=e.object({type:e.literal("read-variable"),input:e.string()}),be=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(q))}),we=[...[O,R,S,j,F,L,U,V,M,N,K,$,W,G,B].map(a=>a.entries.type.literal),"page"],q=e.fallback(e.union([O,R,S,j,F,L,U,V,M,N,K,be,$,W,G,B]),a=>{if(we.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),H=e.array(q),Ke=e.object({maxTimeout:u,actionTimeout:u});import*as Y from"valibot";import{Browser as Ce,computeExecutablePath as Ae}from"@puppeteer/browsers";import*as Z from"fs";import{Browser as Q,computeExecutablePath as ve,install as ke}from"@puppeteer/browsers";async function J(a){let r=await ke({browser:Q.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function Ge(a,r){return ve({cacheDir:a,browser:Q.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as Pe}from"puppeteer-core/internal/revisions.js";async function z(a){return k.init(a)}var _=Pe.chrome,k=class a{browser;static async init(r){let o=Ce.CHROME,t=Ae({cacheDir:r.cacheDir,browser:o,buildId:_});Z.existsSync(t)||(console.log("准备下载"),await J({cacheDir:r.cacheDir,buildId:_,browser:o}));let i=await xe.launch({...r,executablePath:t});return new a(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new d(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=Y.safeParse(H,r);if(!t.success)throw new Error(`解析配置错误
3
+ ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let n in o)i.setVariable(n,o[n]);return{result:i.exeQueue(t.output),page:i}})}};import{load as Te}from"cheerio";function X(a,r){let o=Te(r,{baseURI:a}),t=o("a").map((i,n)=>o(n).attr("href")?o(n).prop("href"):"").get().filter(Boolean);return o("a").each((i,n)=>{o(n).attr("href")&&o(n).attr("href",o(n).prop("href"))}),{links:t,content:o.html()}}function ee(a){let r=new URL(a);return r.hash="",r.toString()}import{URL as Ie}from"url";function te(a){return new Ie(a).hash.startsWith("#/")}import{Subject as Ee}from"rxjs";import De from"turndown";var re=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ee;constructor(r){this.config=r}async start(r,o){return this.browser=await z({...r,cacheDir:process.cwd()}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4,...o}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([...await this.config.queueList(r),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);try{console.log("解析完成",r),await t.result}catch(c){console.log("解析失败",r),console.error(c),await t.page.dispose();return}let i=t.page.getVariable("href"),n=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=I(s);this.#e.add(r),this.#e.add(i);let{links:m,content:l}=X(n,s);if(!await this.config.skipParseContent?.(r)){let c;if(p){var f=new De;c=f.turndown(p.content)}let g={requestUrl:r,parsedUrl:i,parentUrl:o?.from,metadata:p,raw:l,markdown:c};if(await t.page.dispose(),this.dataMap.set(r,g),this.data$.next(g),!p)return}let x=[];for(let c of m)await this.config.filterLink(c)&&x.push(c);for(let c of x){let g=c;if((!this.config.hashMode||!te(c))&&(g=ee(c)),this.#e.has(g)){console.log("已索引,跳过",c);continue}await this.searchWebOne(g,{from:r})}}};export{q as ActionDefine,H as ActionListDefine,re as FullWebRequest,Ke as GlobalConfig,k as WebBrowser,d as WebPage,J as download,T as format,I as formatDoc,Ge as getExecutablePath,z as init};
package/init.d.ts CHANGED
@@ -21,7 +21,7 @@ export declare class WebBrowser {
21
21
  getCustom(key: string): PluginFn | undefined;
22
22
  openPage<T>(fn: (page: WebPage) => Promise<T>, parent?: WebPage): Promise<T>;
23
23
  runQueue(list: QueueList, input?: Record<string, any>): Promise<{
24
- result: any;
24
+ result: Promise<any>;
25
25
  page: WebPage;
26
26
  }>;
27
27
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.18",
3
+ "version": "0.0.20",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {