@cyia/crawl 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,39 @@
1
1
  import { WebBrowser } from './init';
2
+ import { QueueList } from './define';
3
+ import { Subject } from 'rxjs';
4
+ import { Readability } from '@mozilla/readability';
5
+ type UrlItemData = {
6
+ requestUrl: string;
7
+ parsedUrl: string;
8
+ parentUrl: string | undefined;
9
+ metadata?: ReturnType<Readability['parse']>;
10
+ raw: string;
11
+ markdown?: string;
12
+ };
2
13
  export declare class FullWebRequest {
3
14
  #private;
4
15
  config: {
5
- url: string;
6
- filterLink: (url: string) => boolean;
16
+ rootUrl: string;
17
+ filterLink: (url: string) => Promise<boolean>;
18
+ hashMode?: boolean;
19
+ /** 跳过解析内容,但是不跳过子集解析,相当于导航页面不保存 */
20
+ skipParseContent?: (url: string) => Promise<boolean>;
21
+ queueList: (url: string) => Promise<QueueList>;
7
22
  };
8
23
  browser: WebBrowser;
9
- dataMap: Map<string, any>;
24
+ dataMap: Map<string, UrlItemData>;
25
+ data$: Subject<unknown>;
10
26
  constructor(config: {
11
- url: string;
12
- filterLink: (url: string) => boolean;
27
+ rootUrl: string;
28
+ filterLink: (url: string) => Promise<boolean>;
29
+ hashMode?: boolean;
30
+ /** 跳过解析内容,但是不跳过子集解析,相当于导航页面不保存 */
31
+ skipParseContent?: (url: string) => Promise<boolean>;
32
+ queueList: (url: string) => Promise<QueueList>;
13
33
  });
14
- start(): Promise<Map<string, any>>;
34
+ start(): Promise<Map<string, UrlItemData>>;
15
35
  searchWebOne(url: string, context?: {
16
36
  from: string;
17
37
  }): Promise<void>;
18
38
  }
39
+ export {};
package/index.mjs CHANGED
@@ -1,3 +1,3 @@
1
- import ye from"puppeteer-core";import{ElementHandle as z}from"puppeteer-core";import{load as v}from"cheerio";import d from"turndown";import{Readability as k}from"@mozilla/readability";import{JSDOM as x}from"jsdom";function A(n,r){if(r.cleanContent){let t=new x(n),i=new k(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new d;return o.turndown(i.content)}default:throw""}let a=v(n,void 0,!0),s=a("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),a("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),a("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,u)=>u.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new d;return o.turndown(s.html())}}else{if(r.format==="html")return n;if(r.format==="text")return v(n,void 0,!0)("body").text();if(r.format==="markdown"){var o=new d;return o.turndown(n)}}}function C(n){let r=new x(n);return new k(r.window.document,{charThreshold:100}).parse()}import{promise as X}from"fastq";var g=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let a=0;a<o.length;a++){let s=o[a];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.method&&i.method!==a.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.status&&i.status!==a.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(a=>document.body.querySelectorAll("*").length>=a,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(a=>a.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof z&&this.#t(o=i.getProperty(t.key).then(a=>a.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=A(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],a=Array.isArray(i)?i:[i],s=X(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",a[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===a.length-1),{result:await f.exeQueue(t.actions),page:f}),this);u.push(y)}catch(y){if(t.throwError)throw y;u.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let u=[];for(let l=0;l<a.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=u,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ee=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
- `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),b=e.optional(e.boolean()),te=e.optional(e.string());var re=c,oe=e.pipe(e.string(),e.transform(n=>new RegExp(n))),ie=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([n,r])=>new RegExp(n,r))),E=e.union([oe,ie]),h=e.string(),P=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),m=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),T=e.object({timeout:re,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:P,type:e.literal("goto")}),I=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),O=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ne=e.object({selector:h,visible:b,hidden:b}),ae=e.object({mode:e.literal("selector"),...ne.entries}),se=e.object({mode:e.literal("request"),urlRegexp:E,method:te}),le=e.object({mode:e.literal("response"),urlRegexp:e.optional(E),status:c}),ce=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),ue=e.object({mode:e.literal("waitBodyElements"),threshold:c}),pe=e.object({mode:e.literal("navigation")}),D=e.object({type:e.literal("wait"),config:e.variant("mode",[ae,se,le,ce,pe,ue])}),S=e.object({type:e.literal("click"),selector:h,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),R=e.object({type:e.literal("type"),selector:h,text:P,delay:c}),F=e.object({type:e.literal("keypress"),key:ee,delay:c}),V=e.object({type:e.literal("selector"),selector:h,output:m,multi:e.optional(e.boolean(),!1)}),j=e.object({type:e.literal("findData"),input:e.string(),output:m,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:b,output:m}),L=e.object({type:e.literal("rawContent"),output:m}),M=e.object({type:e.literal("evaluate"),fn:e.custom(n=>typeof n=="function"),args:e.optional(e.array(e.any())),output:m}),K=e.object({type:e.literal("close")}),U=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),B=e.object({type:e.literal("read-variable"),input:e.string()}),fe=e.object({type:e.literal("page"),input:e.string(),output:m,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(W))}),me=[...[T,I,O,D,S,R,F,V,j,N,L,K,U,B,M].map(n=>n.entries.type.literal),"page"],W=e.fallback(e.union([T,I,O,D,S,R,F,V,j,N,L,fe,K,U,B,M]),n=>{if(me.includes((n?.value).type))throw new Error(JSON.stringify(n?.issues));return{type:"custom",config:n?.value}}),$=e.array(W),Se=e.object({maxTimeout:c,actionTimeout:c});import*as J from"valibot";import{Browser as de,computeExecutablePath as be}from"@puppeteer/browsers";import*as H from"fs";import{Browser as G,computeExecutablePath as ge,install as he}from"@puppeteer/browsers";async function q(n){let r=await he({browser:G.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...n,unpack:!0})}function je(n,r){return ge({cacheDir:n,browser:G.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as we}from"puppeteer-core/internal/revisions.js";async function _(n){return w.init(n)}var Q=we.chrome,w=class n{browser;static async init(r){let o=de.CHROME,t=be({cacheDir:r.cacheDir,browser:o,buildId:Q});H.existsSync(t)||(console.log("准备下载"),await q({cacheDir:r.cacheDir,buildId:Q,browser:o}));let i=await ye.launch({...r,executablePath:t});return new n(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new g(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=J.safeParse($,r);if(!t.success)throw new Error(`解析配置错误
3
- ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as ve}from"cheerio";function Y(n,r){let o=ve(r,{baseURI:n});return o("a").map((i,a)=>o(a).attr("href")?o(a).prop("href"):"").get().filter(Boolean)}var Z=class{config;browser;dataMap=new Map;#e=new Set;constructor(r){this.config=r}async start(){return this.browser=await _({cacheDir:process.cwd(),headless:!1}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4}),await this.searchWebOne(this.config.url,void 0),await this.browser.browser.close(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([{type:"setViewport",width:1920,height:1080},{type:"goto",url:r,waitUntil:"networkidle0"},{type:"evaluate",output:"baseURI",fn:()=>window.location.origin},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",r);let i=t.page.getVariable("href"),a=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=C(s);if(this.#e.add(r),this.#e.add(i),p)this.dataMap.set(r,{requestUrl:r,parsedUrl:i,parent:o?.from,metadata:p,raw:s}),await t.page.dispose();else{this.dataMap.set(r,{requestUrl:r,parsedUrl:i,parent:o?.from,metadata:void 0,raw:s}),await t.page.dispose();return}let u=Y(a,s);u=u.filter(this.config.filterLink);for(let l of u){if(this.#e.has(l)){console.log("已索引,跳过",l);continue}await this.searchWebOne(l,{from:r})}}};export{W as ActionDefine,$ as ActionListDefine,Z as FullWebRequest,Se as GlobalConfig,w as WebBrowser,g as WebPage,q as download,A as format,C as formatDoc,je as getExecutablePath,_ as init};
1
+ import ve from"puppeteer-core";import{ElementHandle as re}from"puppeteer-core";import{load as x}from"cheerio";import w from"turndown";import{Readability as C}from"@mozilla/readability";import{JSDOM as A}from"jsdom";function P(n,r){if(r.cleanContent){let t=new A(n),i=new C(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new w;return o.turndown(i.content)}default:throw""}let a=x(n,void 0,!0),s=a("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),a("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),a("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,u)=>u.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new w;return o.turndown(s.html())}}else{if(r.format==="html")return n;if(r.format==="text")return x(n,void 0,!0)("body").text();if(r.format==="markdown"){var o=new w;return o.turndown(n)}}}function E(n){let r=new A(n);return new C(r.window.document,{charThreshold:100}).parse()}import{promise as oe}from"fastq";var d=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let a=0;a<o.length;a++){let s=o[a];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.method&&i.method!==a.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.status&&i.status!==a.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(a=>document.body.querySelectorAll("*").length>=a,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(a=>a.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof re&&this.#t(o=i.getProperty(t.key).then(a=>a.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=P(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],a=Array.isArray(i)?i:[i],s=oe(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async m=>(m.setVariable("$item",a[l.index]),m.setVariable("$index",l.index),m.setVariable("$first",l.index===0),m.setVariable("$last",l.index===a.length-1),{result:await m.exeQueue(t.actions),page:m}),this);u.push(y)}catch(y){if(t.throwError)throw y;u.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let u=[];for(let l=0;l<a.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=u,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ie=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
+ `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),v=e.optional(e.boolean()),ne=e.optional(e.string());var ae=c,se=e.pipe(e.string(),e.transform(n=>new RegExp(n))),le=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([n,r])=>new RegExp(n,r))),T=e.union([se,le]),b=e.string(),I=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),D=e.object({timeout:ae,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:I,type:e.literal("goto")}),O=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),R=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ce=e.object({selector:b,visible:v,hidden:v}),ue=e.object({mode:e.literal("selector"),...ce.entries}),pe=e.object({mode:e.literal("request"),urlRegexp:T,method:ne}),fe=e.object({mode:e.literal("response"),urlRegexp:e.optional(T),status:c}),me=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),ge=e.object({mode:e.literal("waitBodyElements"),threshold:c}),he=e.object({mode:e.literal("navigation")}),S=e.object({type:e.literal("wait"),config:e.variant("mode",[ue,pe,fe,me,he,ge])}),j=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),L=e.object({type:e.literal("type"),selector:b,text:I,delay:c}),U=e.object({type:e.literal("keypress"),key:ie,delay:c}),F=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),V=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),M=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),N=e.object({type:e.literal("rawContent"),output:h}),K=e.object({type:e.literal("evaluate"),fn:e.custom(n=>typeof n=="function"),args:e.optional(e.array(e.any())),output:h}),B=e.object({type:e.literal("close")}),$=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),W=e.object({type:e.literal("read-variable"),input:e.string()}),ye=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(G))}),de=[...[D,O,R,S,j,L,U,F,V,M,N,B,$,W,K].map(n=>n.entries.type.literal),"page"],G=e.fallback(e.union([D,O,R,S,j,L,U,F,V,M,N,ye,B,$,W,K]),n=>{if(de.includes((n?.value).type))throw new Error(JSON.stringify(n?.issues));return{type:"custom",config:n?.value}}),q=e.array(G),Me=e.object({maxTimeout:c,actionTimeout:c});import*as _ from"valibot";import{Browser as ke,computeExecutablePath as xe}from"@puppeteer/browsers";import*as Y from"fs";import{Browser as H,computeExecutablePath as be,install as we}from"@puppeteer/browsers";async function Q(n){let r=await we({browser:H.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...n,unpack:!0})}function $e(n,r){return be({cacheDir:n,browser:H.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as Ce}from"puppeteer-core/internal/revisions.js";async function Z(n){return k.init(n)}var J=Ce.chrome,k=class n{browser;static async init(r){let o=ke.CHROME,t=xe({cacheDir:r.cacheDir,browser:o,buildId:J});Y.existsSync(t)||(console.log("准备下载"),await Q({cacheDir:r.cacheDir,buildId:J,browser:o}));let i=await ve.launch({...r,executablePath:t});return new n(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new d(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=_.safeParse(q,r);if(!t.success)throw new Error(`解析配置错误
3
+ ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as Ae}from"cheerio";function z(n,r){let o=Ae(r,{baseURI:n}),t=o("a").map((i,a)=>o(a).attr("href")?o(a).prop("href"):"").get().filter(Boolean);return o("a").each((i,a)=>{o(a).attr("href")&&o(a).attr("href",o(a).prop("href"))}),{links:t,content:o.html()}}function X(n){let r=new URL(n);return r.hash="",r.toString()}import{URL as Pe}from"url";function ee(n){return new Pe(n).hash.startsWith("#/")}import{Subject as Ee}from"rxjs";import Te from"turndown";var te=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ee;constructor(r){this.config=r}async start(){return this.browser=await Z({cacheDir:process.cwd(),headless:!1}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([...await this.config.queueList(r),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",r);let i=t.page.getVariable("href"),a=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=E(s);this.#e.add(r),this.#e.add(i);let{links:u,content:l}=z(a,s);if(!await this.config.skipParseContent?.(r)){let f;if(p){var m=new Te;f=m.turndown(p.content)}let g={requestUrl:r,parsedUrl:i,parentUrl:o?.from,metadata:p,raw:l,markdown:f};if(await t.page.dispose(),this.dataMap.set(r,g),this.data$.next(g),!p)return}if(!u.some(f=>f.includes("#"))){u=u.filter(this.config.filterLink);for(let f of u){let g=f;if((!this.config.hashMode||!ee(f))&&(g=X(f)),this.#e.has(g)){console.log("已索引,跳过",f);continue}await this.searchWebOne(g,{from:r})}}}};export{G as ActionDefine,q as ActionListDefine,te as FullWebRequest,Me as GlobalConfig,k as WebBrowser,d as WebPage,Q as download,P as format,E as formatDoc,$e as getExecutablePath,Z as init};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.13",
3
+ "version": "0.0.14",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {
@@ -1 +1,4 @@
1
- export declare function getPageLinks(baseURI: string, content: string): string[];
1
+ export declare function contentLinkChange(baseURI: string, content: string): {
2
+ links: string[];
3
+ content: string;
4
+ };
@@ -0,0 +1,2 @@
1
+ /** 判断路由是否是hash路由,也就是不用nginx导航的路由 */
2
+ export declare function mayBeHashMode(input: string): boolean;
@@ -0,0 +1 @@
1
+ export declare function removeLinkHash(url: string): string;