@cyia/crawl 0.0.15 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/full-web-request.d.ts +1 -1
- package/index.mjs +3 -3
- package/package.json +1 -1
package/full-web-request.d.ts
CHANGED
|
@@ -31,7 +31,7 @@ export declare class FullWebRequest {
|
|
|
31
31
|
skipParseContent?: (url: string) => Promise<boolean>;
|
|
32
32
|
queueList: (url: string) => Promise<QueueList>;
|
|
33
33
|
});
|
|
34
|
-
start(options?: InitOptions): Promise<Map<string, UrlItemData>>;
|
|
34
|
+
start(options?: Omit<InitOptions, 'cacheDir'>): Promise<Map<string, UrlItemData>>;
|
|
35
35
|
searchWebOne(url: string, context?: {
|
|
36
36
|
from: string;
|
|
37
37
|
}): Promise<void>;
|
package/index.mjs
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import
|
|
2
|
-
`,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),
|
|
3
|
-
${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as
|
|
1
|
+
import ke from"puppeteer-core";import{ElementHandle as oe}from"puppeteer-core";import{load as C}from"cheerio";import w from"turndown";import{Readability as A}from"@mozilla/readability";import{JSDOM as P}from"jsdom";function E(n,r){if(r.cleanContent){let t=new P(n),i=new A(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new w;return o.turndown(i.content)}default:throw""}let a=C(n,void 0,!0),s=a("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),a("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),a("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,f)=>f.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new w;return o.turndown(s.html())}}else{if(r.format==="html")return n;if(r.format==="text")return C(n,void 0,!0)("body").text();if(r.format==="markdown"){var o=new w;return o.turndown(n)}}}function I(n){let r=new P(n);return new A(r.window.document,{charThreshold:100}).parse()}import{promise as ie}from"fastq";var d=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let a=0;a<o.length;a++){let s=o[a];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.method&&i.method!==a.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.status&&i.status!==a.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(a=>document.body.querySelectorAll("*").length>=a,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(a=>a.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof oe&&this.#t(o=i.getProperty(t.key).then(a=>a.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=E(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],a=Array.isArray(i)?i:[i],s=ie(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async m=>(m.setVariable("$item",a[l.index]),m.setVariable("$index",l.index),m.setVariable("$first",l.index===0),m.setVariable("$last",l.index===a.length-1),{result:await m.exeQueue(t.actions),page:m}),this);f.push(y)}catch(y){if(t.throwError)throw y;f.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let f=[];for(let l=0;l<a.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=f,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ne=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
|
|
2
|
+
`,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),u=e.optional(e.number()),v=e.optional(e.boolean()),ae=e.optional(e.string());var se=u,le=e.pipe(e.string(),e.transform(n=>new RegExp(n))),ce=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([n,r])=>new RegExp(n,r))),T=e.union([le,ce]),b=e.string(),O=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),h=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),D=e.object({timeout:se,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:O,type:e.literal("goto")}),R=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),S=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ue=e.object({selector:b,visible:v,hidden:v}),pe=e.object({mode:e.literal("selector"),...ue.entries}),fe=e.object({mode:e.literal("request"),urlRegexp:T,method:ae}),me=e.object({mode:e.literal("response"),urlRegexp:e.optional(T),status:u}),ge=e.object({mode:e.literal("networkIdle"),idleTime:u,concurrency:u}),he=e.object({mode:e.literal("waitBodyElements"),threshold:u}),ye=e.object({mode:e.literal("navigation")}),L=e.object({type:e.literal("wait"),config:e.variant("mode",[pe,fe,me,ge,ye,he])}),j=e.object({type:e.literal("click"),selector:b,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:u,count:u}),U=e.object({type:e.literal("type"),selector:b,text:O,delay:u}),F=e.object({type:e.literal("keypress"),key:ne,delay:u}),V=e.object({type:e.literal("selector"),selector:b,output:h,multi:e.optional(e.boolean(),!1)}),M=e.object({type:e.literal("findData"),input:e.string(),output:h,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:v,output:h}),K=e.object({type:e.literal("rawContent"),output:h}),B=e.object({type:e.literal("evaluate"),fn:e.custom(n=>typeof n=="function"),args:e.optional(e.array(e.any())),output:h}),$=e.object({type:e.literal("close")}),W=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),G=e.object({type:e.literal("read-variable"),input:e.string()}),de=e.object({type:e.literal("page"),input:e.string(),output:h,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(q))}),be=[...[D,R,S,L,j,U,F,V,M,N,K,$,W,G,B].map(n=>n.entries.type.literal),"page"],q=e.fallback(e.union([D,R,S,L,j,U,F,V,M,N,K,de,$,W,G,B]),n=>{if(be.includes((n?.value).type))throw new Error(JSON.stringify(n?.issues));return{type:"custom",config:n?.value}}),H=e.array(q),Ne=e.object({maxTimeout:u,actionTimeout:u});import*as Y from"valibot";import{Browser as xe,computeExecutablePath as Ce}from"@puppeteer/browsers";import*as Z from"fs";import{Browser as Q,computeExecutablePath as we,install as ve}from"@puppeteer/browsers";async function J(n){let r=await ve({browser:Q.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...n,unpack:!0})}function We(n,r){return we({cacheDir:n,browser:Q.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as Ae}from"puppeteer-core/internal/revisions.js";async function z(n){return k.init(n)}var _=Ae.chrome,k=class n{browser;static async init(r){let o=xe.CHROME,t=Ce({cacheDir:r.cacheDir,browser:o,buildId:_});Z.existsSync(t)||(console.log("准备下载"),await J({cacheDir:r.cacheDir,buildId:_,browser:o}));let i=await ke.launch({...r,executablePath:t});return new n(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new d(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=Y.safeParse(H,r);if(!t.success)throw new Error(`解析配置错误
|
|
3
|
+
${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as Pe}from"cheerio";function X(n,r){let o=Pe(r,{baseURI:n}),t=o("a").map((i,a)=>o(a).attr("href")?o(a).prop("href"):"").get().filter(Boolean);return o("a").each((i,a)=>{o(a).attr("href")&&o(a).attr("href",o(a).prop("href"))}),{links:t,content:o.html()}}function ee(n){let r=new URL(n);return r.hash="",r.toString()}import{URL as Ee}from"url";function te(n){return new Ee(n).hash.startsWith("#/")}import{Subject as Ie}from"rxjs";import Te from"turndown";var re=class{config;browser;dataMap=new Map;#e=new Set;data$=new Ie;constructor(r){this.config=r}async start(r){return this.browser=await z({...r,cacheDir:process.cwd()}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4}),await this.searchWebOne(this.config.rootUrl,void 0),await this.browser.browser.close(),this.data$.complete(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([...await this.config.queueList(r),{type:"evaluate",output:"baseURI",fn:()=>document.baseURI},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",r);let i=t.page.getVariable("href"),a=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=I(s);this.#e.add(r),this.#e.add(i);let{links:f,content:l}=X(a,s);if(!await this.config.skipParseContent?.(r)){let c;if(p){var m=new Te;c=m.turndown(p.content)}let g={requestUrl:r,parsedUrl:i,parentUrl:o?.from,metadata:p,raw:l,markdown:c};if(await t.page.dispose(),this.dataMap.set(r,g),this.data$.next(g),!p)return}if(f.some(c=>c.includes("#")))return;let x=[];for(let c of f)await this.config.filterLink(c)&&x.push(c);for(let c of x){let g=c;if((!this.config.hashMode||!te(c))&&(g=ee(c)),this.#e.has(g)){console.log("已索引,跳过",c);continue}await this.searchWebOne(g,{from:r})}}};export{q as ActionDefine,H as ActionListDefine,re as FullWebRequest,Ne as GlobalConfig,k as WebBrowser,d as WebPage,J as download,E as format,I as formatDoc,We as getExecutablePath,z as init};
|