@cyia/crawl 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/define.d.ts CHANGED
@@ -5,6 +5,12 @@ declare const Value: v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
5
5
  readonly key: v.UnionSchema<[v.StringSchema<undefined>, v.ArraySchema<v.StringSchema<undefined>, undefined>], undefined>;
6
6
  }, undefined>], undefined>;
7
7
  export type ValueType = v.InferOutput<typeof Value>;
8
+ declare const OutputP: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
9
+ readonly key: v.StringSchema<undefined>;
10
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
11
+ }, undefined>], undefined>, undefined>;
12
+ type OutputI = v.InferInput<typeof OutputP>;
13
+ export type OutputO = v.InferOutput<typeof OutputP>;
8
14
  export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectSchema<{
9
15
  readonly timeout: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
10
16
  readonly waitUntil: v.OptionalSchema<v.PicklistSchema<["load", "domcontentloaded", "networkidle0", "networkidle2"], undefined>, "networkidle2">;
@@ -68,30 +74,39 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
68
74
  }, undefined>, v.ObjectSchema<{
69
75
  readonly type: v.LiteralSchema<"selector", undefined>;
70
76
  readonly selector: v.StringSchema<undefined>;
71
- readonly output: v.StringSchema<undefined>;
77
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
78
+ readonly key: v.StringSchema<undefined>;
79
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
80
+ }, undefined>], undefined>, undefined>;
72
81
  readonly multi: v.OptionalSchema<v.BooleanSchema<undefined>, false>;
73
82
  }, undefined>, v.ObjectSchema<{
74
83
  readonly type: v.LiteralSchema<"findData", undefined>;
75
84
  readonly input: v.StringSchema<undefined>;
76
- readonly output: v.StringSchema<undefined>;
85
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
86
+ readonly key: v.StringSchema<undefined>;
87
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
88
+ }, undefined>], undefined>, undefined>;
77
89
  readonly kind: v.PicklistSchema<["property"], undefined>;
78
90
  readonly key: v.OptionalSchema<v.StringSchema<undefined>, undefined>;
79
91
  }, undefined>, v.ObjectSchema<{
80
92
  readonly type: v.LiteralSchema<"getContent", undefined>;
81
93
  readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text", "markdown"], undefined>, "html">;
82
94
  readonly cleanContent: v.OptionalSchema<v.BooleanSchema<undefined>, undefined>;
83
- readonly output: v.StringSchema<undefined>;
95
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
96
+ readonly key: v.StringSchema<undefined>;
97
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
98
+ }, undefined>], undefined>, undefined>;
84
99
  }, undefined>, v.GenericSchema<{
85
100
  type: "page";
86
101
  input: string;
87
- output?: string;
102
+ output?: OutputI;
88
103
  actions: v.InferInput<ActionType>[];
89
104
  concurrency?: number;
90
105
  throwError?: boolean;
91
106
  }, {
92
107
  type: "page";
93
108
  input: string;
94
- output?: string;
109
+ output?: OutputO;
95
110
  actions: v.InferOutput<ActionType>[];
96
111
  concurrency: number;
97
112
  throwError: boolean;
@@ -103,6 +118,9 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
103
118
  readonly type: v.StringSchema<undefined>;
104
119
  }, undefined>, undefined>;
105
120
  readonly fn: v.OptionalSchema<v.CustomSchema<(input: WebPage) => Promise<any>, undefined>, undefined>;
121
+ }, undefined>, v.ObjectSchema<{
122
+ readonly type: v.LiteralSchema<"read-variable", undefined>;
123
+ readonly input: v.StringSchema<undefined>;
106
124
  }, undefined>], undefined>, (item: v.OutputDataset<{
107
125
  timeout?: number | undefined;
108
126
  waitUntil: "load" | "domcontentloaded" | "networkidle0" | "networkidle2";
@@ -172,23 +190,32 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
172
190
  } | {
173
191
  type: "selector";
174
192
  selector: string;
175
- output: string;
193
+ output?: string | {
194
+ key: string;
195
+ method: "push" | "flat-push" | "define" | "merge";
196
+ } | undefined;
176
197
  multi: boolean;
177
198
  } | {
178
199
  type: "findData";
179
200
  input: string;
180
- output: string;
201
+ output?: string | {
202
+ key: string;
203
+ method: "push" | "flat-push" | "define" | "merge";
204
+ } | undefined;
181
205
  kind: "property";
182
206
  key?: string | undefined;
183
207
  } | {
184
208
  type: "getContent";
185
209
  format: "html" | "text" | "markdown";
186
210
  cleanContent?: boolean | undefined;
187
- output: string;
211
+ output?: string | {
212
+ key: string;
213
+ method: "push" | "flat-push" | "define" | "merge";
214
+ } | undefined;
188
215
  } | {
189
216
  type: "page";
190
217
  input: string;
191
- output?: string;
218
+ output?: OutputO;
192
219
  actions: v.InferOutput<ActionType>[];
193
220
  concurrency: number;
194
221
  throwError: boolean;
@@ -202,7 +229,10 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
202
229
  [key: string]: unknown;
203
230
  }) | undefined;
204
231
  fn?: ((input: WebPage) => Promise<any>) | undefined;
205
- }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
232
+ } | {
233
+ type: "read-variable";
234
+ input: string;
235
+ }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
206
236
  type: "custom";
207
237
  config: any;
208
238
  }>;
@@ -269,30 +299,39 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
269
299
  }, undefined>, v.ObjectSchema<{
270
300
  readonly type: v.LiteralSchema<"selector", undefined>;
271
301
  readonly selector: v.StringSchema<undefined>;
272
- readonly output: v.StringSchema<undefined>;
302
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
303
+ readonly key: v.StringSchema<undefined>;
304
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
305
+ }, undefined>], undefined>, undefined>;
273
306
  readonly multi: v.OptionalSchema<v.BooleanSchema<undefined>, false>;
274
307
  }, undefined>, v.ObjectSchema<{
275
308
  readonly type: v.LiteralSchema<"findData", undefined>;
276
309
  readonly input: v.StringSchema<undefined>;
277
- readonly output: v.StringSchema<undefined>;
310
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
311
+ readonly key: v.StringSchema<undefined>;
312
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
313
+ }, undefined>], undefined>, undefined>;
278
314
  readonly kind: v.PicklistSchema<["property"], undefined>;
279
315
  readonly key: v.OptionalSchema<v.StringSchema<undefined>, undefined>;
280
316
  }, undefined>, v.ObjectSchema<{
281
317
  readonly type: v.LiteralSchema<"getContent", undefined>;
282
318
  readonly format: v.OptionalSchema<v.PicklistSchema<["html", "text", "markdown"], undefined>, "html">;
283
319
  readonly cleanContent: v.OptionalSchema<v.BooleanSchema<undefined>, undefined>;
284
- readonly output: v.StringSchema<undefined>;
320
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
321
+ readonly key: v.StringSchema<undefined>;
322
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
323
+ }, undefined>], undefined>, undefined>;
285
324
  }, undefined>, v.GenericSchema<{
286
325
  type: "page";
287
326
  input: string;
288
- output?: string;
327
+ output?: OutputI;
289
328
  actions: v.InferInput<ActionType>[];
290
329
  concurrency?: number;
291
330
  throwError?: boolean;
292
331
  }, {
293
332
  type: "page";
294
333
  input: string;
295
- output?: string;
334
+ output?: OutputO;
296
335
  actions: v.InferOutput<ActionType>[];
297
336
  concurrency: number;
298
337
  throwError: boolean;
@@ -304,6 +343,9 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
304
343
  readonly type: v.StringSchema<undefined>;
305
344
  }, undefined>, undefined>;
306
345
  readonly fn: v.OptionalSchema<v.CustomSchema<(input: WebPage) => Promise<any>, undefined>, undefined>;
346
+ }, undefined>, v.ObjectSchema<{
347
+ readonly type: v.LiteralSchema<"read-variable", undefined>;
348
+ readonly input: v.StringSchema<undefined>;
307
349
  }, undefined>], undefined>, (item: v.OutputDataset<{
308
350
  timeout?: number | undefined;
309
351
  waitUntil: "load" | "domcontentloaded" | "networkidle0" | "networkidle2";
@@ -373,23 +415,32 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
373
415
  } | {
374
416
  type: "selector";
375
417
  selector: string;
376
- output: string;
418
+ output?: string | {
419
+ key: string;
420
+ method: "push" | "flat-push" | "define" | "merge";
421
+ } | undefined;
377
422
  multi: boolean;
378
423
  } | {
379
424
  type: "findData";
380
425
  input: string;
381
- output: string;
426
+ output?: string | {
427
+ key: string;
428
+ method: "push" | "flat-push" | "define" | "merge";
429
+ } | undefined;
382
430
  kind: "property";
383
431
  key?: string | undefined;
384
432
  } | {
385
433
  type: "getContent";
386
434
  format: "html" | "text" | "markdown";
387
435
  cleanContent?: boolean | undefined;
388
- output: string;
436
+ output?: string | {
437
+ key: string;
438
+ method: "push" | "flat-push" | "define" | "merge";
439
+ } | undefined;
389
440
  } | {
390
441
  type: "page";
391
442
  input: string;
392
- output?: string;
443
+ output?: OutputO;
393
444
  actions: v.InferOutput<ActionType>[];
394
445
  concurrency: number;
395
446
  throwError: boolean;
@@ -403,7 +454,10 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
403
454
  [key: string]: unknown;
404
455
  }) | undefined;
405
456
  fn?: ((input: WebPage) => Promise<any>) | undefined;
406
- }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
457
+ } | {
458
+ type: "read-variable";
459
+ input: string;
460
+ }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
407
461
  type: "custom";
408
462
  config: any;
409
463
  }>, undefined>;
package/format.d.ts CHANGED
@@ -1,4 +1,4 @@
1
1
  export declare function format(rawHtml: string, options: {
2
2
  cleanContent?: boolean;
3
3
  format: 'html' | 'text' | 'markdown';
4
- }): string | undefined;
4
+ }): string | null | undefined;
package/index.mjs CHANGED
@@ -1,3 +1,925 @@
1
- import ne from"puppeteer-core";import{ElementHandle as W}from"puppeteer-core";import{load as B}from"cheerio";import U from"turndown";function b(a,o){let i=B(a,void 0,!0),t=i("body");if(o.cleanContent&&(t.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),t.find("*").removeAttr("class"),t.find("*").removeAttr("style"),i("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),i("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((n,s)=>s.trim())),o.format==="html")return t.html();if(o.format==="text")return t.text();if(o.format==="markdown"){var r=new U;return r.turndown(t.html())}}import{promise as G}from"fastq";var p=class{page;browser;parent;#e={};constructor(o,i,t){this.page=o,this.browser=i,this.parent=t}ab;timeoutId;setMaxTimeout(o){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},o)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(o,i){this.#e[o]=i}getVariable(o){return this.#e[o]}#t(o,i){let t,r=!1;for(let n=0;n<i.length;n++){let s=i[n];if(s===".."){if(!o.parent)throw new Error("未找到父级");o=o.parent}else if(r){if(!t||typeof t!="object")throw new Error(`${i}路径下未找到值`);t=t[s]}else t=o.#e[s],r=!0}return t}#o(o){if(typeof o=="string")return o;if(o.source==="variable")return typeof o.key=="string"?this.#e[o.key]:this.#t(this,o.key)}async exeQueue(o){let i;for(let t of o)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{i=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{i=await 
this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{i=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let r=t.config;i=await this.page.waitForRequest(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.method&&r.method!==n.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let r=t.config;i=await this.page.waitForResponse(async n=>!(r.urlRegexp&&!r.urlRegexp.test(n.url())||r.status&&r.status!==n.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{i=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{i=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}}break}case"selector":{t.multi?i=this.#e[t.output]=await this.page.$$(t.selector):i=this.#e[t.output]=await this.page.$(t.selector);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let r=this.#e[t.input];Array.isArray(r)?t.kind==="property"&&(i=this.#e[t.output]=await Promise.all(r.map(n=>n.getProperty(t.key).then(s=>s.jsonValue())))):r instanceof W&&(i=this.#e[t.output]=r.getProperty(t.key).then(n=>n.jsonValue()));break}case"getContent":{let r=await this.page.content();i=this.#e[t.output]=b(r,{cleanContent:t.cleanContent,format:t.format});break}case"page":{let r=this.#e[t.input],n=Array.isArray(r)?r:[r],s=G(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async 
u=>(u.setVariable("$item",n[l.index]),u.setVariable("$index",l.index),u.setVariable("$first",l.index===0),u.setVariable("$last",l.index===n.length-1),await u.exeQueue(t.actions)),this);g.push(y)}catch(y){if(t.throwError)throw y;g.push(void 0)}},t.concurrency),m;s.error(l=>{l&&(m=l)});let g=[];for(let l=0;l<n.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&m)throw m;i=g,t.output&&(this.#e[t.output]=i);break}case"setUserAgent":{await this.page.setUserAgent(t.userAgent);break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")i=await t.fn(this);else{let r=this.browser.getCustom(t.config.type);if(!r)throw new Error(`自定义[${t.config.type}]未实现处理`);i=await r(t.config,this)}break}default:break}return i}};import*as e from"valibot";var $=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
- `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),h=e.optional(e.boolean()),H=e.optional(e.string());var q=c,Q=e.pipe(e.string(),e.transform(a=>new RegExp(a))),J=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([a,o])=>new 
RegExp(a,o))),d=e.union([Q,J]),f=e.string(),w=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),k=e.object({timeout:q,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:w,type:e.literal("goto")}),x=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),C=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),Y=e.object({selector:f,visible:h,hidden:h}),Z=e.object({mode:e.literal("selector"),...Y.entries}),z=e.object({mode:e.literal("request"),urlRegexp:d,method:H}),X=e.object({mode:e.literal("response"),urlRegexp:e.optional(d),status:c}),_=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),ee=e.object({mode:e.literal("navigation")}),A=e.object({type:e.literal("wait"),config:e.variant("mode",[Z,z,X,_,ee])}),P=e.object({type:e.literal("click"),selector:f,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),T=e.object({type:e.literal("type"),selector:f,text:w,delay:c}),E=e.object({type:e.literal("keypress"),key:$,delay:c}),D=e.object({type:e.literal("selector"),selector:f,output:e.string(),multi:e.optional(e.boolean(),!1)}),I=e.object({type:e.literal("findData"),input:e.string(),output:e.string(),kind:e.picklist(["property"]),key:e.optional(e.string())}),F=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:h,output:e.string()}),S=e.object({type:e.literal("close")}),R=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),te=e.object({type:e.literal("page"),input:e.string(),output:e.optional(e.string()),concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(N))}),oe=[...[k,x,C,A,P,T,E,D,I,F,S,R].map(a=>a.entr
ies.type.literal),"page"],N=e.fallback(e.union([k,x,C,A,P,T,E,D,I,F,te,S,R]),a=>{if(oe.includes((a?.value).type))throw new Error(JSON.stringify(a?.issues));return{type:"custom",config:a?.value}}),V=e.array(N),ve=e.object({maxTimeout:c,actionTimeout:c});import*as L from"valibot";import{Browser as ae,computeExecutablePath as se}from"@puppeteer/browsers";import*as M from"fs";import{Browser as K,computeExecutablePath as ie,install as re}from"@puppeteer/browsers";async function j(a){let o=await re({browser:K.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...a,unpack:!0})}function ke(a,o){return ie({cacheDir:a,browser:K.CHROME,buildId:o})}import{PUPPETEER_REVISIONS as le}from"puppeteer-core/internal/revisions.js";async function Ne(a){return v.init(a)}var O=le.chrome,v=class a{browser;static async init(o){let i=ae.CHROME,t=se({cacheDir:o.cacheDir,browser:i,buildId:O});M.existsSync(t)||(console.log("准备下载"),await j({cacheDir:o.cacheDir,buildId:O,browser:i}));let r=await ne.launch({...o,executablePath:t});return new a(r)}constructor(o){this.browser=o}#e;#t=new Map;setConfig(o){this.#e=o}getConfig(){return this.#e}registerCustom(o,i){this.#t.set(o,i)}clearCustom(){this.#t.clear()}getCustom(o){return this.#t.get(o)}async openPage(o,i){let t=new p(await this.browser.newPage(),this,i);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),o(t)}runQueue(o,i){let t=L.safeParse(V,o);if(!t.success)throw new Error(`解析配置错误
3
- ${JSON.stringify(t.issues)}`);return this.openPage(async r=>{if(i)for(let n in i)r.setVariable(n,i[n]);return r.exeQueue(t.output)})}};export{N as ActionDefine,V as ActionListDefine,ve as GlobalConfig,v as WebBrowser,p as WebPage,j as download,b as format,ke as getExecutablePath,Ne as init};
1
+ // src/init.ts
2
+ import puppeteer from "puppeteer-core";
3
+
4
+ // src/page.ts
5
+ import { ElementHandle } from "puppeteer-core";
6
+
7
+ // src/format.ts
8
+ import { load } from "cheerio";
9
+ import TurndownService from "turndown";
10
+ import { Readability } from "@mozilla/readability";
11
+ import { JSDOM } from "jsdom";
12
+ function format(rawHtml, options) {
13
+ if (options.cleanContent) {
14
+ let dom = new JSDOM(rawHtml);
15
+ let result = new Readability(dom.window.document).parse();
16
+ if (result) {
17
+ switch (options.format) {
18
+ case "html":
19
+ return result.content;
20
+ case "text":
21
+ return result.textContent;
22
+ case "markdown": {
23
+ var turndownService = new TurndownService();
24
+ return turndownService.turndown(result.content);
25
+ }
26
+ default:
27
+ throw "";
28
+ }
29
+ }
30
+ let $ = load(rawHtml, void 0, true);
31
+ let $body = $("body");
32
+ $body.find("script,style,iframe,footer,br,hr,svg,header,img").remove();
33
+ $body.find("*").removeAttr("class");
34
+ $body.find("*").removeAttr("style");
35
+ $("*").contents().filter(function() {
36
+ return this.type === "comment" || this.type === "text" && !this.data.trim();
37
+ }).remove();
38
+ $("*").contents().filter(function() {
39
+ return this.type === "text" && !!this.data.trim();
40
+ }).text((i, text) => {
41
+ return text.trim();
42
+ });
43
+ if (options.format === "html") {
44
+ return $body.html();
45
+ } else if (options.format === "text") {
46
+ return $body.text();
47
+ } else if (options.format === "markdown") {
48
+ var turndownService = new TurndownService();
49
+ return turndownService.turndown($body.html());
50
+ }
51
+ } else {
52
+ if (options.format === "html") {
53
+ return rawHtml;
54
+ } else if (options.format === "text") {
55
+ let $ = load(rawHtml, void 0, true);
56
+ let $body = $("body");
57
+ return $body.text();
58
+ } else if (options.format === "markdown") {
59
+ var turndownService = new TurndownService();
60
+ return turndownService.turndown(rawHtml);
61
+ }
62
+ }
63
+ }
64
+
65
+ // src/page.ts
66
+ import { promise as fastq } from "fastq";
67
+ var WebPage = class {
68
+ page;
69
+ browser;
70
+ parent;
71
+ #obj = {};
72
+ constructor(page, browser, parent) {
73
+ this.page = page;
74
+ this.browser = browser;
75
+ this.parent = parent;
76
+ }
77
+ ab;
78
+ timeoutId;
79
+ setMaxTimeout(timeout) {
80
+ this.ab = new AbortController();
81
+ this.timeoutId = setTimeout(() => {
82
+ this.ab.abort("timeout");
83
+ }, timeout);
84
+ }
85
+ clearTimeout() {
86
+ clearTimeout(this.timeoutId);
87
+ }
88
+ setVariable(key, value) {
89
+ this.#obj[key] = value;
90
+ }
91
+ getVariable(key) {
92
+ return this.#obj[key];
93
+ }
94
+ #navigatePath(page, paths) {
95
+ let value = void 0;
96
+ let findValue = false;
97
+ for (let i = 0; i < paths.length; i++) {
98
+ const item = paths[i];
99
+ if (item === "..") {
100
+ if (!page.parent) {
101
+ throw new Error("未找到父级");
102
+ }
103
+ page = page.parent;
104
+ } else if (findValue) {
105
+ if (!value || typeof value !== "object") {
106
+ throw new Error(`${paths}路径下未找到值`);
107
+ }
108
+ value = value[item];
109
+ } else {
110
+ value = page.#obj[item];
111
+ findValue = true;
112
+ }
113
+ }
114
+ return value;
115
+ }
116
+ #getValue(value) {
117
+ if (typeof value === "string") {
118
+ return value;
119
+ } else if (value.source === "variable") {
120
+ if (typeof value.key === "string") {
121
+ return this.#obj[value.key];
122
+ } else {
123
+ return this.#navigatePath(this, value.key);
124
+ }
125
+ }
126
+ }
127
+ #setOutput(value, output) {
128
+ if (!output) {
129
+ return;
130
+ }
131
+ if (typeof output === "string") {
132
+ this.#obj[output] = value;
133
+ } else {
134
+ switch (output.method) {
135
+ case "push": {
136
+ if (!Array.isArray(this.#obj[output.key])) {
137
+ throw new Error(`${output.key}不是数组类型`);
138
+ }
139
+ this.#obj[output.key] ||= [];
140
+ this.#obj[output.key].push(value);
141
+ break;
142
+ }
143
+ case "flat-push": {
144
+ if (!Array.isArray(this.#obj[output.key])) {
145
+ throw new Error(`${output.key}不是数组类型`);
146
+ }
147
+ if (!Array.isArray(value)) {
148
+ throw new Error(`${JSON.stringify(value)}不是数组类型`);
149
+ }
150
+ this.#obj[output.key] ||= [];
151
+ this.#obj[output.key].push(...value);
152
+ break;
153
+ }
154
+ case "define": {
155
+ this.#obj[output.key] = value;
156
+ break;
157
+ }
158
+ case "merge": {
159
+ if (typeof this.#obj[output.key] !== "object") {
160
+ throw new Error(`${output.key}不是对象类型`);
161
+ }
162
+ this.#obj[output.key] ||= {};
163
+ this.#obj[output.key] = { ...this.#obj[output.key], ...value };
164
+ break;
165
+ }
166
+ default:
167
+ break;
168
+ }
169
+ }
170
+ }
171
+ async exeQueue(list) {
172
+ let value;
173
+ for (const item of list) {
174
+ console.log("准备执行", item);
175
+ switch (item.type) {
176
+ case "click":
177
+ await this.page.click(item.selector, {
178
+ offset: item.offset,
179
+ delay: item.delay,
180
+ count: item.count
181
+ });
182
+ break;
183
+ case "type": {
184
+ await this.page.type(item.selector, this.#getValue(item.text), {
185
+ delay: item.delay
186
+ });
187
+ break;
188
+ }
189
+ case "goto": {
190
+ value = await this.page.goto(this.#getValue(item.url), {
191
+ waitUntil: item.waitUntil,
192
+ signal: this.ab?.signal,
193
+ timeout: this.browser.getConfig()?.actionTimeout
194
+ });
195
+ break;
196
+ }
197
+ case "setViewport": {
198
+ value = await this.page.setViewport({
199
+ width: item.width,
200
+ height: item.height,
201
+ isMobile: item.isMobile,
202
+ isLandscape: item.isLandscape
203
+ });
204
+ break;
205
+ }
206
+ case "wait": {
207
+ switch (item.config.mode) {
208
+ case "selector": {
209
+ value = await this.page.waitForSelector(item.config.selector, {
210
+ visible: item.config.visible,
211
+ hidden: item.config.hidden,
212
+ signal: this.ab?.signal,
213
+ timeout: this.browser.getConfig()?.actionTimeout
214
+ });
215
+ break;
216
+ }
217
+ case "request": {
218
+ const config = item.config;
219
+ value = await this.page.waitForRequest(
220
+ async (req) => {
221
+ if (config.urlRegexp) {
222
+ let result = config.urlRegexp.test(req.url());
223
+ if (!result) {
224
+ return false;
225
+ }
226
+ }
227
+ if (config.method && config.method !== req.method()) {
228
+ return false;
229
+ }
230
+ return true;
231
+ },
232
+ {
233
+ signal: this.ab?.signal,
234
+ timeout: this.browser.getConfig()?.actionTimeout
235
+ }
236
+ );
237
+ break;
238
+ }
239
+ case "response": {
240
+ const config = item.config;
241
+ value = await this.page.waitForResponse(
242
+ async (res) => {
243
+ if (config.urlRegexp) {
244
+ let result = config.urlRegexp.test(res.url());
245
+ if (!result) {
246
+ return false;
247
+ }
248
+ }
249
+ if (config.status && config.status !== res.status()) {
250
+ return false;
251
+ }
252
+ return true;
253
+ },
254
+ {
255
+ signal: this.ab?.signal,
256
+ timeout: this.browser.getConfig()?.actionTimeout
257
+ }
258
+ );
259
+ break;
260
+ }
261
+ case "networkIdle": {
262
+ value = await this.page.waitForNetworkIdle({
263
+ idleTime: item.config.idleTime,
264
+ concurrency: item.config.concurrency,
265
+ signal: this.ab?.signal,
266
+ timeout: this.browser.getConfig()?.actionTimeout
267
+ });
268
+ break;
269
+ }
270
+ case "navigation": {
271
+ value = await this.page.waitForNavigation({
272
+ signal: this.ab?.signal,
273
+ timeout: this.browser.getConfig()?.actionTimeout
274
+ });
275
+ break;
276
+ }
277
+ }
278
+ break;
279
+ }
280
+ case "selector": {
281
+ if (item.multi) {
282
+ this.#setOutput(value = await this.page.$$(item.selector), item.output);
283
+ } else {
284
+ this.#setOutput(value = await this.page.$(item.selector), item.output);
285
+ }
286
+ break;
287
+ }
288
+ case "keypress": {
289
+ await this.page.keyboard.press(item.key, { delay: item.delay });
290
+ break;
291
+ }
292
+ case "findData": {
293
+ let data = this.#obj[item.input];
294
+ if (Array.isArray(data)) {
295
+ if (item.kind === "property") {
296
+ this.#setOutput(
297
+ value = await Promise.all(
298
+ data.map((el) => {
299
+ return el.getProperty(item.key).then((a) => {
300
+ return a.jsonValue();
301
+ });
302
+ })
303
+ ),
304
+ item.output
305
+ );
306
+ }
307
+ } else {
308
+ if (data instanceof ElementHandle) {
309
+ this.#setOutput(
310
+ value = data.getProperty(item.key).then((a) => {
311
+ return a.jsonValue();
312
+ }),
313
+ item.output
314
+ );
315
+ }
316
+ }
317
+ break;
318
+ }
319
+ case "getContent": {
320
+ let content = await this.page.content();
321
+ this.#setOutput(value = format(content, { cleanContent: item.cleanContent, format: item.format }), item.output);
322
+ break;
323
+ }
324
+ case "page": {
325
+ let inputValue = this.#obj[item.input];
326
+ let list2 = Array.isArray(inputValue) ? inputValue : [inputValue];
327
+ let queue = fastq(async (input) => {
328
+ console.log("准备执行", input);
329
+ try {
330
+ let result = await this.browser.openPage(async (page) => {
331
+ page.setVariable("$item", list2[input.index]);
332
+ page.setVariable("$index", input.index);
333
+ page.setVariable("$first", input.index === 0);
334
+ page.setVariable("$last", input.index === list2.length - 1);
335
+ return await page.exeQueue(item.actions);
336
+ }, this);
337
+ resultList.push(result);
338
+ } catch (error) {
339
+ if (item.throwError) {
340
+ throw error;
341
+ } else {
342
+ resultList.push(void 0);
343
+ }
344
+ }
345
+ }, item.concurrency);
346
+ let queueError;
347
+ queue.error((error) => {
348
+ if (error) {
349
+ queueError = error;
350
+ }
351
+ });
352
+ let resultList = [];
353
+ for (let index = 0; index < list2.length; index++) {
354
+ queue.push({ index });
355
+ }
356
+ await queue.drained();
357
+ if (item.throwError && queueError) {
358
+ throw queueError;
359
+ }
360
+ value = resultList;
361
+ this.#setOutput(value, item.output);
362
+ break;
363
+ }
364
+ case "setUserAgent": {
365
+ await this.page.setUserAgent(item.userAgent);
366
+ break;
367
+ }
368
+ case "close": {
369
+ await this.page.close({ runBeforeUnload: false });
370
+ this.clearTimeout();
371
+ break;
372
+ }
373
+ case "custom": {
374
+ if (typeof item.fn === "function") {
375
+ value = await item.fn(this);
376
+ } else {
377
+ let plugin = this.browser.getCustom(item.config.type);
378
+ if (!plugin) {
379
+ throw new Error(`自定义[${item.config.type}]未实现处理`);
380
+ }
381
+ value = await plugin(item.config, this);
382
+ }
383
+ break;
384
+ }
385
+ case "read-variable": {
386
+ value = this.#obj[item.input];
387
+ break;
388
+ }
389
+ default:
390
+ break;
391
+ }
392
+ }
393
+ return value;
394
+ }
395
+ };
396
+
397
+ // src/define.ts
398
+ import * as v from "valibot";
399
/**
 * Picklist of every key name accepted by the `keypress` action
 * (the validated value is handed to `page.keyboard.press`).
 *
 * NOTE(review): the list presumably mirrors Puppeteer's `KeyInput` union — confirm.
 * The previous revision listed "," twice and had no "|"; the second ","
 * sat between "{" and "}", exactly where the shifted Backslash key ("|")
 * belongs, so that entry is now "|". Every other entry and the overall
 * order are unchanged.
 */
var KEYLIST = v.picklist([
  // Digits
  "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
  // System / editing keys
  "Power", "Eject", "Abort", "Help", "Backspace", "Tab", "Numpad5", "NumpadEnter", "Enter", "\r", "\n",
  // Modifiers and mode keys (physical codes)
  "ShiftLeft", "ShiftRight", "ControlLeft", "ControlRight", "AltLeft", "AltRight",
  "Pause", "CapsLock", "Escape", "Convert", "NonConvert", "Space",
  // Navigation and numpad
  "Numpad9", "PageUp", "Numpad3", "PageDown", "End", "Numpad1", "Home", "Numpad7",
  "ArrowLeft", "Numpad4", "Numpad8", "ArrowUp", "ArrowRight", "Numpad6", "Numpad2", "ArrowDown",
  "Select", "Open", "PrintScreen", "Insert", "Numpad0", "Delete", "NumpadDecimal",
  // Digit row codes
  "Digit0", "Digit1", "Digit2", "Digit3", "Digit4", "Digit5", "Digit6", "Digit7", "Digit8", "Digit9",
  // Letter codes
  "KeyA", "KeyB", "KeyC", "KeyD", "KeyE", "KeyF", "KeyG", "KeyH", "KeyI", "KeyJ", "KeyK", "KeyL", "KeyM",
  "KeyN", "KeyO", "KeyP", "KeyQ", "KeyR", "KeyS", "KeyT", "KeyU", "KeyV", "KeyW", "KeyX", "KeyY", "KeyZ",
  "MetaLeft", "MetaRight", "ContextMenu",
  "NumpadMultiply", "NumpadAdd", "NumpadSubtract", "NumpadDivide",
  // Function keys
  "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12",
  "F13", "F14", "F15", "F16", "F17", "F18", "F19", "F20", "F21", "F22", "F23", "F24",
  // Locks and media
  "NumLock", "ScrollLock",
  "AudioVolumeMute", "AudioVolumeDown", "AudioVolumeUp",
  "MediaTrackNext", "MediaTrackPrevious", "MediaStop", "MediaPlayPause",
  // Punctuation codes
  "Semicolon", "Equal", "NumpadEqual", "Comma", "Minus", "Period", "Slash", "Backquote",
  "BracketLeft", "Backslash", "BracketRight", "Quote",
  // Logical key names
  "AltGraph", "Props", "Cancel", "Clear", "Shift", "Control", "Alt", "Accept", "ModeChange",
  " ", "Print", "Execute", "\0",
  // Lower-case letters
  "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
  "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
  // Unshifted symbols
  "Meta", "*", "+", "-", "/", ";", "=", ",", ".", "`", "[", "\\", "]", "'",
  "Attn", "CrSel", "ExSel", "EraseEof", "Play", "ZoomOut",
  // Shifted digit row
  ")", "!", "@", "#", "$", "%", "^", "&", "(",
  // Upper-case letters
  "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
  "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
  // Shifted symbols ("|" replaces the duplicated "," of the previous revision)
  ":", "<", "_", ">", "?", "~", "{", "|", "}", '"',
  // Mobile / soft keys
  "SoftLeft", "SoftRight", "Camera", "Call", "EndCall", "VolumeDown", "VolumeUp"
]);
656
// Optional primitive schemas reused by the action definitions in this module.
var OptNumber = v.optional(v.number()),
  OptBoolean = v.optional(v.boolean()),
  OptString = v.optional(v.string()),
  // Timeouts share the optional-number schema (same schema object, just an alias).
  TimeoutDefine = OptNumber;
660
/** A pattern string that is turned into a flag-less `RegExp` after validation. */
var RegExpStr = v.pipe(
  v.string(),
  v.transform((pattern) => new RegExp(pattern))
);
/** A `[pattern, flags]` string pair that is turned into a `RegExp` with those flags. */
var RegexpTup = v.pipe(
  v.tuple([v.string(), v.pipe(v.string())]),
  v.transform((pair) => new RegExp(pair[0], pair[1]))
);
/** Either form above; both produce a `RegExp` on output. */
var RegexpUni = v.union([RegExpStr, RegexpTup]);
673
/** A selector is validated as a plain string. */
var Selector = v.string();
/**
 * An action value: either a literal string, or an object that points at a
 * variable (`source: "variable"`) by a single key or a list of keys.
 */
var Value = v.union([
  v.string(),
  v.object({
    source: v.literal("variable"),
    key: v.union([v.string(), v.array(v.string())])
  })
]);
/**
 * Optional output target of an action: a bare key string, or a key plus the
 * method ("push" | "flat-push" | "define" | "merge") used when storing.
 */
var OutputP = v.optional(
  v.union([
    v.string(),
    v.object({
      key: v.string(),
      method: v.picklist(["push", "flat-push", "define", "merge"])
    })
  ])
);
678
/** `goto` action: navigate to `url`; `waitUntil` defaults to "networkidle2". */
var GoToA = v.object({
  timeout: TimeoutDefine,
  waitUntil: v.optional(
    v.picklist(["load", "domcontentloaded", "networkidle0", "networkidle2"]),
    "networkidle2"
  ),
  url: Value,
  type: v.literal("goto")
});
/** `setViewport` action: width/height default to 1920x1080. */
var SetViewportA = v.object({
  width: v.optional(v.number(), 1920),
  height: v.optional(v.number(), 1080),
  isMobile: OptBoolean,
  isLandscape: OptBoolean,
  type: v.literal("setViewport")
});
/** `setUserAgent` action: requires the user-agent string. */
var SetUserAgentA = v.object({
  userAgent: v.string(),
  type: v.literal("setUserAgent")
});
695
/** Fields shared by selector-based configurations. */
var SelectorCommon = v.object({
  selector: Selector,
  visible: OptBoolean,
  hidden: OptBoolean
});
/** `wait` mode "selector": the shared selector fields plus the mode tag. */
var WaitSelector = v.object({
  mode: v.literal("selector"),
  selector: SelectorCommon.entries.selector,
  visible: SelectorCommon.entries.visible,
  hidden: SelectorCommon.entries.hidden
});
/** `wait` mode "request": a URL pattern (required) and an optional HTTP method. */
var WaitRequest = v.object({
  mode: v.literal("request"),
  urlRegexp: RegexpUni,
  method: OptString
});
/** `wait` mode "response": optional URL pattern and optional status code. */
var WaitResponse = v.object({
  mode: v.literal("response"),
  urlRegexp: v.optional(RegexpUni),
  // method: OptString,
  status: OptNumber
});
/** `wait` mode "networkIdle": optional idle time and concurrency. */
var WaitNetworkIdle = v.object({
  mode: v.literal("networkIdle"),
  idleTime: OptNumber,
  concurrency: OptNumber
});
/** `wait` mode "navigation": no extra options. */
var WaitNavigation = v.object({
  mode: v.literal("navigation")
});
/** `wait` action: `config` is discriminated on its `mode` field. */
var WaitA = v.object({
  type: v.literal("wait"),
  config: v.variant("mode", [
    WaitSelector,
    WaitRequest,
    WaitResponse,
    WaitNetworkIdle,
    WaitNavigation
  ])
});
727
/** `click` action: target selector with optional offset, delay and click count. */
var ClickA = v.object({
  type: v.literal("click"),
  selector: Selector,
  offset: v.optional(
    v.object({
      x: v.number(),
      y: v.number()
    })
  ),
  delay: OptNumber,
  count: OptNumber
});
/** `type` action: types `text` into the element matched by `selector`. */
var TypeA = v.object({
  type: v.literal("type"),
  selector: Selector,
  text: Value,
  delay: OptNumber
});
/** `keypress` action: `key` must be one of the names in KEYLIST. */
var KeyPress = v.object({
  type: v.literal("keypress"),
  key: KEYLIST,
  delay: OptNumber
});
745
/** `selector` action: query one element, or all matches when `multi` is true (default false). */
var SelectEl = v.object({
  type: v.literal("selector"),
  // ...SelectorCommon.entries,
  selector: Selector,
  output: OutputP,
  multi: v.optional(v.boolean(), false)
});
/** `findData` action: reads `key` (kind "property") from the value stored under `input`. */
var FindData = v.object({
  type: v.literal("findData"),
  input: v.string(),
  output: OutputP,
  kind: v.picklist(["property"]),
  key: v.optional(v.string())
  // multi: v.optional(v.boolean(), true),
});
/** `getContent` action: page content in the chosen `format` (default "html"). */
var GetContent = v.object({
  type: v.literal("getContent"),
  format: v.optional(
    v.picklist(["html", "text", "markdown"]),
    "html"
  ),
  cleanContent: OptBoolean,
  output: OutputP
});
/** `close` action: carries no options. */
var CloseA = v.object({
  type: v.literal("close")
});
769
/**
 * `custom` action: either an inline `fn`, or a loose `config` object whose
 * `type` names a registered handler. `fn` is validated with `Boolean`, i.e.
 * any truthy value passes.
 */
var PluginA = v.object({
  type: v.literal("custom"),
  config: v.optional(
    v.looseObject({
      type: v.string()
    })
  ),
  fn: v.optional(v.custom(Boolean))
});
/** `read-variable` action: names the variable to read via `input`. */
var ReadVariable = v.object({
  type: v.literal("read-variable"),
  input: v.string()
});
/**
 * `page` action: runs the nested `actions` for the value stored under `input`.
 * `concurrency` defaults to 2 and `throwError` to false. `actions` is lazy
 * because ActionDefine is declared further down and itself contains PageA.
 */
var PageA = v.object({
  type: v.literal("page"),
  input: v.string(),
  output: OutputP,
  concurrency: v.optional(v.number(), 2),
  throwError: v.optional(v.boolean(), false),
  actions: v.lazy(() => v.array(ActionDefine))
});
786
/**
 * Every built-in action `type` literal. The literals are read back from the
 * schemas themselves; "page" is appended by hand instead of being read from
 * PageA (NOTE(review): presumably to sidestep PageA's recursive definition —
 * confirm).
 */
var TypeList = [
  GoToA,
  SetViewportA,
  SetUserAgentA,
  WaitA,
  ClickA,
  TypeA,
  KeyPress,
  SelectEl,
  FindData,
  GetContent,
  // PageA,
  CloseA,
  PluginA,
  ReadVariable
]
  .map((schema) => schema.entries.type.literal)
  .concat("page");
805
/**
 * Schema for a single action: a union of every built-in action schema, with a
 * fallback for entries that match none of them.
 *
 * Fallback behaviour:
 * - If the rejected value is `null`/`undefined`, or its `type` is one of the
 *   built-in types in TypeList, the entry is a malformed built-in action and
 *   an Error carrying the JSON-serialised validation issues is thrown.
 * - Otherwise the value is treated as a plugin action and wrapped as
 *   `{ type: "custom", config: <original value> }`.
 *
 * The previous revision evaluated `(item?.value).type` unguarded, so a
 * `null`/`undefined` entry surfaced as a bare TypeError instead of the
 * validation issues; that case is now reported like any other invalid entry.
 */
var ActionDefine = v.fallback(
  v.union([
    GoToA,
    SetViewportA,
    SetUserAgentA,
    WaitA,
    ClickA,
    TypeA,
    KeyPress,
    SelectEl,
    FindData,
    GetContent,
    PageA,
    CloseA,
    PluginA,
    ReadVariable
  ]),
  (item) => {
    const rejected = item?.value;
    // A missing entry can never be a plugin config, so report it as invalid.
    if (rejected == null || TypeList.includes(rejected.type)) {
      throw new Error(JSON.stringify(item?.issues));
    }
    return { type: "custom", config: rejected };
  }
);
829
/** An action queue is an array of ActionDefine entries. */
var ActionListDefine = v.array(ActionDefine);
/** Browser-wide settings: both timeouts are optional numbers. */
var GlobalConfig = v.object({
  maxTimeout: v.optional(v.number()),
  actionTimeout: v.optional(v.number())
});
834
+
835
+ // src/init.ts
836
+ import * as v2 from "valibot";
837
+ import { Browser as BV, computeExecutablePath as computeExecutablePath2 } from "@puppeteer/browsers";
838
+ import * as fs from "fs";
839
+
840
+ // src/download.ts
841
+ import { Browser, computeExecutablePath, install } from "@puppeteer/browsers";
842
/**
 * Installs a browser build through `@puppeteer/browsers`' `install`.
 *
 * Defaults to Chrome fetched from the npmmirror chrome-for-testing mirror.
 * Anything in `options` (e.g. `cacheDir`, `buildId`, `browser`, `baseUrl`)
 * overrides those defaults because it is spread after them; `unpack` is
 * always forced to `true`.
 *
 * @param options Install options forwarded to `install`.
 * @returns A promise that resolves (with no value) once `install` has finished.
 */
async function download(options) {
  // The install result was previously bound to an unused local; nothing reads it.
  await install({
    browser: Browser.CHROME,
    baseUrl: "https://cdn.npmmirror.com/binaries/chrome-for-testing",
    ...options,
    unpack: true
  });
}
850
/**
 * Computes the Chrome executable path for a given cache directory and build.
 *
 * @param dir Cache directory passed as `cacheDir`.
 * @param buildId Browser build identifier.
 * @returns Whatever `computeExecutablePath` returns for that Chrome build.
 */
function getExecutablePath(dir, buildId) {
  const query = {
    cacheDir: dir,
    browser: Browser.CHROME,
    buildId
  };
  return computeExecutablePath(query);
}
853
+
854
+ // src/init.ts
855
+ import { PUPPETEER_REVISIONS } from "puppeteer-core/internal/revisions.js";
856
/**
 * Convenience entry point: delegates to `WebBrowser.init`.
 *
 * @param options Options forwarded unchanged to `WebBrowser.init`.
 * @returns The initialised WebBrowser.
 */
async function init(options) {
  const browser = await WebBrowser.init(options);
  return browser;
}
// Chrome build id taken from puppeteer-core's revision table.
var CHROME_VERSION = PUPPETEER_REVISIONS.chrome;
860
/**
 * Wraps a launched puppeteer browser together with a global config and a
 * registry of custom action handlers.
 */
var WebBrowser = class _WebBrowser {
  /** The browser instance returned by `puppeteer.launch`. */
  browser;

  /**
   * Resolves the Chrome executable inside `options.cacheDir`, downloads the
   * build first when the executable is not on disk, then launches the browser.
   *
   * @param options Launch options; `cacheDir` is also used for the download.
   * @returns A WebBrowser wrapping the launched browser.
   */
  static async init(options) {
    const chrome = BV.CHROME;
    const executablePath = computeExecutablePath2({
      cacheDir: options.cacheDir,
      browser: chrome,
      buildId: CHROME_VERSION
    });
    const installed = fs.existsSync(executablePath);
    if (!installed) {
      console.log("准备下载");
      await download({
        cacheDir: options.cacheDir,
        buildId: CHROME_VERSION,
        browser: chrome
      });
    }
    const launched = await puppeteer.launch({ ...options, executablePath });
    return new _WebBrowser(launched);
  }

  constructor(browser) {
    this.browser = browser;
  }

  // Global configuration; undefined until setConfig is called.
  #config;
  // Custom action handlers keyed by their type name.
  #pluginMap = /* @__PURE__ */ new Map();

  /** Stores the global configuration object. */
  setConfig(config) {
    this.#config = config;
  }

  /** Returns the configuration stored by setConfig (undefined if never set). */
  getConfig() {
    return this.#config;
  }

  /** Registers `fn` as the handler for custom actions of the given `type`. */
  registerCustom(type, fn) {
    this.#pluginMap.set(type, fn);
  }

  /** Removes every registered custom handler. */
  clearCustom() {
    this.#pluginMap.clear();
  }

  /** Looks up the custom handler registered under `key`. */
  getCustom(key) {
    return this.#pluginMap.get(key);
  }

  /**
   * Opens a new browser page, wraps it in a WebPage and passes it to `fn`.
   * When the config has a truthy `maxTimeout`, it is applied to the page first.
   *
   * @param fn Callback receiving the WebPage.
   * @param parent Optional parent passed through to the WebPage constructor.
   * @returns Whatever `fn` returns.
   */
  async openPage(fn, parent) {
    const rawPage = await this.browser.newPage();
    const page = new WebPage(rawPage, this, parent);
    const limit = this.#config?.maxTimeout;
    if (limit) {
      page.setMaxTimeout(limit);
    }
    return fn(page);
  }

  /**
   * Validates `list` against ActionListDefine and executes it on a new page.
   *
   * @param list Raw action list.
   * @param input Optional object whose enumerable keys become page variables.
   * @returns The result of `page.exeQueue` for the parsed actions.
   * @throws Error (synchronously) when the list fails validation.
   */
  runQueue(list, input) {
    const parsed = v2.safeParse(ActionListDefine, list);
    if (!parsed.success) {
      throw new Error(`解析配置错误
${JSON.stringify(parsed.issues)}`);
    }
    return this.openPage(async (page) => {
      if (input) {
        for (const name in input) {
          page.setVariable(name, input[name]);
        }
      }
      return page.exeQueue(parsed.output);
    });
  }
};
915
+ export {
916
+ ActionDefine,
917
+ ActionListDefine,
918
+ GlobalConfig,
919
+ WebBrowser,
920
+ WebPage,
921
+ download,
922
+ format,
923
+ getExecutablePath,
924
+ init
925
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.11",
3
+ "version": "0.0.12",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {
@@ -9,7 +9,9 @@
9
9
  "html-entities": "2.6.0",
10
10
  "puppeteer-core": "24.6.0",
11
11
  "valibot": "1.0.0",
12
- "turndown": "^7.2.0"
12
+ "turndown": "^7.2.0",
13
+ "@mozilla/readability": "^0.6.0",
14
+ "jsdom": "^26.0.0"
13
15
  },
14
16
  "exports": {
15
17
  ".": {