@cyia/crawl 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/define.d.ts CHANGED
@@ -49,6 +49,9 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
49
49
  readonly concurrency: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
50
50
  }, undefined>, v.ObjectSchema<{
51
51
  readonly mode: v.LiteralSchema<"navigation", undefined>;
52
+ }, undefined>, v.ObjectSchema<{
53
+ readonly mode: v.LiteralSchema<"waitBodyElements", undefined>;
54
+ readonly threshold: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
52
55
  }, undefined>], undefined>;
53
56
  }, undefined>, v.ObjectSchema<{
54
57
  readonly type: v.LiteralSchema<"click", undefined>;
@@ -96,6 +99,12 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
96
99
  readonly key: v.StringSchema<undefined>;
97
100
  readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
98
101
  }, undefined>], undefined>, undefined>;
102
+ }, undefined>, v.ObjectSchema<{
103
+ readonly type: v.LiteralSchema<"rawContent", undefined>;
104
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
105
+ readonly key: v.StringSchema<undefined>;
106
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
107
+ }, undefined>], undefined>, undefined>;
99
108
  }, undefined>, v.GenericSchema<{
100
109
  type: "page";
101
110
  input: string;
@@ -121,16 +130,21 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
121
130
  }, undefined>, v.ObjectSchema<{
122
131
  readonly type: v.LiteralSchema<"read-variable", undefined>;
123
132
  readonly input: v.StringSchema<undefined>;
133
+ }, undefined>, v.ObjectSchema<{
134
+ readonly type: v.LiteralSchema<"evaluate", undefined>;
135
+ readonly fn: v.CustomSchema<(...args: any[]) => any, undefined>;
136
+ readonly args: v.OptionalSchema<v.ArraySchema<v.AnySchema, undefined>, undefined>;
137
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
138
+ readonly key: v.StringSchema<undefined>;
139
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
140
+ }, undefined>], undefined>, undefined>;
124
141
  }, undefined>], undefined>, (item: v.OutputDataset<{
125
142
  timeout?: number | undefined;
126
143
  waitUntil: "load" | "domcontentloaded" | "networkidle0" | "networkidle2";
127
- url: (string | {
128
- source: "variable";
129
- key: (string | string[] | undefined) & (string | string[]);
130
- } | undefined) & (string | {
144
+ url: string | {
131
145
  source: "variable";
132
- key: (string | string[] | undefined) & (string | string[]);
133
- });
146
+ key: string | string[];
147
+ };
134
148
  type: "goto";
135
149
  } | {
136
150
  width: number;
@@ -162,6 +176,9 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
162
176
  concurrency?: number | undefined;
163
177
  } | {
164
178
  mode: "navigation";
179
+ } | {
180
+ mode: "waitBodyElements";
181
+ threshold?: number | undefined;
165
182
  };
166
183
  } | {
167
184
  type: "click";
@@ -175,13 +192,10 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
175
192
  } | {
176
193
  type: "type";
177
194
  selector: string;
178
- text: (string | {
179
- source: "variable";
180
- key: (string | string[] | undefined) & (string | string[]);
181
- } | undefined) & (string | {
195
+ text: string | {
182
196
  source: "variable";
183
- key: (string | string[] | undefined) & (string | string[]);
184
- });
197
+ key: string | string[];
198
+ };
185
199
  delay?: number | undefined;
186
200
  } | {
187
201
  type: "keypress";
@@ -212,6 +226,12 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
212
226
  key: string;
213
227
  method: "push" | "flat-push" | "define" | "merge";
214
228
  } | undefined;
229
+ } | {
230
+ type: "rawContent";
231
+ output?: string | {
232
+ key: string;
233
+ method: "push" | "flat-push" | "define" | "merge";
234
+ } | undefined;
215
235
  } | {
216
236
  type: "page";
217
237
  input: string;
@@ -232,6 +252,14 @@ export declare const ActionDefine: v.SchemaWithFallback<v.UnionSchema<[v.ObjectS
232
252
  } | {
233
253
  type: "read-variable";
234
254
  input: string;
255
+ } | {
256
+ type: "evaluate";
257
+ fn: (...args: any[]) => any;
258
+ args?: any[] | undefined;
259
+ output?: string | {
260
+ key: string;
261
+ method: "push" | "flat-push" | "define" | "merge";
262
+ } | undefined;
235
263
  }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
236
264
  type: "custom";
237
265
  config: any;
@@ -274,6 +302,9 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
274
302
  readonly concurrency: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
275
303
  }, undefined>, v.ObjectSchema<{
276
304
  readonly mode: v.LiteralSchema<"navigation", undefined>;
305
+ }, undefined>, v.ObjectSchema<{
306
+ readonly mode: v.LiteralSchema<"waitBodyElements", undefined>;
307
+ readonly threshold: v.OptionalSchema<v.NumberSchema<undefined>, undefined>;
277
308
  }, undefined>], undefined>;
278
309
  }, undefined>, v.ObjectSchema<{
279
310
  readonly type: v.LiteralSchema<"click", undefined>;
@@ -321,6 +352,12 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
321
352
  readonly key: v.StringSchema<undefined>;
322
353
  readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
323
354
  }, undefined>], undefined>, undefined>;
355
+ }, undefined>, v.ObjectSchema<{
356
+ readonly type: v.LiteralSchema<"rawContent", undefined>;
357
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
358
+ readonly key: v.StringSchema<undefined>;
359
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
360
+ }, undefined>], undefined>, undefined>;
324
361
  }, undefined>, v.GenericSchema<{
325
362
  type: "page";
326
363
  input: string;
@@ -346,16 +383,21 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
346
383
  }, undefined>, v.ObjectSchema<{
347
384
  readonly type: v.LiteralSchema<"read-variable", undefined>;
348
385
  readonly input: v.StringSchema<undefined>;
386
+ }, undefined>, v.ObjectSchema<{
387
+ readonly type: v.LiteralSchema<"evaluate", undefined>;
388
+ readonly fn: v.CustomSchema<(...args: any[]) => any, undefined>;
389
+ readonly args: v.OptionalSchema<v.ArraySchema<v.AnySchema, undefined>, undefined>;
390
+ readonly output: v.OptionalSchema<v.UnionSchema<[v.StringSchema<undefined>, v.ObjectSchema<{
391
+ readonly key: v.StringSchema<undefined>;
392
+ readonly method: v.PicklistSchema<["push", "flat-push", "define", "merge"], undefined>;
393
+ }, undefined>], undefined>, undefined>;
349
394
  }, undefined>], undefined>, (item: v.OutputDataset<{
350
395
  timeout?: number | undefined;
351
396
  waitUntil: "load" | "domcontentloaded" | "networkidle0" | "networkidle2";
352
- url: (string | {
353
- source: "variable";
354
- key: (string | string[] | undefined) & (string | string[]);
355
- } | undefined) & (string | {
397
+ url: string | {
356
398
  source: "variable";
357
- key: (string | string[] | undefined) & (string | string[]);
358
- });
399
+ key: string | string[];
400
+ };
359
401
  type: "goto";
360
402
  } | {
361
403
  width: number;
@@ -387,6 +429,9 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
387
429
  concurrency?: number | undefined;
388
430
  } | {
389
431
  mode: "navigation";
432
+ } | {
433
+ mode: "waitBodyElements";
434
+ threshold?: number | undefined;
390
435
  };
391
436
  } | {
392
437
  type: "click";
@@ -400,13 +445,10 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
400
445
  } | {
401
446
  type: "type";
402
447
  selector: string;
403
- text: (string | {
404
- source: "variable";
405
- key: (string | string[] | undefined) & (string | string[]);
406
- } | undefined) & (string | {
448
+ text: string | {
407
449
  source: "variable";
408
- key: (string | string[] | undefined) & (string | string[]);
409
- });
450
+ key: string | string[];
451
+ };
410
452
  delay?: number | undefined;
411
453
  } | {
412
454
  type: "keypress";
@@ -437,6 +479,12 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
437
479
  key: string;
438
480
  method: "push" | "flat-push" | "define" | "merge";
439
481
  } | undefined;
482
+ } | {
483
+ type: "rawContent";
484
+ output?: string | {
485
+ key: string;
486
+ method: "push" | "flat-push" | "define" | "merge";
487
+ } | undefined;
440
488
  } | {
441
489
  type: "page";
442
490
  input: string;
@@ -457,6 +505,14 @@ export declare const ActionListDefine: v.ArraySchema<v.SchemaWithFallback<v.Unio
457
505
  } | {
458
506
  type: "read-variable";
459
507
  input: string;
508
+ } | {
509
+ type: "evaluate";
510
+ fn: (...args: any[]) => any;
511
+ args?: any[] | undefined;
512
+ output?: string | {
513
+ key: string;
514
+ method: "push" | "flat-push" | "define" | "merge";
515
+ } | undefined;
460
516
  }, v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue | v.UnionIssue<v.NumberIssue | v.BaseIssue<unknown> | v.BooleanIssue | v.StringIssue | v.TupleIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue> | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.LiteralIssue | v.ArrayIssue | v.UnionIssue<v.StringIssue | v.ArrayIssue>> | v.PicklistIssue | v.UnionIssue<v.StringIssue | v.ObjectIssue | v.PicklistIssue> | v.UnionIssue<v.StringIssue | v.TupleIssue> | v.LooseObjectIssue | v.VariantIssue | v.CustomIssue>> | undefined) => {
461
517
  type: "custom";
462
518
  config: any;
package/format.d.ts CHANGED
@@ -2,3 +2,15 @@ export declare function format(rawHtml: string, options: {
2
2
  cleanContent?: boolean;
3
3
  format: 'html' | 'text' | 'markdown';
4
4
  }): string | null | undefined;
5
+ export declare function formatDoc(rawHtml: string): {
6
+ title: string | null | undefined;
7
+ content: string | null | undefined;
8
+ textContent: string | null | undefined;
9
+ length: number | null | undefined;
10
+ excerpt: string | null | undefined;
11
+ byline: string | null | undefined;
12
+ dir: string | null | undefined;
13
+ siteName: string | null | undefined;
14
+ lang: string | null | undefined;
15
+ publishedTime: string | null | undefined;
16
+ } | null;
@@ -0,0 +1,18 @@
1
+ import { WebBrowser } from './init';
2
+ export declare class FullWebRequest {
3
+ #private;
4
+ config: {
5
+ url: string;
6
+ filterLink: (url: string) => boolean;
7
+ };
8
+ browser: WebBrowser;
9
+ dataMap: Map<string, any>;
10
+ constructor(config: {
11
+ url: string;
12
+ filterLink: (url: string) => boolean;
13
+ });
14
+ start(): Promise<Map<string, any>>;
15
+ searchWebOne(url: string, context?: {
16
+ from: string;
17
+ }): Promise<void>;
18
+ }
package/index.d.ts CHANGED
@@ -3,3 +3,4 @@ export * from './define';
3
3
  export * from './page';
4
4
  export * from './download';
5
5
  export * from './format';
6
+ export * from './full-web-request';
package/index.mjs CHANGED
@@ -1,925 +1,3 @@
1
- // src/init.ts
2
- import puppeteer from "puppeteer-core";
3
-
4
- // src/page.ts
5
- import { ElementHandle } from "puppeteer-core";
6
-
7
- // src/format.ts
8
- import { load } from "cheerio";
9
- import TurndownService from "turndown";
10
- import { Readability } from "@mozilla/readability";
11
- import { JSDOM } from "jsdom";
12
- function format(rawHtml, options) {
13
- if (options.cleanContent) {
14
- let dom = new JSDOM(rawHtml);
15
- let result = new Readability(dom.window.document).parse();
16
- if (result) {
17
- switch (options.format) {
18
- case "html":
19
- return result.content;
20
- case "text":
21
- return result.textContent;
22
- case "markdown": {
23
- var turndownService = new TurndownService();
24
- return turndownService.turndown(result.content);
25
- }
26
- default:
27
- throw "";
28
- }
29
- }
30
- let $ = load(rawHtml, void 0, true);
31
- let $body = $("body");
32
- $body.find("script,style,iframe,footer,br,hr,svg,header,img").remove();
33
- $body.find("*").removeAttr("class");
34
- $body.find("*").removeAttr("style");
35
- $("*").contents().filter(function() {
36
- return this.type === "comment" || this.type === "text" && !this.data.trim();
37
- }).remove();
38
- $("*").contents().filter(function() {
39
- return this.type === "text" && !!this.data.trim();
40
- }).text((i, text) => {
41
- return text.trim();
42
- });
43
- if (options.format === "html") {
44
- return $body.html();
45
- } else if (options.format === "text") {
46
- return $body.text();
47
- } else if (options.format === "markdown") {
48
- var turndownService = new TurndownService();
49
- return turndownService.turndown($body.html());
50
- }
51
- } else {
52
- if (options.format === "html") {
53
- return rawHtml;
54
- } else if (options.format === "text") {
55
- let $ = load(rawHtml, void 0, true);
56
- let $body = $("body");
57
- return $body.text();
58
- } else if (options.format === "markdown") {
59
- var turndownService = new TurndownService();
60
- return turndownService.turndown(rawHtml);
61
- }
62
- }
63
- }
64
-
65
- // src/page.ts
66
- import { promise as fastq } from "fastq";
67
- var WebPage = class {
68
- page;
69
- browser;
70
- parent;
71
- #obj = {};
72
- constructor(page, browser, parent) {
73
- this.page = page;
74
- this.browser = browser;
75
- this.parent = parent;
76
- }
77
- ab;
78
- timeoutId;
79
- setMaxTimeout(timeout) {
80
- this.ab = new AbortController();
81
- this.timeoutId = setTimeout(() => {
82
- this.ab.abort("timeout");
83
- }, timeout);
84
- }
85
- clearTimeout() {
86
- clearTimeout(this.timeoutId);
87
- }
88
- setVariable(key, value) {
89
- this.#obj[key] = value;
90
- }
91
- getVariable(key) {
92
- return this.#obj[key];
93
- }
94
- #navigatePath(page, paths) {
95
- let value = void 0;
96
- let findValue = false;
97
- for (let i = 0; i < paths.length; i++) {
98
- const item = paths[i];
99
- if (item === "..") {
100
- if (!page.parent) {
101
- throw new Error("未找到父级");
102
- }
103
- page = page.parent;
104
- } else if (findValue) {
105
- if (!value || typeof value !== "object") {
106
- throw new Error(`${paths}路径下未找到值`);
107
- }
108
- value = value[item];
109
- } else {
110
- value = page.#obj[item];
111
- findValue = true;
112
- }
113
- }
114
- return value;
115
- }
116
- #getValue(value) {
117
- if (typeof value === "string") {
118
- return value;
119
- } else if (value.source === "variable") {
120
- if (typeof value.key === "string") {
121
- return this.#obj[value.key];
122
- } else {
123
- return this.#navigatePath(this, value.key);
124
- }
125
- }
126
- }
127
- #setOutput(value, output) {
128
- if (!output) {
129
- return;
130
- }
131
- if (typeof output === "string") {
132
- this.#obj[output] = value;
133
- } else {
134
- switch (output.method) {
135
- case "push": {
136
- if (!Array.isArray(this.#obj[output.key])) {
137
- throw new Error(`${output.key}不是数组类型`);
138
- }
139
- this.#obj[output.key] ||= [];
140
- this.#obj[output.key].push(value);
141
- break;
142
- }
143
- case "flat-push": {
144
- if (!Array.isArray(this.#obj[output.key])) {
145
- throw new Error(`${output.key}不是数组类型`);
146
- }
147
- if (!Array.isArray(value)) {
148
- throw new Error(`${JSON.stringify(value)}不是数组类型`);
149
- }
150
- this.#obj[output.key] ||= [];
151
- this.#obj[output.key].push(...value);
152
- break;
153
- }
154
- case "define": {
155
- this.#obj[output.key] = value;
156
- break;
157
- }
158
- case "merge": {
159
- if (typeof this.#obj[output.key] !== "object") {
160
- throw new Error(`${output.key}不是对象类型`);
161
- }
162
- this.#obj[output.key] ||= {};
163
- this.#obj[output.key] = { ...this.#obj[output.key], ...value };
164
- break;
165
- }
166
- default:
167
- break;
168
- }
169
- }
170
- }
171
- async exeQueue(list) {
172
- let value;
173
- for (const item of list) {
174
- console.log("准备执行", item);
175
- switch (item.type) {
176
- case "click":
177
- await this.page.click(item.selector, {
178
- offset: item.offset,
179
- delay: item.delay,
180
- count: item.count
181
- });
182
- break;
183
- case "type": {
184
- await this.page.type(item.selector, this.#getValue(item.text), {
185
- delay: item.delay
186
- });
187
- break;
188
- }
189
- case "goto": {
190
- value = await this.page.goto(this.#getValue(item.url), {
191
- waitUntil: item.waitUntil,
192
- signal: this.ab?.signal,
193
- timeout: this.browser.getConfig()?.actionTimeout
194
- });
195
- break;
196
- }
197
- case "setViewport": {
198
- value = await this.page.setViewport({
199
- width: item.width,
200
- height: item.height,
201
- isMobile: item.isMobile,
202
- isLandscape: item.isLandscape
203
- });
204
- break;
205
- }
206
- case "wait": {
207
- switch (item.config.mode) {
208
- case "selector": {
209
- value = await this.page.waitForSelector(item.config.selector, {
210
- visible: item.config.visible,
211
- hidden: item.config.hidden,
212
- signal: this.ab?.signal,
213
- timeout: this.browser.getConfig()?.actionTimeout
214
- });
215
- break;
216
- }
217
- case "request": {
218
- const config = item.config;
219
- value = await this.page.waitForRequest(
220
- async (req) => {
221
- if (config.urlRegexp) {
222
- let result = config.urlRegexp.test(req.url());
223
- if (!result) {
224
- return false;
225
- }
226
- }
227
- if (config.method && config.method !== req.method()) {
228
- return false;
229
- }
230
- return true;
231
- },
232
- {
233
- signal: this.ab?.signal,
234
- timeout: this.browser.getConfig()?.actionTimeout
235
- }
236
- );
237
- break;
238
- }
239
- case "response": {
240
- const config = item.config;
241
- value = await this.page.waitForResponse(
242
- async (res) => {
243
- if (config.urlRegexp) {
244
- let result = config.urlRegexp.test(res.url());
245
- if (!result) {
246
- return false;
247
- }
248
- }
249
- if (config.status && config.status !== res.status()) {
250
- return false;
251
- }
252
- return true;
253
- },
254
- {
255
- signal: this.ab?.signal,
256
- timeout: this.browser.getConfig()?.actionTimeout
257
- }
258
- );
259
- break;
260
- }
261
- case "networkIdle": {
262
- value = await this.page.waitForNetworkIdle({
263
- idleTime: item.config.idleTime,
264
- concurrency: item.config.concurrency,
265
- signal: this.ab?.signal,
266
- timeout: this.browser.getConfig()?.actionTimeout
267
- });
268
- break;
269
- }
270
- case "navigation": {
271
- value = await this.page.waitForNavigation({
272
- signal: this.ab?.signal,
273
- timeout: this.browser.getConfig()?.actionTimeout
274
- });
275
- break;
276
- }
277
- }
278
- break;
279
- }
280
- case "selector": {
281
- if (item.multi) {
282
- this.#setOutput(value = await this.page.$$(item.selector), item.output);
283
- } else {
284
- this.#setOutput(value = await this.page.$(item.selector), item.output);
285
- }
286
- break;
287
- }
288
- case "keypress": {
289
- await this.page.keyboard.press(item.key, { delay: item.delay });
290
- break;
291
- }
292
- case "findData": {
293
- let data = this.#obj[item.input];
294
- if (Array.isArray(data)) {
295
- if (item.kind === "property") {
296
- this.#setOutput(
297
- value = await Promise.all(
298
- data.map((el) => {
299
- return el.getProperty(item.key).then((a) => {
300
- return a.jsonValue();
301
- });
302
- })
303
- ),
304
- item.output
305
- );
306
- }
307
- } else {
308
- if (data instanceof ElementHandle) {
309
- this.#setOutput(
310
- value = data.getProperty(item.key).then((a) => {
311
- return a.jsonValue();
312
- }),
313
- item.output
314
- );
315
- }
316
- }
317
- break;
318
- }
319
- case "getContent": {
320
- let content = await this.page.content();
321
- this.#setOutput(value = format(content, { cleanContent: item.cleanContent, format: item.format }), item.output);
322
- break;
323
- }
324
- case "page": {
325
- let inputValue = this.#obj[item.input];
326
- let list2 = Array.isArray(inputValue) ? inputValue : [inputValue];
327
- let queue = fastq(async (input) => {
328
- console.log("准备执行", input);
329
- try {
330
- let result = await this.browser.openPage(async (page) => {
331
- page.setVariable("$item", list2[input.index]);
332
- page.setVariable("$index", input.index);
333
- page.setVariable("$first", input.index === 0);
334
- page.setVariable("$last", input.index === list2.length - 1);
335
- return await page.exeQueue(item.actions);
336
- }, this);
337
- resultList.push(result);
338
- } catch (error) {
339
- if (item.throwError) {
340
- throw error;
341
- } else {
342
- resultList.push(void 0);
343
- }
344
- }
345
- }, item.concurrency);
346
- let queueError;
347
- queue.error((error) => {
348
- if (error) {
349
- queueError = error;
350
- }
351
- });
352
- let resultList = [];
353
- for (let index = 0; index < list2.length; index++) {
354
- queue.push({ index });
355
- }
356
- await queue.drained();
357
- if (item.throwError && queueError) {
358
- throw queueError;
359
- }
360
- value = resultList;
361
- this.#setOutput(value, item.output);
362
- break;
363
- }
364
- case "setUserAgent": {
365
- await this.page.setUserAgent(item.userAgent);
366
- break;
367
- }
368
- case "close": {
369
- await this.page.close({ runBeforeUnload: false });
370
- this.clearTimeout();
371
- break;
372
- }
373
- case "custom": {
374
- if (typeof item.fn === "function") {
375
- value = await item.fn(this);
376
- } else {
377
- let plugin = this.browser.getCustom(item.config.type);
378
- if (!plugin) {
379
- throw new Error(`自定义[${item.config.type}]未实现处理`);
380
- }
381
- value = await plugin(item.config, this);
382
- }
383
- break;
384
- }
385
- case "read-variable": {
386
- value = this.#obj[item.input];
387
- break;
388
- }
389
- default:
390
- break;
391
- }
392
- }
393
- return value;
394
- }
395
- };
396
-
397
- // src/define.ts
398
- import * as v from "valibot";
399
- var KEYLIST = v.picklist([
400
- "0",
401
- "1",
402
- "2",
403
- "3",
404
- "4",
405
- "5",
406
- "6",
407
- "7",
408
- "8",
409
- "9",
410
- "Power",
411
- "Eject",
412
- "Abort",
413
- "Help",
414
- "Backspace",
415
- "Tab",
416
- "Numpad5",
417
- "NumpadEnter",
418
- "Enter",
419
- "\r",
420
- "\n",
421
- "ShiftLeft",
422
- "ShiftRight",
423
- "ControlLeft",
424
- "ControlRight",
425
- "AltLeft",
426
- "AltRight",
427
- "Pause",
428
- "CapsLock",
429
- "Escape",
430
- "Convert",
431
- "NonConvert",
432
- "Space",
433
- "Numpad9",
434
- "PageUp",
435
- "Numpad3",
436
- "PageDown",
437
- "End",
438
- "Numpad1",
439
- "Home",
440
- "Numpad7",
441
- "ArrowLeft",
442
- "Numpad4",
443
- "Numpad8",
444
- "ArrowUp",
445
- "ArrowRight",
446
- "Numpad6",
447
- "Numpad2",
448
- "ArrowDown",
449
- "Select",
450
- "Open",
451
- "PrintScreen",
452
- "Insert",
453
- "Numpad0",
454
- "Delete",
455
- "NumpadDecimal",
456
- "Digit0",
457
- "Digit1",
458
- "Digit2",
459
- "Digit3",
460
- "Digit4",
461
- "Digit5",
462
- "Digit6",
463
- "Digit7",
464
- "Digit8",
465
- "Digit9",
466
- "KeyA",
467
- "KeyB",
468
- "KeyC",
469
- "KeyD",
470
- "KeyE",
471
- "KeyF",
472
- "KeyG",
473
- "KeyH",
474
- "KeyI",
475
- "KeyJ",
476
- "KeyK",
477
- "KeyL",
478
- "KeyM",
479
- "KeyN",
480
- "KeyO",
481
- "KeyP",
482
- "KeyQ",
483
- "KeyR",
484
- "KeyS",
485
- "KeyT",
486
- "KeyU",
487
- "KeyV",
488
- "KeyW",
489
- "KeyX",
490
- "KeyY",
491
- "KeyZ",
492
- "MetaLeft",
493
- "MetaRight",
494
- "ContextMenu",
495
- "NumpadMultiply",
496
- "NumpadAdd",
497
- "NumpadSubtract",
498
- "NumpadDivide",
499
- "F1",
500
- "F2",
501
- "F3",
502
- "F4",
503
- "F5",
504
- "F6",
505
- "F7",
506
- "F8",
507
- "F9",
508
- "F10",
509
- "F11",
510
- "F12",
511
- "F13",
512
- "F14",
513
- "F15",
514
- "F16",
515
- "F17",
516
- "F18",
517
- "F19",
518
- "F20",
519
- "F21",
520
- "F22",
521
- "F23",
522
- "F24",
523
- "NumLock",
524
- "ScrollLock",
525
- "AudioVolumeMute",
526
- "AudioVolumeDown",
527
- "AudioVolumeUp",
528
- "MediaTrackNext",
529
- "MediaTrackPrevious",
530
- "MediaStop",
531
- "MediaPlayPause",
532
- "Semicolon",
533
- "Equal",
534
- "NumpadEqual",
535
- "Comma",
536
- "Minus",
537
- "Period",
538
- "Slash",
539
- "Backquote",
540
- "BracketLeft",
541
- "Backslash",
542
- "BracketRight",
543
- "Quote",
544
- "AltGraph",
545
- "Props",
546
- "Cancel",
547
- "Clear",
548
- "Shift",
549
- "Control",
550
- "Alt",
551
- "Accept",
552
- "ModeChange",
553
- " ",
554
- "Print",
555
- "Execute",
556
- "\0",
557
- "a",
558
- "b",
559
- "c",
560
- "d",
561
- "e",
562
- "f",
563
- "g",
564
- "h",
565
- "i",
566
- "j",
567
- "k",
568
- "l",
569
- "m",
570
- "n",
571
- "o",
572
- "p",
573
- "q",
574
- "r",
575
- "s",
576
- "t",
577
- "u",
578
- "v",
579
- "w",
580
- "x",
581
- "y",
582
- "z",
583
- "Meta",
584
- "*",
585
- "+",
586
- "-",
587
- "/",
588
- ";",
589
- "=",
590
- ",",
591
- ".",
592
- "`",
593
- "[",
594
- "\\",
595
- "]",
596
- "'",
597
- "Attn",
598
- "CrSel",
599
- "ExSel",
600
- "EraseEof",
601
- "Play",
602
- "ZoomOut",
603
- ")",
604
- "!",
605
- "@",
606
- "#",
607
- "$",
608
- "%",
609
- "^",
610
- "&",
611
- "(",
612
- "A",
613
- "B",
614
- "C",
615
- "D",
616
- "E",
617
- "F",
618
- "G",
619
- "H",
620
- "I",
621
- "J",
622
- "K",
623
- "L",
624
- "M",
625
- "N",
626
- "O",
627
- "P",
628
- "Q",
629
- "R",
630
- "S",
631
- "T",
632
- "U",
633
- "V",
634
- "W",
635
- "X",
636
- "Y",
637
- "Z",
638
- ":",
639
- "<",
640
- "_",
641
- ">",
642
- "?",
643
- "~",
644
- "{",
645
- ",",
646
- "}",
647
- '"',
648
- "SoftLeft",
649
- "SoftRight",
650
- "Camera",
651
- "Call",
652
- "EndCall",
653
- "VolumeDown",
654
- "VolumeUp"
655
- ]);
656
- var OptNumber = v.optional(v.number());
657
- var OptBoolean = v.optional(v.boolean());
658
- var OptString = v.optional(v.string());
659
- var TimeoutDefine = OptNumber;
660
- var RegExpStr = v.pipe(
661
- v.string(),
662
- v.transform((input) => {
663
- return new RegExp(input);
664
- })
665
- );
666
- var RegexpTup = v.pipe(
667
- v.tuple([v.string(), v.pipe(v.string())]),
668
- v.transform(([input, flag]) => {
669
- return new RegExp(input, flag);
670
- })
671
- );
672
- var RegexpUni = v.union([RegExpStr, RegexpTup]);
673
- var Selector = v.string();
674
- var Value = v.union([v.string(), v.object({ source: v.literal("variable"), key: v.union([v.string(), v.array(v.string())]) })]);
675
- var OutputP = v.optional(
676
- v.union([v.string(), v.object({ key: v.string(), method: v.picklist(["push", "flat-push", "define", "merge"]) })])
677
- );
678
- var GoToA = v.object({
679
- timeout: TimeoutDefine,
680
- waitUntil: v.optional(v.picklist(["load", "domcontentloaded", "networkidle0", "networkidle2"]), "networkidle2"),
681
- url: Value,
682
- type: v.literal("goto")
683
- });
684
- var SetViewportA = v.object({
685
- width: v.optional(v.number(), 1920),
686
- height: v.optional(v.number(), 1080),
687
- isMobile: v.optional(v.boolean()),
688
- isLandscape: v.optional(v.boolean()),
689
- type: v.literal("setViewport")
690
- });
691
- var SetUserAgentA = v.object({
692
- userAgent: v.string(),
693
- type: v.literal("setUserAgent")
694
- });
695
- var SelectorCommon = v.object({
696
- selector: Selector,
697
- visible: OptBoolean,
698
- hidden: OptBoolean
699
- });
700
- var WaitSelector = v.object({
701
- mode: v.literal("selector"),
702
- ...SelectorCommon.entries
703
- });
704
- var WaitRequest = v.object({
705
- mode: v.literal("request"),
706
- urlRegexp: RegexpUni,
707
- method: OptString
708
- });
709
- var WaitResponse = v.object({
710
- mode: v.literal("response"),
711
- urlRegexp: v.optional(RegexpUni),
712
- // method: OptString,
713
- status: OptNumber
714
- });
715
- var WaitNetworkIdle = v.object({
716
- mode: v.literal("networkIdle"),
717
- idleTime: OptNumber,
718
- concurrency: OptNumber
719
- });
720
- var WaitNavigation = v.object({
721
- mode: v.literal("navigation")
722
- });
723
- var WaitA = v.object({
724
- type: v.literal("wait"),
725
- config: v.variant("mode", [WaitSelector, WaitRequest, WaitResponse, WaitNetworkIdle, WaitNavigation])
726
- });
727
- var ClickA = v.object({
728
- type: v.literal("click"),
729
- selector: Selector,
730
- offset: v.optional(v.object({ x: v.number(), y: v.number() })),
731
- delay: OptNumber,
732
- count: OptNumber
733
- });
734
- var TypeA = v.object({
735
- type: v.literal("type"),
736
- selector: Selector,
737
- text: Value,
738
- delay: OptNumber
739
- });
740
- var KeyPress = v.object({
741
- type: v.literal("keypress"),
742
- key: KEYLIST,
743
- delay: OptNumber
744
- });
745
- var SelectEl = v.object({
746
- type: v.literal("selector"),
747
- // ...SelectorCommon.entries,
748
- selector: Selector,
749
- output: OutputP,
750
- multi: v.optional(v.boolean(), false)
751
- });
752
- var FindData = v.object({
753
- type: v.literal("findData"),
754
- input: v.string(),
755
- output: OutputP,
756
- kind: v.picklist(["property"]),
757
- key: v.optional(v.string())
758
- // multi: v.optional(v.boolean(), true),
759
- });
760
- var GetContent = v.object({
761
- type: v.literal("getContent"),
762
- format: v.optional(v.picklist(["html", "text", "markdown"]), "html"),
763
- cleanContent: OptBoolean,
764
- output: OutputP
765
- });
766
- var CloseA = v.object({
767
- type: v.literal("close")
768
- });
769
- var PluginA = v.object({
770
- type: v.literal("custom"),
771
- config: v.optional(v.looseObject({ type: v.string() })),
772
- fn: v.optional(v.custom(Boolean))
773
- });
774
- var ReadVariable = v.object({
775
- type: v.literal("read-variable"),
776
- input: v.string()
777
- });
778
- var PageA = v.object({
779
- type: v.literal("page"),
780
- input: v.string(),
781
- output: OutputP,
782
- concurrency: v.optional(v.number(), 2),
783
- throwError: v.optional(v.boolean(), false),
784
- actions: v.lazy(() => v.array(ActionDefine))
785
- });
786
- var TypeList = [
787
- ...[
788
- GoToA,
789
- SetViewportA,
790
- SetUserAgentA,
791
- WaitA,
792
- ClickA,
793
- TypeA,
794
- KeyPress,
795
- SelectEl,
796
- FindData,
797
- GetContent,
798
- // PageA,
799
- CloseA,
800
- PluginA,
801
- ReadVariable
802
- ].map((item) => item.entries.type.literal),
803
- "page"
804
- ];
805
- var ActionDefine = v.fallback(
806
- v.union([
807
- GoToA,
808
- SetViewportA,
809
- SetUserAgentA,
810
- WaitA,
811
- ClickA,
812
- TypeA,
813
- KeyPress,
814
- SelectEl,
815
- FindData,
816
- GetContent,
817
- PageA,
818
- CloseA,
819
- PluginA,
820
- ReadVariable
821
- ]),
822
- (item) => {
823
- if (TypeList.includes((item?.value).type)) {
824
- throw new Error(JSON.stringify(item?.issues));
825
- }
826
- return { type: "custom", config: item?.value };
827
- }
828
- );
829
- var ActionListDefine = v.array(ActionDefine);
830
- var GlobalConfig = v.object({
831
- maxTimeout: OptNumber,
832
- actionTimeout: OptNumber
833
- });
834
-
835
- // src/init.ts
836
- import * as v2 from "valibot";
837
- import { Browser as BV, computeExecutablePath as computeExecutablePath2 } from "@puppeteer/browsers";
838
- import * as fs from "fs";
839
-
840
- // src/download.ts
841
- import { Browser, computeExecutablePath, install } from "@puppeteer/browsers";
842
- async function download(options) {
843
- let result = await install({
844
- browser: Browser.CHROME,
845
- baseUrl: "https://cdn.npmmirror.com/binaries/chrome-for-testing",
846
- ...options,
847
- unpack: true
848
- });
849
- }
850
- function getExecutablePath(dir, buildId) {
851
- return computeExecutablePath({ cacheDir: dir, browser: Browser.CHROME, buildId });
852
- }
853
-
854
- // src/init.ts
855
- import { PUPPETEER_REVISIONS } from "puppeteer-core/internal/revisions.js";
856
- async function init(options) {
857
- return WebBrowser.init(options);
858
- }
859
- var CHROME_VERSION = PUPPETEER_REVISIONS.chrome;
860
- var WebBrowser = class _WebBrowser {
861
- browser;
862
- static async init(options) {
863
- let bvType = BV.CHROME;
864
- let executablePath = computeExecutablePath2({ cacheDir: options.cacheDir, browser: bvType, buildId: CHROME_VERSION });
865
- if (!fs.existsSync(executablePath)) {
866
- console.log("准备下载");
867
- await download({ cacheDir: options.cacheDir, buildId: CHROME_VERSION, browser: bvType });
868
- }
869
- const browser = await puppeteer.launch({ ...options, executablePath });
870
- return new _WebBrowser(browser);
871
- }
872
- constructor(browser) {
873
- this.browser = browser;
874
- }
875
- #config;
876
- #pluginMap = /* @__PURE__ */ new Map();
877
- setConfig(config) {
878
- this.#config = config;
879
- }
880
- getConfig() {
881
- return this.#config;
882
- }
883
- registerCustom(type, fn) {
884
- this.#pluginMap.set(type, fn);
885
- }
886
- clearCustom() {
887
- this.#pluginMap.clear();
888
- }
889
- getCustom(key) {
890
- return this.#pluginMap.get(key);
891
- }
892
- async openPage(fn, parent) {
893
- let page = new WebPage(await this.browser.newPage(), this, parent);
894
- if (this.#config?.maxTimeout) {
895
- page.setMaxTimeout(this.#config.maxTimeout);
896
- }
897
- return fn(page);
898
- }
899
- runQueue(list, input) {
900
- let result = v2.safeParse(ActionListDefine, list);
901
- if (!result.success) {
902
- throw new Error(`解析配置错误
903
- ${JSON.stringify(result.issues)}`);
904
- }
905
- return this.openPage(async (page) => {
906
- if (input) {
907
- for (const key in input) {
908
- page.setVariable(key, input[key]);
909
- }
910
- }
911
- return page.exeQueue(result.output);
912
- });
913
- }
914
- };
915
- export {
916
- ActionDefine,
917
- ActionListDefine,
918
- GlobalConfig,
919
- WebBrowser,
920
- WebPage,
921
- download,
922
- format,
923
- getExecutablePath,
924
- init
925
- };
1
+ import ye from"puppeteer-core";import{ElementHandle as z}from"puppeteer-core";import{load as v}from"cheerio";import d from"turndown";import{Readability as k}from"@mozilla/readability";import{JSDOM as x}from"jsdom";function A(n,r){if(r.cleanContent){let t=new x(n),i=new k(t.window.document).parse();if(i)switch(r.format){case"html":return i.content;case"text":return i.textContent;case"markdown":{var o=new d;return o.turndown(i.content)}default:throw""}let a=v(n,void 0,!0),s=a("body");if(s.find("script,style,iframe,footer,br,hr,svg,header,img").remove(),s.find("*").removeAttr("class"),s.find("*").removeAttr("style"),a("*").contents().filter(function(){return this.type==="comment"||this.type==="text"&&!this.data.trim()}).remove(),a("*").contents().filter(function(){return this.type==="text"&&!!this.data.trim()}).text((p,u)=>u.trim()),r.format==="html")return s.html();if(r.format==="text")return s.text();if(r.format==="markdown"){var o=new d;return o.turndown(s.html())}}else{if(r.format==="html")return n;if(r.format==="text")return v(n,void 0,!0)("body").text();if(r.format==="markdown"){var o=new d;return o.turndown(n)}}}function C(n){let r=new x(n);return new k(r.window.document,{charThreshold:100}).parse()}import{promise as X}from"fastq";var g=class{page;browser;parent;#e={};constructor(r,o,t){this.page=r,this.browser=o,this.parent=t}ab;timeoutId;setMaxTimeout(r){this.ab=new AbortController,this.timeoutId=setTimeout(()=>{this.ab.abort("timeout")},r)}clearTimeout(){clearTimeout(this.timeoutId)}setVariable(r,o){this.#e[r]=o}getVariable(r){return this.#e[r]}#r(r,o){let t,i=!1;for(let a=0;a<o.length;a++){let s=o[a];if(s===".."){if(!r.parent)throw new Error("未找到父级");r=r.parent}else if(i){if(!t||typeof t!="object")throw new Error(`${o}路径下未找到值`);t=t[s]}else t=r.#e[s],i=!0}return t}#o(r){if(typeof r=="string")return r;if(r.source==="variable")return typeof r.key=="string"?this.#e[r.key]:this.#r(this,r.key)}#t(r,o){if(o)if(typeof o=="string")this.#e[o]=r;else switch(o.method){case"push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(r);break}case"flat-push":{if(!Array.isArray(this.#e[o.key]))throw new Error(`${o.key}不是数组类型`);if(!Array.isArray(r))throw new Error(`${JSON.stringify(r)}不是数组类型`);this.#e[o.key]||=[],this.#e[o.key].push(...r);break}case"define":{this.#e[o.key]=r;break}case"merge":{if(typeof this.#e[o.key]!="object")throw new Error(`${o.key}不是对象类型`);this.#e[o.key]||={},this.#e[o.key]={...this.#e[o.key],...r};break}default:break}}async exeQueue(r){let o;for(let t of r)switch(console.log("准备执行",t),t.type){case"click":await this.page.click(t.selector,{offset:t.offset,delay:t.delay,count:t.count});break;case"type":{await this.page.type(t.selector,this.#o(t.text),{delay:t.delay});break}case"goto":{o=await this.page.goto(this.#o(t.url),{waitUntil:t.waitUntil,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"setViewport":{o=await this.page.setViewport({width:t.width,height:t.height,isMobile:t.isMobile,isLandscape:t.isLandscape});break}case"wait":{switch(t.config.mode){case"selector":{o=await this.page.waitForSelector(t.config.selector,{visible:t.config.visible,hidden:t.config.hidden,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"request":{let i=t.config;o=await this.page.waitForRequest(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.method&&i.method!==a.method()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"response":{let i=t.config;o=await this.page.waitForResponse(async a=>!(i.urlRegexp&&!i.urlRegexp.test(a.url())||i.status&&i.status!==a.status()),{signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"networkIdle":{o=await this.page.waitForNetworkIdle({idleTime:t.config.idleTime,concurrency:t.config.concurrency,signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"navigation":{o=await this.page.waitForNavigation({signal:this.ab?.signal,timeout:this.browser.getConfig()?.actionTimeout});break}case"waitBodyElements":{let i=t.config.threshold||100;o=await this.page.waitForFunction(a=>document.body.querySelectorAll("*").length>=a,{timeout:this.browser.getConfig()?.actionTimeout,signal:this.ab?.signal},i);break}}break}case"selector":{t.multi?this.#t(o=await this.page.$$(t.selector),t.output):this.#t(o=await this.page.$(t.selector),t.output);break}case"keypress":{await this.page.keyboard.press(t.key,{delay:t.delay});break}case"findData":{let i=this.#e[t.input];Array.isArray(i)?t.kind==="property"&&this.#t(o=await Promise.all(i.map(a=>a.getProperty(t.key).then(s=>s.jsonValue()))),t.output):i instanceof z&&this.#t(o=i.getProperty(t.key).then(a=>a.jsonValue()),t.output);break}case"getContent":{let i=await this.page.content();this.#t(o=A(i,{cleanContent:t.cleanContent,format:t.format}),t.output);break}case"rawContent":{let i=await this.page.content();this.#t(o=i,t.output);break}case"page":{let i=this.#e[t.input],a=Array.isArray(i)?i:[i],s=X(async l=>{console.log("准备执行",l);try{let y=await this.browser.openPage(async f=>(f.setVariable("$item",a[l.index]),f.setVariable("$index",l.index),f.setVariable("$first",l.index===0),f.setVariable("$last",l.index===a.length-1),{result:await f.exeQueue(t.actions),page:f}),this);u.push(y)}catch(y){if(t.throwError)throw y;u.push(void 0)}},t.concurrency),p;s.error(l=>{l&&(p=l)});let u=[];for(let l=0;l<a.length;l++)s.push({index:l});if(await s.drained(),t.throwError&&p)throw p;o=u,this.#t(o,t.output);break}case"setUserAgent":{await this.page.setUserAgent({userAgent:t.userAgent});break}case"close":{await this.page.close({runBeforeUnload:!1}),this.clearTimeout();break}case"custom":{if(typeof t.fn=="function")o=await t.fn(this);else{let i=this.browser.getCustom(t.config.type);if(!i)throw new Error(`自定义[${t.config.type}]未实现处理`);o=await i(t.config,this)}break}case"evaluate":{this.#t(o=await this.page.evaluate(t.fn,...t.args??[]),t.output);break}case"read-variable":{o=this.#e[t.input];break}default:break}return o}dispose(){return this.page.close()}};import*as e from"valibot";var ee=e.picklist(["0","1","2","3","4","5","6","7","8","9","Power","Eject","Abort","Help","Backspace","Tab","Numpad5","NumpadEnter","Enter","\r",`
2
+ `,"ShiftLeft","ShiftRight","ControlLeft","ControlRight","AltLeft","AltRight","Pause","CapsLock","Escape","Convert","NonConvert","Space","Numpad9","PageUp","Numpad3","PageDown","End","Numpad1","Home","Numpad7","ArrowLeft","Numpad4","Numpad8","ArrowUp","ArrowRight","Numpad6","Numpad2","ArrowDown","Select","Open","PrintScreen","Insert","Numpad0","Delete","NumpadDecimal","Digit0","Digit1","Digit2","Digit3","Digit4","Digit5","Digit6","Digit7","Digit8","Digit9","KeyA","KeyB","KeyC","KeyD","KeyE","KeyF","KeyG","KeyH","KeyI","KeyJ","KeyK","KeyL","KeyM","KeyN","KeyO","KeyP","KeyQ","KeyR","KeyS","KeyT","KeyU","KeyV","KeyW","KeyX","KeyY","KeyZ","MetaLeft","MetaRight","ContextMenu","NumpadMultiply","NumpadAdd","NumpadSubtract","NumpadDivide","F1","F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","NumLock","ScrollLock","AudioVolumeMute","AudioVolumeDown","AudioVolumeUp","MediaTrackNext","MediaTrackPrevious","MediaStop","MediaPlayPause","Semicolon","Equal","NumpadEqual","Comma","Minus","Period","Slash","Backquote","BracketLeft","Backslash","BracketRight","Quote","AltGraph","Props","Cancel","Clear","Shift","Control","Alt","Accept","ModeChange"," ","Print","Execute","\0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","Meta","*","+","-","/",";","=",",",".","`","[","\\","]","'","Attn","CrSel","ExSel","EraseEof","Play","ZoomOut",")","!","@","#","$","%","^","&","(","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",":","<","_",">","?","~","{",",","}",'"',"SoftLeft","SoftRight","Camera","Call","EndCall","VolumeDown","VolumeUp"]),c=e.optional(e.number()),b=e.optional(e.boolean()),te=e.optional(e.string());var re=c,oe=e.pipe(e.string(),e.transform(n=>new RegExp(n))),ie=e.pipe(e.tuple([e.string(),e.pipe(e.string())]),e.transform(([n,r])=>new RegExp(n,r))),E=e.union([oe,ie]),h=e.string(),P=e.union([e.string(),e.object({source:e.literal("variable"),key:e.union([e.string(),e.array(e.string())])})]),m=e.optional(e.union([e.string(),e.object({key:e.string(),method:e.picklist(["push","flat-push","define","merge"])})])),T=e.object({timeout:re,waitUntil:e.optional(e.picklist(["load","domcontentloaded","networkidle0","networkidle2"]),"networkidle2"),url:P,type:e.literal("goto")}),I=e.object({width:e.optional(e.number(),1920),height:e.optional(e.number(),1080),isMobile:e.optional(e.boolean()),isLandscape:e.optional(e.boolean()),type:e.literal("setViewport")}),O=e.object({userAgent:e.string(),type:e.literal("setUserAgent")}),ne=e.object({selector:h,visible:b,hidden:b}),ae=e.object({mode:e.literal("selector"),...ne.entries}),se=e.object({mode:e.literal("request"),urlRegexp:E,method:te}),le=e.object({mode:e.literal("response"),urlRegexp:e.optional(E),status:c}),ce=e.object({mode:e.literal("networkIdle"),idleTime:c,concurrency:c}),ue=e.object({mode:e.literal("waitBodyElements"),threshold:c}),pe=e.object({mode:e.literal("navigation")}),D=e.object({type:e.literal("wait"),config:e.variant("mode",[ae,se,le,ce,pe,ue])}),S=e.object({type:e.literal("click"),selector:h,offset:e.optional(e.object({x:e.number(),y:e.number()})),delay:c,count:c}),R=e.object({type:e.literal("type"),selector:h,text:P,delay:c}),F=e.object({type:e.literal("keypress"),key:ee,delay:c}),V=e.object({type:e.literal("selector"),selector:h,output:m,multi:e.optional(e.boolean(),!1)}),j=e.object({type:e.literal("findData"),input:e.string(),output:m,kind:e.picklist(["property"]),key:e.optional(e.string())}),N=e.object({type:e.literal("getContent"),format:e.optional(e.picklist(["html","text","markdown"]),"html"),cleanContent:b,output:m}),L=e.object({type:e.literal("rawContent"),output:m}),M=e.object({type:e.literal("evaluate"),fn:e.custom(n=>typeof n=="function"),args:e.optional(e.array(e.any())),output:m}),K=e.object({type:e.literal("close")}),U=e.object({type:e.literal("custom"),config:e.optional(e.looseObject({type:e.string()})),fn:e.optional(e.custom(Boolean))}),B=e.object({type:e.literal("read-variable"),input:e.string()}),fe=e.object({type:e.literal("page"),input:e.string(),output:m,concurrency:e.optional(e.number(),2),throwError:e.optional(e.boolean(),!1),actions:e.lazy(()=>e.array(W))}),me=[...[T,I,O,D,S,R,F,V,j,N,L,K,U,B,M].map(n=>n.entries.type.literal),"page"],W=e.fallback(e.union([T,I,O,D,S,R,F,V,j,N,L,fe,K,U,B,M]),n=>{if(me.includes((n?.value).type))throw new Error(JSON.stringify(n?.issues));return{type:"custom",config:n?.value}}),$=e.array(W),Se=e.object({maxTimeout:c,actionTimeout:c});import*as J from"valibot";import{Browser as de,computeExecutablePath as be}from"@puppeteer/browsers";import*as H from"fs";import{Browser as G,computeExecutablePath as ge,install as he}from"@puppeteer/browsers";async function q(n){let r=await he({browser:G.CHROME,baseUrl:"https://cdn.npmmirror.com/binaries/chrome-for-testing",...n,unpack:!0})}function je(n,r){return ge({cacheDir:n,browser:G.CHROME,buildId:r})}import{PUPPETEER_REVISIONS as we}from"puppeteer-core/internal/revisions.js";async function _(n){return w.init(n)}var Q=we.chrome,w=class n{browser;static async init(r){let o=de.CHROME,t=be({cacheDir:r.cacheDir,browser:o,buildId:Q});H.existsSync(t)||(console.log("准备下载"),await q({cacheDir:r.cacheDir,buildId:Q,browser:o}));let i=await ye.launch({...r,executablePath:t});return new n(i)}constructor(r){this.browser=r}#e;#r=new Map;setConfig(r){this.#e=r}getConfig(){return this.#e}registerCustom(r,o){this.#r.set(r,o)}clearCustom(){this.#r.clear()}getCustom(r){return this.#r.get(r)}async openPage(r,o){let t=new g(await this.browser.newPage(),this,o);return this.#e?.maxTimeout&&t.setMaxTimeout(this.#e.maxTimeout),r(t)}runQueue(r,o){let t=J.safeParse($,r);if(!t.success)throw new Error(`解析配置错误
3
+ ${JSON.stringify(t.issues)}`);return this.openPage(async i=>{if(o)for(let a in o)i.setVariable(a,o[a]);return{result:await i.exeQueue(t.output),page:i}})}};import{load as ve}from"cheerio";function Y(n,r){let o=ve(r,{baseURI:n});return o("a").map((i,a)=>o(a).attr("href")?o(a).prop("href"):"").get().filter(Boolean)}var Z=class{config;browser;dataMap=new Map;#e=new Set;constructor(r){this.config=r}async start(){return this.browser=await _({cacheDir:process.cwd(),headless:!1}),this.browser.setConfig({maxTimeout:12e4,actionTimeout:12e4}),await this.searchWebOne(this.config.url,void 0),await this.browser.browser.close(),this.dataMap}async searchWebOne(r,o){let t=await this.browser.runQueue([{type:"setViewport",width:1920,height:1080},{type:"goto",url:r,waitUntil:"networkidle0"},{type:"evaluate",output:"baseURI",fn:()=>window.location.origin},{type:"evaluate",output:"href",fn:()=>window.location.href},{type:"rawContent",output:"data"}]);console.log("解析完成",r);let i=t.page.getVariable("href"),a=t.page.getVariable("baseURI"),s=t.page.getVariable("data"),p=C(s);if(this.#e.add(r),this.#e.add(i),p)this.dataMap.set(r,{requestUrl:r,parsedUrl:i,parent:o?.from,metadata:p,raw:s}),await t.page.dispose();else{this.dataMap.set(r,{requestUrl:r,parsedUrl:i,parent:o?.from,metadata:void 0,raw:s}),await t.page.dispose();return}let u=Y(a,s);u=u.filter(this.config.filterLink);for(let l of u){if(this.#e.has(l)){console.log("已索引,跳过",l);continue}await this.searchWebOne(l,{from:r})}}};export{W as ActionDefine,$ as ActionListDefine,Z as FullWebRequest,Se as GlobalConfig,w as WebBrowser,g as WebPage,q as download,A as format,C as formatDoc,je as getExecutablePath,_ as init};
package/init.d.ts CHANGED
@@ -20,6 +20,9 @@ export declare class WebBrowser {
20
20
  clearCustom(): void;
21
21
  getCustom(key: string): PluginFn | undefined;
22
22
  openPage<T>(fn: (page: WebPage) => Promise<T>, parent?: WebPage): Promise<T>;
23
- runQueue(list: QueueList, input?: Record<string, any>): Promise<any>;
23
+ runQueue(list: QueueList, input?: Record<string, any>): Promise<{
24
+ result: any;
25
+ page: WebPage;
26
+ }>;
24
27
  }
25
28
  export {};
package/package.json CHANGED
@@ -1,22 +1,24 @@
1
1
  {
2
2
  "name": "@cyia/crawl",
3
- "version": "0.0.12",
3
+ "version": "0.0.13",
4
4
  "author": "wszgrcy",
5
5
  "description": "",
6
6
  "dependencies": {
7
- "cheerio": "1.0.0",
8
- "fastq": "1.19.1",
9
- "html-entities": "2.6.0",
10
- "puppeteer-core": "24.6.0",
11
- "valibot": "1.0.0",
12
- "turndown": "^7.2.0",
13
7
  "@mozilla/readability": "^0.6.0",
14
- "jsdom": "^26.0.0"
8
+ "cheerio": "^1.2.0",
9
+ "fastq": "^1.20.1",
10
+ "html-entities": "^2.6.0",
11
+ "htmlparser2": "^10.1.0",
12
+ "jsdom": "^27.4.0",
13
+ "puppeteer-core": "^24.36.0",
14
+ "turndown": "^7.2.2",
15
+ "valibot": "^1.2.0"
15
16
  },
16
17
  "exports": {
17
18
  ".": {
18
19
  "types": "./index.d.ts",
19
- "default": "./index.mjs"
20
+ "default": "./index.mjs",
21
+ "node": "./index.mjs"
20
22
  }
21
23
  },
22
24
  "publishConfig": {
package/page.d.ts CHANGED
@@ -15,4 +15,5 @@ export declare class WebPage {
15
15
  setVariable(key: string, value: any): void;
16
16
  getVariable(key: string): any;
17
17
  exeQueue(list: v.InferOutput<typeof ActionDefine>[]): Promise<any>;
18
+ dispose(): Promise<void>;
18
19
  }
@@ -0,0 +1 @@
1
+ export declare function getPageLinks(baseURI: string, content: string): string[];