@letsscrapedata/scraper 0.0.76 → 0.0.77

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2,412 +2,7 @@ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOpti
2
2
  import { Proxy } from '@letsscrapedata/proxy';
3
3
  import { LogFunction } from '@letsscrapedata/utils';
4
4
 
5
- interface XmlParaCfg {
6
- paraname: string;
7
- name: string;
8
- desc: string;
9
- uitype: string;
10
- defval: string;
11
- listid: string;
12
- listparas: string;
13
- min: string;
14
- max: string;
15
- pattern: string;
16
- alert: string;
17
- }
18
- interface XmlActionBreakCfg {
19
- type: string;
20
- id: string;
21
- }
22
- interface XmlActionCaptchaCfg {
23
- type: string;
24
- try: string;
25
- errname: string;
26
- id: string;
27
- }
28
- interface XmlActionClickCfg {
29
- type: string;
30
- wait: string;
31
- gen: boolean;
32
- cap: boolean;
33
- clicktype: string;
34
- try: string;
35
- errname: string;
36
- datapage: string;
37
- popupsubtask: boolean;
38
- login: boolean;
39
- captcha: boolean;
40
- navigate: boolean;
41
- eurl: string;
42
- eloc: string;
43
- pn1: string;
44
- pv1: string;
45
- pn2: string;
46
- pv2: string;
47
- downloadpath: string;
48
- filename: string;
49
- pathvarname: string;
50
- id: string;
51
- }
52
- interface XmlActionContinueCfg {
53
- type: string;
54
- id: string;
55
- }
56
- interface XmlActionMiscCfg {
57
- type: string;
58
- id: string;
59
- }
60
- interface XmlActionExitCfg {
61
- type: string;
62
- errname: string;
63
- id: string;
64
- }
65
- interface XmlActionExtractCfg {
66
- type: string;
67
- tabname: string;
68
- id: string;
69
- }
70
- interface XmlActionGotoCfg {
71
- type: string;
72
- url: string;
73
- reuse: boolean;
74
- wait: string;
75
- encodeuri: boolean;
76
- gen: boolean;
77
- cap: boolean;
78
- datapage: string;
79
- popupsubtask: boolean;
80
- login: boolean;
81
- captcha: boolean;
82
- eurl: string;
83
- eloc: string;
84
- pn1: string;
85
- pv1: string;
86
- pn2: string;
87
- pv2: string;
88
- headers: string;
89
- referer: string;
90
- id: string;
91
- }
92
- interface XmlActionHoverCfg {
93
- type: string;
94
- try: string;
95
- gen: boolean;
96
- cap: boolean;
97
- wait: string;
98
- errname: string;
99
- id: string;
100
- }
101
- interface XmlActionIfelseCfg {
102
- type: string;
103
- id: string;
104
- }
105
- interface XmlActionInputCfg {
106
- type: string;
107
- content: string;
108
- enter: boolean;
109
- replace: boolean;
110
- gen: boolean;
111
- cap: boolean;
112
- try: string;
113
- wait: string;
114
- errname: string;
115
- datapage: string;
116
- popupsubtask: boolean;
117
- login: boolean;
118
- captcha: boolean;
119
- eurl: string;
120
- eloc: string;
121
- pn1: string;
122
- pv1: string;
123
- pn2: string;
124
- pv2: string;
125
- id: string;
126
- }
127
- interface XmlActionInterceptClearCfg {
128
- type: string;
129
- subtype: string;
130
- }
131
- interface XmlActionInterceptSetCfg {
132
- type: string;
133
- subtype: string;
134
- }
135
- interface XmlActionLoopdowhileElementCfg {
136
- type: string;
137
- subtype: string;
138
- iswhile: boolean;
139
- varname: string;
140
- maxloops: string;
141
- click: boolean;
142
- navigate: boolean;
143
- gen: boolean;
144
- cap: boolean;
145
- errname: string;
146
- wait: string;
147
- id: string;
148
- }
149
- interface XmlActionLoopdowhileTemplstrCfg {
150
- type: string;
151
- subtype: string;
152
- iswhile: boolean;
153
- varname: string;
154
- maxloops: string;
155
- id: string;
156
- }
157
- interface XmlActionLoopforCfg {
158
- type: string;
159
- from: string;
160
- to: string;
161
- step: string;
162
- roundtype: string;
163
- varname: string;
164
- maxloops: string;
165
- errname: string;
166
- id: string;
167
- }
168
- interface XmlActionLoopinelesCfg {
169
- type: string;
170
- varname: string;
171
- maxloops: string;
172
- start: string;
173
- end: string;
174
- step: string;
175
- errname: string;
176
- id: string;
177
- }
178
- interface XmlActionLoopinstrCfg {
179
- type: string;
180
- list: string;
181
- split: string;
182
- varname: string;
183
- maxloops: string;
184
- trim: boolean;
185
- errname: string;
186
- id: string;
187
- }
188
- interface XmlActionScrollByCfg {
189
- type: string;
190
- subtype: string;
191
- height: string;
192
- unit: string;
193
- maxtimes: string;
194
- interval: string;
195
- gen: boolean;
196
- cap: boolean;
197
- datapage: string;
198
- popupsubtask: boolean;
199
- login: boolean;
200
- captcha: boolean;
201
- eurl: string;
202
- eloc: string;
203
- pn1: string;
204
- pv1: string;
205
- pn2: string;
206
- pv2: string;
207
- id: string;
208
- }
209
- interface XmlActionScrollIntoviewCfg {
210
- type: string;
211
- subtype: string;
212
- gen: boolean;
213
- cap: boolean;
214
- errname: string;
215
- datapage: string;
216
- popupsubtask: boolean;
217
- login: boolean;
218
- captcha: boolean;
219
- eurl: string;
220
- eloc: string;
221
- pn1: string;
222
- pv1: string;
223
- pn2: string;
224
- pv2: string;
225
- id: string;
226
- }
227
- interface XmlActionScrollToCfg {
228
- type: string;
229
- subtype: string;
230
- height: string;
231
- unit: string;
232
- gen: boolean;
233
- cap: boolean;
234
- datapage: string;
235
- popupsubtask: boolean;
236
- login: boolean;
237
- captcha: boolean;
238
- eurl: string;
239
- eloc: string;
240
- pn1: string;
241
- pv1: string;
242
- pn2: string;
243
- pv2: string;
244
- id: string;
245
- }
246
- interface XmlActionSelectCfg {
247
- type: string;
248
- selecttype: string;
249
- selectval: string;
250
- gen: boolean;
251
- cap: boolean;
252
- try: string;
253
- wait: string;
254
- errname: string;
255
- datapage: string;
256
- popupsubtask: boolean;
257
- login: boolean;
258
- captcha: boolean;
259
- eurl: string;
260
- eloc: string;
261
- pn1: string;
262
- pv1: string;
263
- pn2: string;
264
- pv2: string;
265
- id: string;
266
- }
267
- interface XmlActionSetvarDbqueryCfg {
268
- type: string;
269
- subtype: string;
270
- varname: string;
271
- defaultval: string;
272
- valerrname: string;
273
- pattern: string;
274
- flags: string;
275
- path: string;
276
- id: string;
277
- }
278
- interface XmlActionSetvarElementCfg {
279
- type: string;
280
- subtype: string;
281
- varname: string;
282
- defaultval: string;
283
- try: string;
284
- valerrname: string;
285
- pattern: string;
286
- flags: string;
287
- path: string;
288
- id: string;
289
- }
290
- interface XmlActionSetvarFileCfg {
291
- type: string;
292
- subtype: string;
293
- varname: string;
294
- defaultval: string;
295
- proxy: boolean;
296
- valerrname: string;
297
- pattern: string;
298
- flags: string;
299
- path: string;
300
- id: string;
301
- }
302
- interface XmlActionSetvarGetCfg {
303
- type: string;
304
- subtype: string;
305
- varname: string;
306
- defaultval: string;
307
- valerrname: string;
308
- pattern: string;
309
- flags: string;
310
- path: string;
311
- id: string;
312
- }
313
- interface XmlActionSetvarOcrCfg {
314
- type: string;
315
- subtype: string;
316
- varname: string;
317
- defaultval: string;
318
- valerrname: string;
319
- pattern: string;
320
- flags: string;
321
- path: string;
322
- id: string;
323
- }
324
- interface XmlActionSetvarSubtaskCfg {
325
- type: string;
326
- subtype: string;
327
- varname: string;
328
- defaultval: string;
329
- valerrname: string;
330
- pattern: string;
331
- flags: string;
332
- path: string;
333
- id: string;
334
- }
335
- interface XmlActionSetvarTemplstrCfg {
336
- type: string;
337
- subtype: string;
338
- varname: string;
339
- defaultval: string;
340
- valerrname: string;
341
- pattern: string;
342
- flags: string;
343
- path: string;
344
- id: string;
345
- }
346
- interface XmlActionSubtaskCfg {
347
- type: string;
348
- subtasks: string;
349
- sameasparent: boolean;
350
- id: string;
351
- }
352
- interface XmlActionWaitElementCfg {
353
- type: string;
354
- subtype: string;
355
- timeout: string;
356
- state: string;
357
- errname: string;
358
- wait: string;
359
- id: string;
360
- }
361
- interface XmlActionWaitNavigationCfg {
362
- type: string;
363
- subtype: string;
364
- timeout: string;
365
- waituntil: string;
366
- url: string;
367
- errname: string;
368
- wait: string;
369
- id: string;
370
- }
371
- interface XmlActionWaitSleepCfg {
372
- type: string;
373
- subtype: string;
374
- minms: string;
375
- maxms: string;
376
- errname: string;
377
- wait: string;
378
- id: string;
379
- }
380
- interface XmlFontsvgCfg {
381
- exloc: string;
382
- inloc: string;
383
- csmaptype: string;
384
- bsfilter: string;
385
- }
386
- interface XmlFontselectorCfg {
387
- name: string;
388
- fontfamily: string;
389
- }
390
- interface XmlFontfamilyCfg {
391
- name: string;
392
- fontcodes: string;
393
- fontchars: string;
394
- }
395
- interface XmlFontcodesCfg {
396
- name: string;
397
- codes: string;
398
- }
399
- interface XmlFontcharsCfg {
400
- name: string;
401
- chars: string;
402
- }
403
- type XmlActionConfig = XmlActionBreakCfg | XmlActionCaptchaCfg | XmlActionClickCfg | XmlActionContinueCfg | XmlActionMiscCfg | XmlActionExitCfg | XmlActionExtractCfg | XmlActionGotoCfg | XmlActionHoverCfg | XmlActionIfelseCfg | XmlActionInputCfg | XmlActionInterceptClearCfg | XmlActionInterceptSetCfg | XmlActionLoopdowhileElementCfg | XmlActionLoopdowhileTemplstrCfg | XmlActionLoopforCfg | XmlActionLoopinelesCfg | XmlActionLoopinstrCfg | XmlActionScrollByCfg | XmlActionScrollIntoviewCfg | XmlActionScrollToCfg | XmlActionSelectCfg | XmlActionSetvarDbqueryCfg | XmlActionSetvarElementCfg | XmlActionSetvarFileCfg | XmlActionSetvarGetCfg | XmlActionSetvarOcrCfg | XmlActionSetvarSubtaskCfg | XmlActionSetvarTemplstrCfg | XmlActionSubtaskCfg | XmlActionWaitElementCfg | XmlActionWaitNavigationCfg | XmlActionWaitSleepCfg;
404
-
405
- type TokenCaptchaType = "amazon" | "funcaptcha" | "geetest" | "keycaptcha" | "mtcaptcha" | "recaptcha" | "turnstile";
406
- type RecognitionCaptchaType = "text" | "coordinate" | "grid" | "slider" | "rotation";
407
- type CaptchaType = TokenCaptchaType | RecognitionCaptchaType;
408
-
409
5
  type TemplateId = number;
410
- type DomainId = number;
411
6
  type HttpHeaders = Record<string, string>;
412
7
  interface ScraperStateData extends BrowserStateData {
413
8
  /**
@@ -419,58 +14,6 @@ interface ScraperStateData extends BrowserStateData {
419
14
  */
420
15
  userData: Record<string, string>;
421
16
  }
422
- type InParas = Record<string, string>;
423
- interface FontttfConfig {
424
- exloc: string;
425
- inloc: string;
426
- minuc: number;
427
- maxuc: number;
428
- startidx: number;
429
- fsfilter: string;
430
- fufilter: string;
431
- parsetype: string;
432
- }
433
- /**
434
- * fonts config in xml
435
- * * length: 0 ~ 20, default []
436
- */
437
- interface FontsConfig {
438
- fontselectorConfig: Record<string, XmlFontselectorCfg>;
439
- fontfamilyConfig: Record<string, XmlFontfamilyCfg>;
440
- fontcodesConfig: Record<string, XmlFontcodesCfg>;
441
- fontcharsConfig: Record<string, XmlFontcharsCfg>;
442
- fontsvgCfg?: XmlFontsvgCfg;
443
- fontttfConfig?: FontttfConfig;
444
- }
445
- type ElementSource = "browser" | "cheerio";
446
- interface TemplateInScraper {
447
- templateId: TemplateId;
448
- domainId: DomainId;
449
- /**
450
- * @default "browser"
451
- */
452
- defaultElementSource: ElementSource;
453
- /**
454
- * @default 600 seconds
455
- */
456
- maxExecutionDuration: number;
457
- configDetail: string;
458
- capName?: string;
459
- }
460
- type AttrsInXml = Record<string, string>;
461
- type DatatableColumnMap = Map<string, string>;
462
- interface ParsedTemplate {
463
- actionConfigs: XmlActionConfig[];
464
- paraCfgs: XmlParaCfg[];
465
- fontsConfig: FontsConfig | null;
466
- attrsInXml: AttrsInXml;
467
- captchaTypes: CaptchaType[];
468
- lastUsedTime: number;
469
- lastCheckTime: number;
470
- datatableMap: Map<string, DatatableColumnMap> | null;
471
- template?: TemplateInScraper;
472
- }
473
- type ParsedTemplateExt = Required<ParsedTemplate>;
474
17
  /**
475
18
  * Network context used to execute the task
476
19
  */
@@ -592,6 +135,11 @@ interface ScraperConfig {
592
135
  * @default false
593
136
  */
594
137
  loadUnfinishedTasks?: boolean;
138
+ /**
139
+ * unit: minutes
140
+ * @default 0
141
+ */
142
+ loadFailedTasksInterval?: number;
595
143
  /**
596
144
  * @default "", which will use current directory of process + "/data/"
597
145
  * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
@@ -688,54 +236,9 @@ interface ScraperConfig {
688
236
 
689
237
  declare function setScraperLogFun(logFun: LogFunction): boolean;
690
238
 
691
- /** 修改node_modules/xml2js/lib/parser.js文件,添加如下内容(根据tagName自动添加type和subtype属性,如action_setvar_element添加 type="setvar" subtype="element"):
692
- //////// start of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
693
- // to be able to add attributes here and later(defaultElementCfg.js), set obj[attrkey] if undefined
694
- if(!obj[attrkey]){
695
- obj[attrkey] = {}
696
- }
697
- const subTags = node.name.split("_")
698
- if(subTags.length > 1 && typeof obj[attrkey]["type"] ==="undefined"){
699
- obj[attrkey]["type"] = subTags[1]
700
- }
701
- if(subTags.length > 2 && typeof obj[attrkey]["subtype"] ==="undefined"){
702
- obj[attrkey]["subtype"] = subTags[2]
703
- }
704
- //////// end of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
705
-
706
- obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
707
- */
708
- /**
709
- * TaskParser
710
- */
711
- declare class TaskParser {
712
- #private;
713
- /**
714
- *
715
- * @param xmlStr
716
- * @param defaultCfgFlag
717
- * @returns {$$:{id, version}, children: {paras: [...], depends: [...], actions: [...]}}}
718
- */
719
- static convertXmlToJson(xmlStr: string, defaultCfgFlag?: boolean): Promise<any>;
720
- static getPartOfJsonCfg(jsonCfg: any, partName: string, optional?: boolean): any;
721
- static getParaCfgsFromJsonCfg(jsonCfg: any): XmlParaCfg[];
722
- static getAttrsInXml(jsonCfg: any): AttrsInXml;
723
- static getCaptchTypes(jsonCfg: any): CaptchaType[];
724
- static getDatableMapFromJsonCfg(jsonCfg: any): Map<string, DatatableColumnMap> | null;
725
- static getInParas(parasStr: string, paraCfgs?: XmlParaCfg[], splitStr?: string): InParas;
726
- static convertExecData(origExecData: ExecData, datatableMap: Map<string, DatatableColumnMap> | null): ExecData;
727
- }
728
-
729
- declare class TemplateManagerInScraper {
730
- #private;
731
- static parseXmlTemplate(xmlStr: string, datatableFlag?: boolean): Promise<ParsedTemplate>;
732
- static getTemplateConfig(templateId: number, xmlStr?: string): Promise<ParsedTemplateExt>;
733
- static clearTemplateConfig(templateId?: number): boolean;
734
- }
735
-
736
239
  declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
737
240
 
738
241
  declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
739
242
  declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
740
243
 
741
- export { type AttrsInXml, type BrowserConfig, type ExecData, type ParsedTemplate, type ScraperConfig, TaskParser, TemplateManagerInScraper, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };
244
+ export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };