@letsscrapedata/scraper 0.0.74 → 0.0.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2,7 +2,412 @@ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOpti
2
2
  import { Proxy } from '@letsscrapedata/proxy';
3
3
  import { LogFunction } from '@letsscrapedata/utils';
4
4
 
5
+ interface XmlParaCfg {
6
+ paraname: string;
7
+ name: string;
8
+ desc: string;
9
+ uitype: string;
10
+ defval: string;
11
+ listid: string;
12
+ listparas: string;
13
+ min: string;
14
+ max: string;
15
+ pattern: string;
16
+ alert: string;
17
+ }
18
+ interface XmlActionBreakCfg {
19
+ type: string;
20
+ id: string;
21
+ }
22
+ interface XmlActionCaptchaCfg {
23
+ type: string;
24
+ try: string;
25
+ errname: string;
26
+ id: string;
27
+ }
28
+ interface XmlActionClickCfg {
29
+ type: string;
30
+ wait: string;
31
+ gen: boolean;
32
+ cap: boolean;
33
+ clicktype: string;
34
+ try: string;
35
+ errname: string;
36
+ datapage: string;
37
+ popupsubtask: boolean;
38
+ login: boolean;
39
+ captcha: boolean;
40
+ navigate: boolean;
41
+ eurl: string;
42
+ eloc: string;
43
+ pn1: string;
44
+ pv1: string;
45
+ pn2: string;
46
+ pv2: string;
47
+ downloadpath: string;
48
+ filename: string;
49
+ pathvarname: string;
50
+ id: string;
51
+ }
52
+ interface XmlActionContinueCfg {
53
+ type: string;
54
+ id: string;
55
+ }
56
+ interface XmlActionMiscCfg {
57
+ type: string;
58
+ id: string;
59
+ }
60
+ interface XmlActionExitCfg {
61
+ type: string;
62
+ errname: string;
63
+ id: string;
64
+ }
65
+ interface XmlActionExtractCfg {
66
+ type: string;
67
+ tabname: string;
68
+ id: string;
69
+ }
70
+ interface XmlActionGotoCfg {
71
+ type: string;
72
+ url: string;
73
+ reuse: boolean;
74
+ wait: string;
75
+ encodeuri: boolean;
76
+ gen: boolean;
77
+ cap: boolean;
78
+ datapage: string;
79
+ popupsubtask: boolean;
80
+ login: boolean;
81
+ captcha: boolean;
82
+ eurl: string;
83
+ eloc: string;
84
+ pn1: string;
85
+ pv1: string;
86
+ pn2: string;
87
+ pv2: string;
88
+ headers: string;
89
+ referer: string;
90
+ id: string;
91
+ }
92
+ interface XmlActionHoverCfg {
93
+ type: string;
94
+ try: string;
95
+ gen: boolean;
96
+ cap: boolean;
97
+ wait: string;
98
+ errname: string;
99
+ id: string;
100
+ }
101
+ interface XmlActionIfelseCfg {
102
+ type: string;
103
+ id: string;
104
+ }
105
+ interface XmlActionInputCfg {
106
+ type: string;
107
+ content: string;
108
+ enter: boolean;
109
+ replace: boolean;
110
+ gen: boolean;
111
+ cap: boolean;
112
+ try: string;
113
+ wait: string;
114
+ errname: string;
115
+ datapage: string;
116
+ popupsubtask: boolean;
117
+ login: boolean;
118
+ captcha: boolean;
119
+ eurl: string;
120
+ eloc: string;
121
+ pn1: string;
122
+ pv1: string;
123
+ pn2: string;
124
+ pv2: string;
125
+ id: string;
126
+ }
127
+ interface XmlActionInterceptClearCfg {
128
+ type: string;
129
+ subtype: string;
130
+ }
131
+ interface XmlActionInterceptSetCfg {
132
+ type: string;
133
+ subtype: string;
134
+ }
135
+ interface XmlActionLoopdowhileElementCfg {
136
+ type: string;
137
+ subtype: string;
138
+ iswhile: boolean;
139
+ varname: string;
140
+ maxloops: string;
141
+ click: boolean;
142
+ navigate: boolean;
143
+ gen: boolean;
144
+ cap: boolean;
145
+ errname: string;
146
+ wait: string;
147
+ id: string;
148
+ }
149
+ interface XmlActionLoopdowhileTemplstrCfg {
150
+ type: string;
151
+ subtype: string;
152
+ iswhile: boolean;
153
+ varname: string;
154
+ maxloops: string;
155
+ id: string;
156
+ }
157
+ interface XmlActionLoopforCfg {
158
+ type: string;
159
+ from: string;
160
+ to: string;
161
+ step: string;
162
+ roundtype: string;
163
+ varname: string;
164
+ maxloops: string;
165
+ errname: string;
166
+ id: string;
167
+ }
168
+ interface XmlActionLoopinelesCfg {
169
+ type: string;
170
+ varname: string;
171
+ maxloops: string;
172
+ start: string;
173
+ end: string;
174
+ step: string;
175
+ errname: string;
176
+ id: string;
177
+ }
178
+ interface XmlActionLoopinstrCfg {
179
+ type: string;
180
+ list: string;
181
+ split: string;
182
+ varname: string;
183
+ maxloops: string;
184
+ trim: boolean;
185
+ errname: string;
186
+ id: string;
187
+ }
188
+ interface XmlActionScrollByCfg {
189
+ type: string;
190
+ subtype: string;
191
+ height: string;
192
+ unit: string;
193
+ maxtimes: string;
194
+ interval: string;
195
+ gen: boolean;
196
+ cap: boolean;
197
+ datapage: string;
198
+ popupsubtask: boolean;
199
+ login: boolean;
200
+ captcha: boolean;
201
+ eurl: string;
202
+ eloc: string;
203
+ pn1: string;
204
+ pv1: string;
205
+ pn2: string;
206
+ pv2: string;
207
+ id: string;
208
+ }
209
+ interface XmlActionScrollIntoviewCfg {
210
+ type: string;
211
+ subtype: string;
212
+ gen: boolean;
213
+ cap: boolean;
214
+ errname: string;
215
+ datapage: string;
216
+ popupsubtask: boolean;
217
+ login: boolean;
218
+ captcha: boolean;
219
+ eurl: string;
220
+ eloc: string;
221
+ pn1: string;
222
+ pv1: string;
223
+ pn2: string;
224
+ pv2: string;
225
+ id: string;
226
+ }
227
+ interface XmlActionScrollToCfg {
228
+ type: string;
229
+ subtype: string;
230
+ height: string;
231
+ unit: string;
232
+ gen: boolean;
233
+ cap: boolean;
234
+ datapage: string;
235
+ popupsubtask: boolean;
236
+ login: boolean;
237
+ captcha: boolean;
238
+ eurl: string;
239
+ eloc: string;
240
+ pn1: string;
241
+ pv1: string;
242
+ pn2: string;
243
+ pv2: string;
244
+ id: string;
245
+ }
246
+ interface XmlActionSelectCfg {
247
+ type: string;
248
+ selecttype: string;
249
+ selectval: string;
250
+ gen: boolean;
251
+ cap: boolean;
252
+ try: string;
253
+ wait: string;
254
+ errname: string;
255
+ datapage: string;
256
+ popupsubtask: boolean;
257
+ login: boolean;
258
+ captcha: boolean;
259
+ eurl: string;
260
+ eloc: string;
261
+ pn1: string;
262
+ pv1: string;
263
+ pn2: string;
264
+ pv2: string;
265
+ id: string;
266
+ }
267
+ interface XmlActionSetvarDbqueryCfg {
268
+ type: string;
269
+ subtype: string;
270
+ varname: string;
271
+ defaultval: string;
272
+ valerrname: string;
273
+ pattern: string;
274
+ flags: string;
275
+ path: string;
276
+ id: string;
277
+ }
278
+ interface XmlActionSetvarElementCfg {
279
+ type: string;
280
+ subtype: string;
281
+ varname: string;
282
+ defaultval: string;
283
+ try: string;
284
+ valerrname: string;
285
+ pattern: string;
286
+ flags: string;
287
+ path: string;
288
+ id: string;
289
+ }
290
+ interface XmlActionSetvarFileCfg {
291
+ type: string;
292
+ subtype: string;
293
+ varname: string;
294
+ defaultval: string;
295
+ proxy: boolean;
296
+ valerrname: string;
297
+ pattern: string;
298
+ flags: string;
299
+ path: string;
300
+ id: string;
301
+ }
302
+ interface XmlActionSetvarGetCfg {
303
+ type: string;
304
+ subtype: string;
305
+ varname: string;
306
+ defaultval: string;
307
+ valerrname: string;
308
+ pattern: string;
309
+ flags: string;
310
+ path: string;
311
+ id: string;
312
+ }
313
+ interface XmlActionSetvarOcrCfg {
314
+ type: string;
315
+ subtype: string;
316
+ varname: string;
317
+ defaultval: string;
318
+ valerrname: string;
319
+ pattern: string;
320
+ flags: string;
321
+ path: string;
322
+ id: string;
323
+ }
324
+ interface XmlActionSetvarSubtaskCfg {
325
+ type: string;
326
+ subtype: string;
327
+ varname: string;
328
+ defaultval: string;
329
+ valerrname: string;
330
+ pattern: string;
331
+ flags: string;
332
+ path: string;
333
+ id: string;
334
+ }
335
+ interface XmlActionSetvarTemplstrCfg {
336
+ type: string;
337
+ subtype: string;
338
+ varname: string;
339
+ defaultval: string;
340
+ valerrname: string;
341
+ pattern: string;
342
+ flags: string;
343
+ path: string;
344
+ id: string;
345
+ }
346
+ interface XmlActionSubtaskCfg {
347
+ type: string;
348
+ subtasks: string;
349
+ sameasparent: boolean;
350
+ id: string;
351
+ }
352
+ interface XmlActionWaitElementCfg {
353
+ type: string;
354
+ subtype: string;
355
+ timeout: string;
356
+ state: string;
357
+ errname: string;
358
+ wait: string;
359
+ id: string;
360
+ }
361
+ interface XmlActionWaitNavigationCfg {
362
+ type: string;
363
+ subtype: string;
364
+ timeout: string;
365
+ waituntil: string;
366
+ url: string;
367
+ errname: string;
368
+ wait: string;
369
+ id: string;
370
+ }
371
+ interface XmlActionWaitSleepCfg {
372
+ type: string;
373
+ subtype: string;
374
+ minms: string;
375
+ maxms: string;
376
+ errname: string;
377
+ wait: string;
378
+ id: string;
379
+ }
380
+ interface XmlFontsvgCfg {
381
+ exloc: string;
382
+ inloc: string;
383
+ csmaptype: string;
384
+ bsfilter: string;
385
+ }
386
+ interface XmlFontselectorCfg {
387
+ name: string;
388
+ fontfamily: string;
389
+ }
390
+ interface XmlFontfamilyCfg {
391
+ name: string;
392
+ fontcodes: string;
393
+ fontchars: string;
394
+ }
395
+ interface XmlFontcodesCfg {
396
+ name: string;
397
+ codes: string;
398
+ }
399
+ interface XmlFontcharsCfg {
400
+ name: string;
401
+ chars: string;
402
+ }
403
+ type XmlActionConfig = XmlActionBreakCfg | XmlActionCaptchaCfg | XmlActionClickCfg | XmlActionContinueCfg | XmlActionMiscCfg | XmlActionExitCfg | XmlActionExtractCfg | XmlActionGotoCfg | XmlActionHoverCfg | XmlActionIfelseCfg | XmlActionInputCfg | XmlActionInterceptClearCfg | XmlActionInterceptSetCfg | XmlActionLoopdowhileElementCfg | XmlActionLoopdowhileTemplstrCfg | XmlActionLoopforCfg | XmlActionLoopinelesCfg | XmlActionLoopinstrCfg | XmlActionScrollByCfg | XmlActionScrollIntoviewCfg | XmlActionScrollToCfg | XmlActionSelectCfg | XmlActionSetvarDbqueryCfg | XmlActionSetvarElementCfg | XmlActionSetvarFileCfg | XmlActionSetvarGetCfg | XmlActionSetvarOcrCfg | XmlActionSetvarSubtaskCfg | XmlActionSetvarTemplstrCfg | XmlActionSubtaskCfg | XmlActionWaitElementCfg | XmlActionWaitNavigationCfg | XmlActionWaitSleepCfg;
404
+
405
+ type TokenCaptchaType = "amazon" | "funcaptcha" | "geetest" | "keycaptcha" | "mtcaptcha" | "recaptcha" | "turnstile";
406
+ type RecognitionCaptchaType = "text" | "coordinate" | "grid" | "slider" | "rotation";
407
+ type CaptchaType = TokenCaptchaType | RecognitionCaptchaType;
408
+
5
409
  type TemplateId = number;
410
+ type DomainId = number;
6
411
  type HttpHeaders = Record<string, string>;
7
412
  interface ScraperStateData extends BrowserStateData {
8
413
  /**
@@ -14,6 +419,58 @@ interface ScraperStateData extends BrowserStateData {
14
419
  */
15
420
  userData: Record<string, string>;
16
421
  }
422
+ type InParas = Record<string, string>;
423
+ interface FontttfConfig {
424
+ exloc: string;
425
+ inloc: string;
426
+ minuc: number;
427
+ maxuc: number;
428
+ startidx: number;
429
+ fsfilter: string;
430
+ fufilter: string;
431
+ parsetype: string;
432
+ }
433
+ /**
434
+ * fonts config in xml
435
+ * * length: 0 ~ 20, default []
436
+ */
437
+ interface FontsConfig {
438
+ fontselectorConfig: Record<string, XmlFontselectorCfg>;
439
+ fontfamilyConfig: Record<string, XmlFontfamilyCfg>;
440
+ fontcodesConfig: Record<string, XmlFontcodesCfg>;
441
+ fontcharsConfig: Record<string, XmlFontcharsCfg>;
442
+ fontsvgCfg?: XmlFontsvgCfg;
443
+ fontttfConfig?: FontttfConfig;
444
+ }
445
+ type ElementSource = "browser" | "cheerio";
446
+ interface TemplateInScraper {
447
+ templateId: TemplateId;
448
+ domainId: DomainId;
449
+ /**
450
+ * @default "browser"
451
+ */
452
+ defaultElementSource: ElementSource;
453
+ /**
454
+ * @default 600 seconds
455
+ */
456
+ maxExecutionDuration: number;
457
+ configDetail: string;
458
+ capName?: string;
459
+ }
460
+ type AttrsInXml = Record<string, string>;
461
+ type DatatableColumnMap = Map<string, string>;
462
+ interface ParsedTemplate {
463
+ actionConfigs: XmlActionConfig[];
464
+ paraCfgs: XmlParaCfg[];
465
+ fontsConfig: FontsConfig | null;
466
+ attrsInXml: AttrsInXml;
467
+ captchaTypes: CaptchaType[];
468
+ lastUsedTime: number;
469
+ lastCheckTime: number;
470
+ datatableMap: Map<string, DatatableColumnMap> | null;
471
+ template?: TemplateInScraper;
472
+ }
473
+ type ParsedTemplateExt = Required<ParsedTemplate>;
17
474
  /**
18
475
  * Network context used to execute the task
19
476
  */
@@ -184,7 +641,7 @@ interface ScraperConfig {
184
641
  */
185
642
  clientKey: string;
186
643
  };
187
- templateUrl?: string;
644
+ urlPrefix?: string;
188
645
  /**
189
646
  * the default maximum number of concurrent tasks that can execute the same template in a browserContext
190
647
  * @default 1
@@ -231,9 +688,54 @@ interface ScraperConfig {
231
688
 
232
689
  declare function setScraperLogFun(logFun: LogFunction): boolean;
233
690
 
691
+ /** Modify node_modules/xml2js/lib/parser.js by adding the following code (it automatically adds the type and subtype attributes based on the tag name, e.g. action_setvar_element gets type="setvar" subtype="element"):
692
+ //////// start of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
693
+ // to be able to add attributes here and later(defaultElementCfg.js), set obj[attrkey] if undefined
694
+ if(!obj[attrkey]){
695
+ obj[attrkey] = {}
696
+ }
697
+ const subTags = node.name.split("_")
698
+ if(subTags.length > 1 && typeof obj[attrkey]["type"] ==="undefined"){
699
+ obj[attrkey]["type"] = subTags[1]
700
+ }
701
+ if(subTags.length > 2 && typeof obj[attrkey]["subtype"] ==="undefined"){
702
+ obj[attrkey]["subtype"] = subTags[2]
703
+ }
704
+ //////// end of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
705
+
706
+ obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
707
+ */
708
+ /**
709
+ * TaskParser
710
+ */
711
+ declare class TaskParser {
712
+ #private;
713
+ /**
714
+ *
715
+ * @param xmlStr
716
+ * @param defaultCfgFlag
717
+ * @returns {$$:{id, version}, children: {paras: [...], depends: [...], actions: [...]}}
718
+ */
719
+ static convertXmlToJson(xmlStr: string, defaultCfgFlag?: boolean): Promise<any>;
720
+ static getPartOfJsonCfg(jsonCfg: any, partName: string, optional?: boolean): any;
721
+ static getParaCfgsFromJsonCfg(jsonCfg: any): XmlParaCfg[];
722
+ static getAttrsInXml(jsonCfg: any): AttrsInXml;
723
+ static getCaptchTypes(jsonCfg: any): CaptchaType[];
724
+ static getDatableMapFromJsonCfg(jsonCfg: any): Map<string, DatatableColumnMap> | null;
725
+ static getInParas(parasStr: string, paraCfgs?: XmlParaCfg[], splitStr?: string): InParas;
726
+ static convertExecData(origExecData: ExecData, datatableMap: Map<string, DatatableColumnMap> | null): ExecData;
727
+ }
728
+
729
+ declare class TemplateManagerInScraper {
730
+ #private;
731
+ static parseXmlTemplate(xmlStr: string, datatableFlag?: boolean): Promise<ParsedTemplate>;
732
+ static getTemplateConfig(templateId: number, xmlStr?: string): Promise<ParsedTemplateExt>;
733
+ static clearTemplateConfig(templateId?: number): boolean;
734
+ }
735
+
234
736
  declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
235
737
 
236
738
  declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
237
739
  declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
238
740
 
239
- export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };
741
+ export { type AttrsInXml, type BrowserConfig, type ExecData, type ParsedTemplate, type ScraperConfig, TaskParser, TemplateManagerInScraper, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };