@letsscrapedata/scraper 0.0.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
2
+ import { Proxy } from '@letsscrapedata/proxy';
3
+ import { LogFunction } from '@letsscrapedata/utils';
4
+
5
+ type TemplateId = number;
6
+ type HttpHeaders = Record<string, string>;
7
+ interface ScraperStateData extends BrowserStateData {
8
+ /**
9
+ * @default {}
10
+ */
11
+ headers: HttpHeaders;
12
+ /**
13
+ * @default {}
14
+ */
15
+ userData: Record<string, string>;
16
+ }
17
+ /**
18
+ * Network context used to execute the task
19
+ */
20
+ interface TaskNetworkContext {
21
+ /**
22
+ * Proxy that is used to access the target website:
23
+ * * null only when domainId is 0, which means no network resources are required to execute the task
24
+ * @default null
25
+ */
26
+ proxy: Proxy | null;
27
+ /**
28
+ * browser page that is used to open web pages when executing the task
29
+ * * null when domainNum is less than 0, which means no browser page is required to execute the task
30
+ * @default null
31
+ */
32
+ page: LsdPage | null;
33
+ /**
34
+ * LsdApiContext that shares the state data within the same browser context:
35
+ * * null only when domainId is 0, which means no network resources are required to execute the task
36
+ * @default null
37
+ */
38
+ browserApiContext: LsdApiContext | null;
39
+ /**
40
+ * Standalone LsdApiContext that shares the state data between the tasks that use the same standalone LsdApiContext:
41
+ * * null only when domainId is 0, which means no network resources are required to execute the task
42
+ * * it is not recommended to use this context unless you never use a browser to access web pages.
43
+ * @default null
44
+ */
45
+ standaloneApiContext: LsdApiContext | null;
46
+ }
47
+ type DataRecord = Record<string, string>;
48
+ type ExecData = Record<string, DataRecord[]>;
49
+ interface Subtask {
50
+ tid: number;
51
+ parasstr: string;
52
+ idx?: number;
53
+ sapFlag?: boolean;
54
+ }
55
+ interface TaskMisc {
56
+ taskId: number;
57
+ message: string;
58
+ stack: string;
59
+ variables: Record<string, string>;
60
+ }
61
+ interface TaskData {
62
+ templateId: TemplateId;
63
+ parasStr: string;
64
+ credits: number;
65
+ execData: ExecData;
66
+ subtasks: Subtask[];
67
+ }
68
+ interface TaskResult {
69
+ taskData: TaskData;
70
+ subtaskDatas?: TaskData[];
71
+ /**
72
+ * included if credits >= 0
73
+ */
74
+ newStateData?: ScraperStateData;
75
+ /**
76
+ * included if credits < 0
77
+ */
78
+ misc?: TaskMisc;
79
+ }
80
+ type TaskType = "indAsync" | "indSync" | "memSync";
81
+ interface TemplateTasks {
82
+ tid: number;
83
+ parasstrs: string[];
84
+ }
85
+ /**
86
+ * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
87
+ */
88
+ interface BrowserConfig {
89
+ browserControllerType?: BrowserControllerType;
90
+ /**
91
+ * url used to connect to the current browser
92
+ ** url starts with "http://", such as "http://localhost:9222/"
93
+ ** browserUrl can be used when logging in manually in advance.
94
+ */
95
+ browserUrl?: string;
96
+ /**
97
+ * proxy
98
+ ** no proxy will be used if proxyUrl is ""
99
+ ** valid only if !browserUrl
100
+ */
101
+ proxyUrl?: string;
102
+ /**
103
+ * type of browser to be launched
104
+ * valid only if !browserUrl
105
+ * @default "chromium"
106
+ */
107
+ browserType?: LsdBrowserType;
108
+ }
109
+ interface TemplatePara {
110
+ templateId: number;
111
+ /**
112
+ * code for reading or getting the template
113
+ * @default "" - only public templates can be got if readCode is ""
114
+ */
115
+ readCode?: string;
116
+ /**
117
+ * the maximum number of concurrent tasks that can execute the same template in a browserContext
118
+ * @default 1
119
+ */
120
+ maxConncurrency?: number;
121
+ }
122
+ type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
123
+ interface ScraperConfig {
124
+ /**
125
+ * @default false
126
+ */
127
+ exitWhenCompleted?: boolean;
128
+ /**
129
+ * whether to use the parasstr in XML if parasstr of a task is ""
130
+ * @default false
131
+ */
132
+ useParasstrInXmlIfNeeded?: boolean;
133
+ /**
134
+ * whether to load unfinished tasks
135
+ * @default false
136
+ */
137
+ loadUnfinishedTasks?: boolean;
138
+ /**
139
+ * @default "", which will use current directory of process + "/data/"
140
+ * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
141
+ */
142
+ baseDir?: string;
143
+ /**
144
+ * where are the templates saved
145
+ * @default "", which means to get the templates from LSD server
146
+ */
147
+ templateDir?: string;
148
+ /**
149
+ * filename in action_setvar_get/get_file must include inputFileDirPart for security.
150
+ * @default "LetsScrapeData"
151
+ */
152
+ inputFileDirPart?: string;
153
+ /**
154
+ * whether to use puppeteer-extra-plugin-stealth; use patchright instead
155
+ * @default false
156
+ */
157
+ useStealthPlugin?: boolean;
158
+ /**
159
+ * default browserControllerType of BrowserConfig
160
+ * @default "patchright"
161
+ */
162
+ browserControllerType?: BrowserControllerType;
163
+ /**
164
+ * default browserType of BrowserConfig
165
+ * @default "chromium"
166
+ */
167
+ browserType?: LsdBrowserType;
168
+ /**
169
+ * @default { headless: false }
170
+ */
171
+ lsdLaunchOptions?: LsdLaunchOptions;
172
+ /**
173
+ * @default {browserUrl: ""}
174
+ */
175
+ lsdConnectOptions?: LsdConnectOptions;
176
+ /**
177
+ * Important: browsers to be launched or connected using proxyUrl
178
+ * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
179
+ */
180
+ browserConfigs?: BrowserConfig[];
181
+ captcha?: {
182
+ /**
183
+ * clientKey of 2captcha
184
+ */
185
+ clientKey: string;
186
+ };
187
+ templateUrl?: string;
188
+ /**
189
+ * the default maximum number of concurrent tasks that can execute the same template in a browserContext
190
+ * @default 1
191
+ */
192
+ maxConcurrency?: number;
193
+ /**
194
+ * @default ""
195
+ */
196
+ readCode?: string;
197
+ /**
198
+ * @default []
199
+ */
200
+ templateParas?: TemplatePara[];
201
+ /**
202
+ * @default 10
203
+ */
204
+ totalMaxConcurrency?: number;
205
+ /**
206
+ * minimum milliseconds between two tasks of the same template
207
+ * @default 2000
208
+ */
209
+ minMiliseconds?: number;
210
+ /**
211
+ * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
212
+ * @default false
213
+ */
214
+ moveDataWhenStart?: boolean;
215
+ /**
216
+ ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
217
+ * @default "jsonl"
218
+ */
219
+ dataFileFormat?: DataFileFormat;
220
+ /**
221
+ * valid only when dataFileFormat is "jsonl"
222
+ * @default true
223
+ */
224
+ useNickName?: boolean;
225
+ /**
226
+ * valid only when dataFileFormat is "txt"
227
+ * @default "::"
228
+ */
229
+ columnSeperator?: string;
230
+ }
231
+
232
+ declare function setScraperLogFun(logFun: LogFunction): boolean;
233
+
234
+ declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
235
+
236
+ declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
237
+ declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
238
+
239
+ export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };
@@ -0,0 +1,239 @@
1
+ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
2
+ import { Proxy } from '@letsscrapedata/proxy';
3
+ import { LogFunction } from '@letsscrapedata/utils';
4
+
5
+ type TemplateId = number;
6
+ type HttpHeaders = Record<string, string>;
7
+ interface ScraperStateData extends BrowserStateData {
8
+ /**
9
+ * @default {}
10
+ */
11
+ headers: HttpHeaders;
12
+ /**
13
+ * @default {}
14
+ */
15
+ userData: Record<string, string>;
16
+ }
17
+ /**
18
+ * Network context used to execute the task
19
+ */
20
+ interface TaskNetworkContext {
21
+ /**
22
+ * Proxy that is used to access the target website:
23
+ * * null only when domainId is 0, which means no network resources are required to execute the task
24
+ * @default null
25
+ */
26
+ proxy: Proxy | null;
27
+ /**
28
+ * browser page that is used to open web pages when executing the task
29
+ * * null when domainNum is less than 0, which means no browser page is required to execute the task
30
+ * @default null
31
+ */
32
+ page: LsdPage | null;
33
+ /**
34
+ * LsdApiContext that shares the state data within the same browser context:
35
+ * * null only when domainId is 0, which means no network resources are required to execute the task
36
+ * @default null
37
+ */
38
+ browserApiContext: LsdApiContext | null;
39
+ /**
40
+ * Standalone LsdApiContext that shares the state data between the tasks that use the same standalone LsdApiContext:
41
+ * * null only when domainId is 0, which means no network resources are required to execute the task
42
+ * * it is not recommended to use this context unless you never use a browser to access web pages.
43
+ * @default null
44
+ */
45
+ standaloneApiContext: LsdApiContext | null;
46
+ }
47
+ type DataRecord = Record<string, string>;
48
+ type ExecData = Record<string, DataRecord[]>;
49
+ interface Subtask {
50
+ tid: number;
51
+ parasstr: string;
52
+ idx?: number;
53
+ sapFlag?: boolean;
54
+ }
55
+ interface TaskMisc {
56
+ taskId: number;
57
+ message: string;
58
+ stack: string;
59
+ variables: Record<string, string>;
60
+ }
61
+ interface TaskData {
62
+ templateId: TemplateId;
63
+ parasStr: string;
64
+ credits: number;
65
+ execData: ExecData;
66
+ subtasks: Subtask[];
67
+ }
68
+ interface TaskResult {
69
+ taskData: TaskData;
70
+ subtaskDatas?: TaskData[];
71
+ /**
72
+ * included if credits >= 0
73
+ */
74
+ newStateData?: ScraperStateData;
75
+ /**
76
+ * included if credits < 0
77
+ */
78
+ misc?: TaskMisc;
79
+ }
80
+ type TaskType = "indAsync" | "indSync" | "memSync";
81
+ interface TemplateTasks {
82
+ tid: number;
83
+ parasstrs: string[];
84
+ }
85
+ /**
86
+ * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
87
+ */
88
+ interface BrowserConfig {
89
+ browserControllerType?: BrowserControllerType;
90
+ /**
91
+ * url used to connect to the current browser
92
+ ** url starts with "http://", such as "http://localhost:9222/"
93
+ ** browserUrl can be used when logging in manually in advance.
94
+ */
95
+ browserUrl?: string;
96
+ /**
97
+ * proxy
98
+ ** no proxy will be used if proxyUrl is ""
99
+ ** valid only if !browserUrl
100
+ */
101
+ proxyUrl?: string;
102
+ /**
103
+ * type of browser to be launched
104
+ * valid only if !browserUrl
105
+ * @default "chromium"
106
+ */
107
+ browserType?: LsdBrowserType;
108
+ }
109
+ interface TemplatePara {
110
+ templateId: number;
111
+ /**
112
+ * code for reading or getting the template
113
+ * @default "" - only public templates can be got if readCode is ""
114
+ */
115
+ readCode?: string;
116
+ /**
117
+ * the maximum number of concurrent tasks that can execute the same template in a browserContext
118
+ * @default 1
119
+ */
120
+ maxConncurrency?: number;
121
+ }
122
+ type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
123
+ interface ScraperConfig {
124
+ /**
125
+ * @default false
126
+ */
127
+ exitWhenCompleted?: boolean;
128
+ /**
129
+ * whether to use the parasstr in XML if parasstr of a task is ""
130
+ * @default false
131
+ */
132
+ useParasstrInXmlIfNeeded?: boolean;
133
+ /**
134
+ * whether to load unfinished tasks
135
+ * @default false
136
+ */
137
+ loadUnfinishedTasks?: boolean;
138
+ /**
139
+ * @default "", which will use current directory of process + "/data/"
140
+ * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
141
+ */
142
+ baseDir?: string;
143
+ /**
144
+ * where are the templates saved
145
+ * @default "", which means to get the templates from LSD server
146
+ */
147
+ templateDir?: string;
148
+ /**
149
+ * filename in action_setvar_get/get_file must include inputFileDirPart for security.
150
+ * @default "LetsScrapeData"
151
+ */
152
+ inputFileDirPart?: string;
153
+ /**
154
+ * whether to use puppeteer-extra-plugin-stealth; use patchright instead
155
+ * @default false
156
+ */
157
+ useStealthPlugin?: boolean;
158
+ /**
159
+ * default browserControllerType of BrowserConfig
160
+ * @default "patchright"
161
+ */
162
+ browserControllerType?: BrowserControllerType;
163
+ /**
164
+ * default browserType of BrowserConfig
165
+ * @default "chromium"
166
+ */
167
+ browserType?: LsdBrowserType;
168
+ /**
169
+ * @default { headless: false }
170
+ */
171
+ lsdLaunchOptions?: LsdLaunchOptions;
172
+ /**
173
+ * @default {browserUrl: ""}
174
+ */
175
+ lsdConnectOptions?: LsdConnectOptions;
176
+ /**
177
+ * Important: browsers to be launched or connected using proxyUrl
178
+ * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
179
+ */
180
+ browserConfigs?: BrowserConfig[];
181
+ captcha?: {
182
+ /**
183
+ * clientKey of 2captcha
184
+ */
185
+ clientKey: string;
186
+ };
187
+ templateUrl?: string;
188
+ /**
189
+ * the default maximum number of concurrent tasks that can execute the same template in a browserContext
190
+ * @default 1
191
+ */
192
+ maxConcurrency?: number;
193
+ /**
194
+ * @default ""
195
+ */
196
+ readCode?: string;
197
+ /**
198
+ * @default []
199
+ */
200
+ templateParas?: TemplatePara[];
201
+ /**
202
+ * @default 10
203
+ */
204
+ totalMaxConcurrency?: number;
205
+ /**
206
+ * minimum milliseconds between two tasks of the same template
207
+ * @default 2000
208
+ */
209
+ minMiliseconds?: number;
210
+ /**
211
+ * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
212
+ * @default false
213
+ */
214
+ moveDataWhenStart?: boolean;
215
+ /**
216
+ ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
217
+ * @default "jsonl"
218
+ */
219
+ dataFileFormat?: DataFileFormat;
220
+ /**
221
+ * valid only when dataFileFormat is "jsonl"
222
+ * @default true
223
+ */
224
+ useNickName?: boolean;
225
+ /**
226
+ * valid only when dataFileFormat is "txt"
227
+ * @default "::"
228
+ */
229
+ columnSeperator?: string;
230
+ }
231
+
232
+ declare function setScraperLogFun(logFun: LogFunction): boolean;
233
+
234
+ declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
235
+
236
+ declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
237
+ declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
238
+
239
+ export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };