@letsscrapedata/scraper 0.0.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/readme.md ADDED
@@ -0,0 +1,246 @@
1
+ <div align="center">
2
+ <div>
3
+ <a href="https://www.LetsScrapeData.com" style="text-decoration: none" target="_blank">
4
+ <img src="https://www.letsscrapedata.com/assets/logo.svg" width="160" alt="LetsScrapeData">
5
+ </a>
6
+ </div>
7
+ <!-- <div>This is part of LetsScrapeData <a href="https://www.npmjs.com/~letsscrapedata"> web scraping suites </a>.</div> -->
8
+ <div>You can use a free <a href="https://www.LetsScrapeData.com">LetsScrapeData App</a> if you want to scrape web data without programming.</div>
9
+ <br/>
10
+ </div>
11
+
12
+ <font size=4>Please get help and discuss how to scrape a website on the [discord server](https://discord.gg/46atZ8kPVb), which can respond quickly. It is better to submit issues on [github](https://github.com/LetsScrapeData/scraper) for better tracking.</font>
13
+
14
+ ## Features
15
+
16
+ 1. Template driven web scraping
17
+
18
+ - you can quickly [design templates](https://doc.letsscrapedata.com/template/) for scraping different websites.
19
+ - The templates are intuitive and easier to maintain.
20
+
21
+ 2. Browser operations supported by the [controller](https://www.npmjs.com/package/@letsscrapedata/controller) package
22
+
23
+ - Same interface of playwright, patchright, puppeteer, cheerio: easy to switch between them
24
+ - Web browsing automation: goto(open) / click / input / hover / select / scroll
25
+ - Automatic captcha solver: Recaptcha(v2 & v3), Cloudflare Turnstile, GeeTest(v3 & v4), image/text, coordinate
26
+ - State data management: cookies, localStorage, HTTP Headers, custom session data
27
+ - Elements selection by CSS selectors or XPath: whether in frames or not
28
+ - Automatic file saving: such as screenshot, pdf, mhtml, download directly or by clicking
29
+
30
+ 3. API request
31
+
32
+ - Both browser and API can be used at the same time and cookies/headers are shared.
33
+ - HTTP headers: intercepted, generated automatically or by browser automation, got by API or others
34
+
35
+ 4. fingerprint management:
36
+
37
+ - Automatically generate fingerprints of the latest common browsers
38
+
39
+ 5. Simple rate limits: automatic flow control, such as interval / max concurrency /times per period
40
+ 6. Simple proxy management: multiple "static" proxies to increase concurrency
41
+ 7. Subtasks: complex tasks can be split into multiple simple subtasks for better maintenance and increased concurrency
42
+ 8. Data export
43
+
44
+ ## Install
45
+
46
+ ```sh
47
+ npm install @letsscrapedata/scraper
48
+ ```
49
+
50
+ ## Examples
51
+
52
+ 1. Example with default ScraperConfig:
53
+
54
+ ```javascript
55
+ // javascript
56
+ import { scraper } from "@letsscrapedata/scraper";
57
+
58
+ /**
59
+ * tid: ID of template to be executed, such as template for scraping one list of example in page "https://www.letsscrapedata.com/pages/listexample1.html"
60
+ * parasstrs: input parameters of tasks, such as "1"
61
+ * this example will execute five tasks using template 2000007, each of them scrapes the data in one page.
62
+ */
63
+ const newTasks = [{ tid: 2000007, parasstrs: ["1", "2", "3", "4", "5"] }];
64
+
65
+ /* The following line can do the same thing using subtasks, scraping the data in the first five pages */
66
+ // const newTasks = [{ tid: 2000008, parasstrs: ["5"] }];
67
+
68
+ await scraper(newTasks);
69
+ ```
70
+
71
+ 2. Example with ScraperConfig
72
+
73
+ ```typescript
74
+ // typescript
75
+ import { scraper, TemplateTasks, ScraperConfig } from "@letsscrapedata/scraper";
76
+
77
+ const scraperConfig: ScraperConfig = {
78
+ browserConfigs: [
79
+ /* launch a chromium browser using puppeteer, no proxy */
80
+ { browserControllerType: "puppeteer", proxyUrl: "" },
81
+ /* launch a chromium browser using playwright, proxy */
82
+ { browserControllerType: "playwright", proxyUrl: "http://proxyId:port" },
83
+ /* connect to the current browser using patchright */
84
+ { browserUrl: "http://localhost:9222/" },
85
+ ],
86
+ // exitWhenCompleted: true,
87
+ // loadUnfinishedTasks: true,
88
+ // lsdLaunchOptions: { headless: true },
89
+ };
90
+
91
+ const newTasks: TemplateTasks[] = [{ tid: 2000008, parasstrs: ["9"] }];
92
+
93
+ await scraper(newTasks, scraperConfig);
94
+ ```
95
+
96
+ ## ScraperConfig
97
+
98
+ Common configurations:
99
+
100
+ - Proxies and browser: browserConfigs, by default launching a browser using browserControllerType/browserType, without proxy
101
+ - Default browser controller to use: browserControllerType, default "playwright"
102
+ - Default browser to use: browserType, default "chromium"
103
+ - File format of scraped data: dataFileFormat, default "tsv"
104
+ - Where are the templates: templateDir, default "" which means to obtain the template from the network
105
+
106
+ Complete configurations:
107
+
108
+ ```typescript
109
+ export interface ScraperConfig {
110
+ /**
111
+ * @default false
112
+ */
113
+ exitWhenCompleted?: boolean;
114
+ /**
115
+ * whether to use the parasstr in XML if parasstr of a task is ""
116
+ * @default false
117
+ */
118
+ useParasstrInXmlIfNeeded?: boolean;
119
+ /**
120
+ * whether to load unfinished tasks
121
+ * @default false
122
+ */
123
+ loadUnfinishedTasks?: boolean;
124
+ //////////////////////////////////////////////////////////////////////////// directory
125
+ /**
126
+ * @default "", which will use current directory of process + "/data/"
127
+ * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
128
+ */
129
+ baseDir?: string;
130
+ /**
131
+ * where are the templates saved
132
+ * @default "", which means to get the templates from LSD server
133
+ */
134
+ templateDir?: string;
135
+ /**
136
+ * filename in action_setvar_get/get_file must include inputFileDirPart for security.
137
+ * @default "LetsScrapeData"
138
+ */
139
+ inputFileDirPart?: string;
140
+ //////////////////////////////////////////////////////////////////////////// browser
141
+ /**
142
+ * whether to use puppeteer-extra-plugin-stealth, use patchright instead
143
+ * @default false
144
+ */
145
+ useStealthPlugin?: boolean;
146
+ /**
147
+ * default browserControllerType of BrowserConfig
148
+ * @default "patchright"
149
+ */
150
+ browserControllerType?: BrowserControllerType;
151
+ /**
152
+ * default browserType of BrowserConfig
153
+ * @default "chromium"
154
+ */
155
+ browserType?: LsdBrowserType;
156
+ /**
157
+ * @default { headless: false }
158
+ */
159
+ lsdLaunchOptions?: LsdLaunchOptions;
160
+ /**
161
+ * @default {browserUrl: ""}
162
+ */
163
+ lsdConnectOptions?: LsdConnectOptions;
164
+ /**
165
+ * Important: browsers to be launched or connected using proxyUrl
166
+ * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
167
+ */
168
+ browserConfigs?: BrowserConfig[];
169
+ //////////////////////////////////////////////////////////////////////////// captcha
170
+ captcha?: {
171
+ /**
172
+ * clientKey of 2captcha
173
+ */
174
+ clientKey: string;
175
+ },
176
+ //////////////////////////////////////////////////////////////////////////// template
177
+ templateUrl?: string; // LSD
178
+ /**
179
+ * the default maximum number of concurrent tasks that can execute the same template in a browserContext
180
+ * @default 1
181
+ */
182
+ maxConcurrency?: number;
183
+ /**
184
+ * @default ""
185
+ */
186
+ readCode?: string;
187
+ /**
188
+ * @default []
189
+ */
190
+ templateParas?: TemplatePara[];
191
+ //////////////////////////////////////////////////////////////////////////// scheduler
192
+ /**
193
+ * @default 10
194
+ */
195
+ totalMaxConcurrency?: number;
196
+ /**
197
+ * min milliseconds between two tasks of the same template
198
+ * @default 2000
199
+ */
200
+ minMiliseconds?: number,
201
+ //////////////////////////////////////////////////////////////////////////// data
202
+ /**
203
+ * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
204
+ * @default false
205
+ */
206
+ moveDataWhenStart?: boolean;
207
+ /**
208
+ ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
209
+ * @default "jsonl"
210
+ */
211
+ dataFileFormat?: DataFileFormat;
212
+ /**
213
+ * @default true
214
+ */
215
+ useNickName?: boolean;
216
+ /**
217
+ * valid only when dataFileFormat is "txt"
218
+ */
219
+ columnSeperator?: string;
220
+ }
221
+
222
+ /**
223
+ * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
224
+ */
225
+ export interface BrowserConfig {
226
+ browserControllerType?: BrowserControllerType;
227
+ /**
228
+ * url used to connect to the current browser
229
+ ** url starts with "http://", such as "http://localhost:9222/"
230
+ ** browserUrl can be used when logging in manually in advance.
231
+ */
232
+ browserUrl?: string;
233
+ /**
234
+ * proxy
235
+ ** no proxy will be used if proxyUrl is ""
236
+ ** valid only if !browserUrl
237
+ */
238
+ proxyUrl?: string;
239
+ /**
240
+ * type of browser to be launched
241
+ * valid only if !browserUrl
242
+ * @default "chromium"
243
+ */
244
+ browserType?: LsdBrowserType;
245
+ }
246
+ ```