@letsscrapedata/scraper 0.0.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -0
- package/dist/index.d.cts +239 -0
- package/dist/index.d.ts +239 -0
- package/dist/index.js +1 -0
- package/package.json +72 -0
- package/readme.md +246 -0
package/readme.md
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<div>
|
|
3
|
+
<a href="https://www.LetsScrapeData.com" style="text-decoration: none" target="_blank">
|
|
4
|
+
<img src="https://www.letsscrapedata.com/assets/logo.svg" width="160" alt="LetsScrapeData">
|
|
5
|
+
</a>
|
|
6
|
+
</div>
|
|
7
|
+
<!-- <div>This is part of LetsScrapeData <a href="https://www.npmjs.com/~letsscrapedata"> web scraping suites </a>.</div> -->
|
|
8
|
+
<div>You can use a free <a href="https://www.LetsScrapeData.com">LetsScrapeData App</a> if you want to scrape web data without programming.</div>
|
|
9
|
+
<br/>
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
<font size=4>Please get help and discuss how to scrape a website on the [discord server](https://discord.gg/46atZ8kPVb), which can respond quickly. It is better to submit issues on [github](https://github.com/LetsScrapeData/scraper) for better tracking.</font>
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
1. Template driven web scraping
|
|
17
|
+
|
|
18
|
+
- you can quickly [design templates](https://doc.letsscrapedata.com/template/) for scraping different websites.
|
|
19
|
+
- The templates are intuitive and easier to maintain.
|
|
20
|
+
|
|
21
|
+
2. Browser operations supported by the [controller](https://www.npmjs.com/package/@letsscrapedata/controller) package
|
|
22
|
+
|
|
23
|
+
- Same interface of playwright, patchright, puppeteer, cheerio: easy to switch between them
|
|
24
|
+
- Web browsing automation: goto(open) / click / input / hover / select / scroll
|
|
25
|
+
- Automatic captcha solver: Recaptcha(v2 & v3), Cloudflare Turnstile, GeeTest(v3 & v4), image/text, coordinate
|
|
26
|
+
- State data management: cookies, localStorage, HTTP Headers, custom session data
|
|
27
|
+
- Elements selection by CSS selectors or XPath: whether in frames or not
|
|
28
|
+
- Automatic file saving: such as screenshot, pdf, mhtml, download directly or by clicking
|
|
29
|
+
|
|
30
|
+
3. API request
|
|
31
|
+
|
|
32
|
+
- Both browser and API can be used at the same time and cookies/headers are shared.
|
|
33
|
+
- HTTP headers: intercepted, generated automatically or by browser automation, got by API or others
|
|
34
|
+
|
|
35
|
+
4. fingerprint management:
|
|
36
|
+
|
|
37
|
+
- Automatically generate fingerprints of the latest common browsers
|
|
38
|
+
|
|
39
|
+
5. Simple rate limits: automatic flow control, such as interval / max concurrency /times per period
|
|
40
|
+
6. Simple proxy management: multiple "static" proxies to increase concurrency
|
|
41
|
+
7. Subtasks: complex tasks can be split into multiple simple subtasks for better maintenance and increased concurrency
|
|
42
|
+
8. Data export
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```sh
|
|
47
|
+
npm install @letsscrapedata/scraper
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Examples
|
|
51
|
+
|
|
52
|
+
1. Example with default ScraperConfig:
|
|
53
|
+
|
|
54
|
+
```javascript
|
|
55
|
+
// javascript
|
|
56
|
+
import { scraper } from "@letsscrapedata/scraper";
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* tid: ID of template to be executed, such as template for scraping one list of example in page "https://www.letsscrapedata.com/pages/listexample1.html"
|
|
60
|
+
* parasstrs: input parameters of tasks, such as "1"
|
|
61
|
+
* this example will execute five tasks using template 2000007, each of them scrapes the data in one page.
|
|
62
|
+
*/
|
|
63
|
+
const newTasks = [{ tid: 2000007, parasstrs: ["1", "2", "3", "4", "5"] }];
|
|
64
|
+
|
|
65
|
+
/* The following line can do the same thing using subtasks, scraping the data in the first five pages */
|
|
66
|
+
// const newTasks = [{ tid: 2000008, parasstrs: ["5"] }];
|
|
67
|
+
|
|
68
|
+
await scraper(newTasks);
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
2. Example with ScraperConfig
|
|
72
|
+
|
|
73
|
+
```typescript
|
|
74
|
+
// typescript
|
|
75
|
+
import { scraper, TemplateTasks, ScraperConfig } from "@letsscrapedata/scraper";
|
|
76
|
+
|
|
77
|
+
const scraperConfig: ScraperConfig = {
|
|
78
|
+
browserConfigs: [
|
|
79
|
+
/* launch a chromium browser using puppeteer, no proxy */
|
|
80
|
+
{ browserControllerType: "puppeteer", proxyUrl: "" },
|
|
81
|
+
/* launch a chromium browser using playwright, proxy */
|
|
82
|
+
{ browserControllerType: "playwright", proxyUrl: "http://proxyId:port" },
|
|
83
|
+
/* connect to the current browser using patchright */
|
|
84
|
+
{ browserUrl: "http://localhost:9222/" },
|
|
85
|
+
],
|
|
86
|
+
// exitWhenCompleted: true,
|
|
87
|
+
// loadUnfinishedTasks: true,
|
|
88
|
+
// lsdLaunchOptions: { headless: true },
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
const newTasks: TemplateTasks[] = [{ tid: 2000008, parasstrs: ["9"] }];
|
|
92
|
+
|
|
93
|
+
await scraper(newTasks, scraperConfig);
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## ScraperConfig
|
|
97
|
+
|
|
98
|
+
Common configurations:
|
|
99
|
+
|
|
100
|
+
- Proxies and browser: browserConfigs, by default launching a browser using browserControllerType/browserType, without proxy
|
|
101
|
+
- Default browser controller to use: browserControllerType, default "patchright"
|
|
102
|
+
- Default browser to use: browserType, default "chromium"
|
|
103
|
+
- File format of scraped data: dataFileFormat, default "jsonl"
|
|
104
|
+
- Where are the templates: templateDir, default "" which means to obtain the template from the network
|
|
105
|
+
|
|
106
|
+
Complete configurations:
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
export interface ScraperConfig {
|
|
110
|
+
/**
|
|
111
|
+
* @default false
|
|
112
|
+
*/
|
|
113
|
+
exitWhenCompleted?: boolean;
|
|
114
|
+
/**
|
|
115
|
+
* whether to use the parasstr in XML if parasstr of a task is ""
|
|
116
|
+
* @default false
|
|
117
|
+
*/
|
|
118
|
+
useParasstrInXmlIfNeeded?: boolean;
|
|
119
|
+
/**
|
|
120
|
+
* whether to load unfinished tasks
|
|
121
|
+
* @default false
|
|
122
|
+
*/
|
|
123
|
+
loadUnfinishedTasks?: boolean;
|
|
124
|
+
//////////////////////////////////////////////////////////////////////////// directory
|
|
125
|
+
/**
|
|
126
|
+
* @default "", which will use current directory of process + "/data/"
|
|
127
|
+
* if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
|
|
128
|
+
*/
|
|
129
|
+
baseDir?: string;
|
|
130
|
+
/**
|
|
131
|
+
* where are the templates saved
|
|
132
|
+
* @default "", which means to get the templates from LSD server
|
|
133
|
+
*/
|
|
134
|
+
templateDir?: string;
|
|
135
|
+
/**
|
|
136
|
+
* filename in action_setvar_get/get_file must include inputFileDirPart for security.
|
|
137
|
+
* @default "LetsScrapeData"
|
|
138
|
+
*/
|
|
139
|
+
inputFileDirPart?: string;
|
|
140
|
+
//////////////////////////////////////////////////////////////////////////// browser
|
|
141
|
+
/**
|
|
142
|
+
* whether to use puppeteer-extra-plugin-stealth; use patchright instead
|
|
143
|
+
* @default false
|
|
144
|
+
*/
|
|
145
|
+
useStealthPlugin?: boolean;
|
|
146
|
+
/**
|
|
147
|
+
* default browserControllerType of BrowserConfig
|
|
148
|
+
* @default "patchright"
|
|
149
|
+
*/
|
|
150
|
+
browserControllerType?: BrowserControllerType;
|
|
151
|
+
/**
|
|
152
|
+
* default browserType of BrowserConfig
|
|
153
|
+
* @default "chromium"
|
|
154
|
+
*/
|
|
155
|
+
browserType?: LsdBrowserType;
|
|
156
|
+
/**
|
|
157
|
+
* @default { headless: false }
|
|
158
|
+
*/
|
|
159
|
+
lsdLaunchOptions?: LsdLaunchOptions;
|
|
160
|
+
/**
|
|
161
|
+
* @default {browserUrl: ""}
|
|
162
|
+
*/
|
|
163
|
+
lsdConnectOptions?: LsdConnectOptions;
|
|
164
|
+
/**
|
|
165
|
+
* Important: browsers to be launched or connected using proxyUrl
|
|
166
|
+
* @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
|
|
167
|
+
*/
|
|
168
|
+
browserConfigs?: BrowserConfig[];
|
|
169
|
+
//////////////////////////////////////////////////////////////////////////// captcha
|
|
170
|
+
captcha?: {
|
|
171
|
+
/**
|
|
172
|
+
* clientKey of 2captcha
|
|
173
|
+
*/
|
|
174
|
+
clientKey: string;
|
|
175
|
+
},
|
|
176
|
+
//////////////////////////////////////////////////////////////////////////// template
|
|
177
|
+
templateUrl?: string; // LSD
|
|
178
|
+
/**
|
|
179
|
+
* the default maximum number of concurrent tasks that can execute the same template in a browserContext
|
|
180
|
+
* @default 1
|
|
181
|
+
*/
|
|
182
|
+
maxConcurrency?: number;
|
|
183
|
+
/**
|
|
184
|
+
* @default ""
|
|
185
|
+
*/
|
|
186
|
+
readCode?: string;
|
|
187
|
+
/**
|
|
188
|
+
* @default []
|
|
189
|
+
*/
|
|
190
|
+
templateParas?: TemplatePara[];
|
|
191
|
+
//////////////////////////////////////////////////////////////////////////// scheduler
|
|
192
|
+
/**
|
|
193
|
+
* @default 10
|
|
194
|
+
*/
|
|
195
|
+
totalMaxConcurrency?: number;
|
|
196
|
+
/**
|
|
197
|
+
* min milliseconds between two tasks of the same template
|
|
198
|
+
* @default 2000
|
|
199
|
+
*/
|
|
200
|
+
minMiliseconds?: number,
|
|
201
|
+
//////////////////////////////////////////////////////////////////////////// data
|
|
202
|
+
/**
|
|
203
|
+
* whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
|
|
204
|
+
* @default false
|
|
205
|
+
*/
|
|
206
|
+
moveDataWhenStart?: boolean;
|
|
207
|
+
/**
|
|
208
|
+
** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
|
|
209
|
+
* @default "jsonl"
|
|
210
|
+
*/
|
|
211
|
+
dataFileFormat?: DataFileFormat;
|
|
212
|
+
/**
|
|
213
|
+
* @default true
|
|
214
|
+
*/
|
|
215
|
+
useNickName?: boolean;
|
|
216
|
+
|
|
217
|
+
/**
* valid only when dataFileFormat is "txt"
|
|
218
|
+
*/
|
|
219
|
+
columnSeperator?: string;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/**
|
|
223
|
+
* Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
|
|
224
|
+
*/
|
|
225
|
+
export interface BrowserConfig {
|
|
226
|
+
browserControllerType?: BrowserControllerType;
|
|
227
|
+
/**
|
|
228
|
+
* url used to connect to the current browser
|
|
229
|
+
** url starts with "http://", such as "http://localhost:9222/"
|
|
230
|
+
** browserUrl can be used when logging in manually in advance.
|
|
231
|
+
*/
|
|
232
|
+
browserUrl?: string;
|
|
233
|
+
/**
|
|
234
|
+
* proxy
|
|
235
|
+
** no proxy will be used if proxyUrl is ""
|
|
236
|
+
** valid only if !browserUrl
|
|
237
|
+
*/
|
|
238
|
+
proxyUrl?: string;
|
|
239
|
+
/**
|
|
240
|
+
* type of browser to be launched
|
|
241
|
+
* valid only if !browserUrl
|
|
242
|
+
* @default "chromium"
|
|
243
|
+
*/
|
|
244
|
+
browserType?: LsdBrowserType;
|
|
245
|
+
}
|
|
246
|
+
```
|