clearscrape 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +379 -0
- package/dist/index.d.mts +374 -0
- package/dist/index.d.ts +374 -0
- package/dist/index.js +356 -0
- package/dist/index.mjs +326 -0
- package/package.json +59 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 ClearScrape

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,379 @@
# ClearScrape Node.js SDK

Official Node.js client for the [ClearScrape](https://clearscrape.io) web scraping API.

## Features

- Simple, promise-based API
- Full TypeScript support
- Automatic retries with exponential backoff
- Support for all ClearScrape features:
  - JavaScript rendering
  - Premium residential proxies
  - Antibot bypass
  - Screenshots
  - Domain-specific extractors (Amazon, Walmart, Google, etc.)
- Scraping Browser (Playwright/Puppeteer)
- Residential Proxy service

## Installation

```bash
npm install clearscrape
```

```bash
yarn add clearscrape
```

```bash
pnpm add clearscrape
```

## Quick Start

```typescript
import { ClearScrape } from 'clearscrape';

const client = new ClearScrape({
  apiKey: process.env.CLEARSCRAPE_API_KEY
});

// Basic scrape
const result = await client.scrape({
  url: 'https://example.com'
});

console.log(result.data.html);
```
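
The file list at the top of this diff includes both an ESM build (`dist/index.mjs`) and a CommonJS build (`dist/index.js`), so the client can presumably also be loaded with `require` in CommonJS projects. A minimal sketch, assuming both entry points expose the same `ClearScrape` named export:

```js
// CommonJS usage (assumption: the CJS build exposes the same named export)
const { ClearScrape } = require('clearscrape');

const client = new ClearScrape({ apiKey: process.env.CLEARSCRAPE_API_KEY });
```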

## Usage Examples

### Basic Scraping

```typescript
// Simple HTML fetch
const result = await client.scrape({
  url: 'https://example.com'
});

// Get just the HTML
const html = await client.getHtml('https://example.com');

// Get just the text content
const text = await client.getText('https://example.com');
```

### JavaScript Rendering

Enable JavaScript rendering for dynamic websites (SPAs, React, Vue, etc.):

```typescript
const result = await client.scrape({
  url: 'https://example.com/spa-page',
  jsRender: true,
  waitFor: '.product-list', // Wait for element
  wait: 3000 // Additional wait time (ms)
});
```

### Premium Proxies

Use residential proxies to avoid blocks and target specific countries:

```typescript
const result = await client.scrape({
  url: 'https://example.com',
  premiumProxy: true,
  proxyCountry: 'us' // Target specific country
});
```

### Antibot Bypass

Bypass Cloudflare, DataDome, PerimeterX and other bot protection:

```typescript
const result = await client.scrape({
  url: 'https://protected-site.com',
  antibot: true,
  premiumProxy: true
});
```

### Screenshots

Capture screenshots of web pages:

```typescript
import fs from 'fs';

// Get base64 screenshot
const screenshot = await client.screenshot('https://example.com');

// Save to file
fs.writeFileSync('screenshot.png', Buffer.from(screenshot, 'base64'));

// Screenshot specific element
const result = await client.scrape({
  url: 'https://example.com',
  jsRender: true,
  screenshotSelector: '.product-card'
});
```

### Domain Extractors

Extract structured data from supported websites:

```typescript
import { AmazonProduct, GoogleSerpResult } from 'clearscrape';

// Amazon product data
const product = await client.extract<AmazonProduct>(
  'https://www.amazon.com/dp/B09V3KXJPB',
  'amazon'
);

console.log(product.title); // "Apple AirPods Pro..."
console.log(product.price); // "$249.00"
console.log(product.rating); // "4.7"
console.log(product.reviewCount); // "125,432"

// Google SERP data
const serp = await client.extract<GoogleSerpResult>(
  'https://www.google.com/search?q=best+laptops',
  'google'
);

console.log(serp.organicResults[0].title);
console.log(serp.featuredSnippet);
console.log(serp.relatedSearches);
```

**Supported domains:**
- `amazon` - Product pages
- `walmart` - Product pages
- `google` - Search results
- `google_shopping` - Shopping results
- `ebay` - Product pages
- `target` - Product pages
- `etsy` - Product pages
- `bestbuy` - Product pages
- `homedepot` - Product pages
- `zillow` - Property listings
- `yelp` - Business pages
- `indeed` - Job listings
- `linkedin_jobs` - Job listings

### Scraping Browser (Playwright/Puppeteer)

Connect to cloud browsers with built-in antibot bypass:

```typescript
// With Playwright
import { chromium } from 'playwright';

const browser = await chromium.connectOverCDP(
  client.getBrowserWsUrl()
);

const page = await browser.newPage();
await page.goto('https://example.com');

const title = await page.title();
await browser.close();
```

```typescript
// With Puppeteer
import puppeteer from 'puppeteer-core';

const browser = await puppeteer.connect({
  browserWSEndpoint: client.getBrowserWsUrl()
});

const page = await browser.newPage();
await page.goto('https://example.com');

await browser.close();
```

```typescript
// With country targeting
const wsUrl = client.getBrowserWsUrl({ proxyCountry: 'gb' });
```

### Residential Proxies

Use ClearScrape proxies with any HTTP client:

```typescript
// Get proxy configuration
const proxy = client.getProxyConfig();
// { host: 'proxy.clearscrape.io', port: 8000, username: '...', password: '...' }

// Get proxy URL string
const proxyUrl = client.getProxyUrl();
// 'http://apikey:apikey@proxy.clearscrape.io:8000'

// With country targeting
const proxyUrlUs = client.getProxyUrl({ country: 'us' });

// With a sticky session IP
const proxyUrlSession = client.getProxyUrl({ session: 'my-session-123' });

// Combined
const proxyUrlCombined = client.getProxyUrl({ country: 'us', session: 'abc' });
```

**Use with axios:**
```typescript
import axios from 'axios';
import { HttpsProxyAgent } from 'https-proxy-agent';

const agent = new HttpsProxyAgent(client.getProxyUrl({ country: 'us' }));

const response = await axios.get('https://httpbin.org/ip', {
  httpsAgent: agent
});
```

**Use with node-fetch:**
```typescript
import fetch from 'node-fetch';
import { HttpsProxyAgent } from 'https-proxy-agent';

const agent = new HttpsProxyAgent(client.getProxyUrl());

const response = await fetch('https://httpbin.org/ip', { agent });
```

## Configuration

```typescript
const client = new ClearScrape({
  // Required: Your API key
  apiKey: 'your-api-key',

  // Optional: Custom base URL (default: https://api.clearscrape.io)
  baseUrl: 'https://api.clearscrape.io',

  // Optional: Request timeout in ms (default: 60000)
  timeout: 60000,

  // Optional: Number of retries (default: 3)
  retries: 3
});
```

## Error Handling

```typescript
import {
  ClearScrape,
  ClearScrapeError,
  InsufficientCreditsError,
  RateLimitError
} from 'clearscrape';

try {
  const result = await client.scrape({ url: 'https://example.com' });
} catch (error) {
  if (error instanceof InsufficientCreditsError) {
    console.log(`Need ${error.required} credits`);
  } else if (error instanceof RateLimitError) {
    console.log('Rate limited, try again later');
  } else if (error instanceof ClearScrapeError) {
    console.log(`Error ${error.statusCode}: ${error.message}`);
  }
}
```

## TypeScript

The SDK is written in TypeScript and includes full type definitions:

```typescript
import {
  ClearScrape,
  ScrapeOptions,
  ScrapeResponse,
  AmazonProduct,
  GoogleSerpResult,
  DomainType
} from 'clearscrape';

// Full type safety
const options: ScrapeOptions = {
  url: 'https://example.com',
  jsRender: true,
  premiumProxy: true
};

const result: ScrapeResponse = await client.scrape(options);
```

## API Reference

### `ClearScrape`

Main client class.

#### Constructor

```typescript
new ClearScrape(config: ClearScrapeConfig)
```

#### Methods

| Method | Description |
|--------|-------------|
| `scrape(options)` | Scrape a URL with full options |
| `getHtml(url, options?)` | Get HTML content only |
| `getText(url, options?)` | Get text content only |
| `screenshot(url, options?)` | Capture screenshot |
| `extract(url, domain)` | Extract structured data |
| `getProxyConfig(options?)` | Get proxy configuration object |
| `getProxyUrl(options?)` | Get proxy URL string |
| `getBrowserWsUrl(options?)` | Get Scraping Browser WebSocket URL |

### `ScrapeOptions`

| Option | Type | Description |
|--------|------|-------------|
| `url` | `string` | Target URL (required) |
| `method` | `string` | HTTP method (default: GET) |
| `jsRender` | `boolean` | Enable JS rendering (+5 credits) |
| `premiumProxy` | `boolean` | Use residential proxy (+10 credits) |
| `antibot` | `boolean` | Enable antibot bypass (+25 credits) |
| `proxyCountry` | `string` | 2-letter country code |
| `waitFor` | `string` | CSS selector to wait for |
| `wait` | `number` | Wait time in ms |
| `autoScroll` | `boolean` | Scroll to load content |
| `screenshot` | `boolean` | Capture screenshot |
| `screenshotSelector` | `string` | Screenshot specific element |
| `headers` | `object` | Custom HTTP headers |
| `body` | `string\|object` | Request body |
| `domain` | `DomainType` | Domain extractor |
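
The `method`, `headers`, `body`, and `autoScroll` options appear in the table above but are not covered by the usage examples. A minimal sketch of how they might be combined, assuming they are passed to `scrape()` like the other options (the exact server-side handling is not verified here):

```typescript
// Hypothetical examples — option names come from the ScrapeOptions table above;
// how the API applies them server-side is an assumption.

// POST request with custom headers and a JSON body
const postResult = await client.scrape({
  url: 'https://example.com/api/search',
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: { query: 'laptops' } // string | object per the table
});

// Auto-scroll a rendered page to trigger lazy-loaded content
const scrolled = await client.scrape({
  url: 'https://example.com/infinite-feed',
  jsRender: true,
  autoScroll: true
});
```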

## Credits

| Feature | Cost |
|---------|------|
| Base request | 1 credit |
| + JavaScript rendering | +5 credits |
| + Premium proxy | +10 credits |
| + Antibot bypass | +25 credits |
| Domain API extraction | 25 credits |
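
For example, assuming the surcharges stack additively as the table implies, a single scrape with JavaScript rendering, a premium proxy, and antibot bypass would cost 1 + 5 + 10 + 25 = 41 credits.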

## Support

- [Documentation](https://clearscrape.io/docs)
- [API Reference](https://clearscrape.io/docs#parameters)
- [GitHub Issues](https://github.com/clearscrape/clearscrape-node/issues)

## License

MIT