clearscrape 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +379 -0
- package/dist/index.d.mts +374 -0
- package/dist/index.d.ts +374 -0
- package/dist/index.js +356 -0
- package/dist/index.mjs +326 -0
- package/package.json +59 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,374 @@
/**
 * ClearScrape SDK Types
 */
/**
 * Configuration options for the ClearScrape client
 */
interface ClearScrapeConfig {
    /** Your ClearScrape API key */
    apiKey: string;
    /** Base URL for the API (defaults to https://api.clearscrape.io) */
    baseUrl?: string;
    /** Request timeout in milliseconds (defaults to 60000) */
    timeout?: number;
    /** Number of retries for failed requests (defaults to 3) */
    retries?: number;
}
/**
 * Options for scraping a URL
 */
interface ScrapeOptions {
    /** Target URL to scrape */
    url: string;
    /** HTTP method (defaults to GET) */
    method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'PATCH';
    /** Enable JavaScript rendering (+5 credits) */
    jsRender?: boolean;
    /** Use premium residential proxies (+10 credits) */
    premiumProxy?: boolean;
    /** Enable antibot bypass (+25 credits) */
    antibot?: boolean;
    /** 2-letter country code for geo-targeting */
    proxyCountry?: string;
    /** CSS selector to wait for (requires jsRender) */
    waitFor?: string;
    /** Fixed wait time in milliseconds (max 30000) */
    wait?: number;
    /** Scroll page to load lazy content */
    autoScroll?: boolean;
    /** Capture full page screenshot */
    screenshot?: boolean;
    /** Capture screenshot of specific element */
    screenshotSelector?: string;
    /** Custom HTTP headers */
    headers?: Record<string, string>;
    /** Request body for POST/PUT requests */
    body?: string | Record<string, unknown>;
    /** Domain extractor (amazon, walmart, google, etc.) */
    domain?: DomainType;
}
/**
 * Supported domain extractors
 */
type DomainType = 'amazon' | 'walmart' | 'google' | 'google_shopping' | 'ebay' | 'target' | 'etsy' | 'bestbuy' | 'homedepot' | 'zillow' | 'yelp' | 'indeed' | 'linkedin_jobs';
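A minimal sketch of how these option types compose, assuming the `ClearScrape` client declared later in this file; the API key and URL are placeholders, and top-level `await` assumes an ESM context:

```typescript
import { ClearScrape, type ScrapeOptions } from 'clearscrape';

const client = new ClearScrape({ apiKey: 'your-api-key', retries: 2 });

// Per the JSDoc above, each flag adds credits: jsRender +5, premiumProxy +10, antibot +25.
const options: ScrapeOptions = {
  url: 'https://example.com/products', // placeholder target
  jsRender: true,                      // render client-side JavaScript
  waitFor: '.product-card',            // requires jsRender
  premiumProxy: true,
  proxyCountry: 'us',                  // 2-letter country code
};

const result = await client.scrape(options);
```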
/**
 * Response from a successful scrape request
 */
interface ScrapeResponse {
    success: true;
    data: {
        /** Raw HTML content */
        html: string;
        /** Extracted text content */
        text?: string;
        /** Base64 encoded screenshot (if requested) */
        screenshot?: string;
        /** Extracted data (if domain extractor used) */
        extracted?: Record<string, unknown>;
    };
    metadata: {
        /** Final URL after redirects */
        url: string;
        /** HTTP status code */
        statusCode: number;
        /** Credits consumed */
        cost: number;
        /** Request duration in milliseconds */
        duration: number;
        /** Response size in bytes */
        byteSize: number;
        /** Options used for the request */
        options: {
            js_render: boolean;
            premium_proxy: boolean;
            antibot: boolean;
            proxy_country?: string;
        };
        /** Domain extractor used */
        domain?: string;
    };
}
/**
 * Response from a failed scrape request
 */
interface ScrapeErrorResponse {
    success: false;
    error: string;
    message: string;
    /** Credits required (for insufficient credits error) */
    required?: number;
}
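Because `success` is typed as the literal `true` or `false`, these two interfaces form a discriminated union, so TypeScript narrows the shape on a single check. A sketch of handling a raw API payload this way (the union alias `ApiResult` is illustrative, not exported by the package):

```typescript
import type { ScrapeResponse, ScrapeErrorResponse } from 'clearscrape';

type ApiResult = ScrapeResponse | ScrapeErrorResponse;

function report(result: ApiResult): void {
  if (result.success) {
    // Narrowed to ScrapeResponse: data and metadata are available.
    console.log(`${result.metadata.statusCode}: ${result.data.html.length} chars of HTML`);
    console.log(`cost: ${result.metadata.cost} credits in ${result.metadata.duration} ms`);
  } else {
    // Narrowed to ScrapeErrorResponse: error and message are available.
    console.error(`${result.error}: ${result.message}`);
  }
}
```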
/**
 * Amazon product data extracted by domain API
 */
interface AmazonProduct {
    title: string;
    price: string;
    originalPrice?: string;
    currency: string;
    rating: string;
    reviewCount: string;
    availability: string;
    seller: string;
    asin: string;
    brand?: string;
    images: string[];
    features: string[];
    breadcrumbs: string[];
    description?: string;
    specifications?: Record<string, string>;
}
/**
 * Google SERP data extracted by domain API
 */
interface GoogleSerpResult {
    searchQuery: string;
    totalResults: string;
    organicResults: Array<{
        position: number;
        title: string;
        url: string;
        displayUrl: string;
        description: string;
    }>;
    featuredSnippet?: {
        title: string;
        content: string;
        url: string;
    };
    peopleAlsoAsk?: Array<{
        question: string;
        answer: string;
    }>;
    relatedSearches?: string[];
}
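These extractor interfaces pair naturally with the generic `extract<T>` method declared on the client below, which otherwise defaults to `Record<string, unknown>`. A sketch, with placeholder URLs:

```typescript
import { ClearScrape, type AmazonProduct, type GoogleSerpResult } from 'clearscrape';

const client = new ClearScrape({ apiKey: 'your-api-key' });

// Type the extracted payload explicitly via the generic parameter.
const product = await client.extract<AmazonProduct>(
  'https://www.amazon.com/dp/B09V3KXJPB',
  'amazon'
);
console.log(product.title, product.price, product.currency);

const serp = await client.extract<GoogleSerpResult>(
  'https://www.google.com/search?q=best+laptops',
  'google'
);
for (const hit of serp.organicResults) {
  console.log(`#${hit.position} ${hit.title} -> ${hit.url}`);
}
```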
/**
 * Proxy configuration for residential proxy service
 */
interface ProxyConfig {
    host: string;
    port: number;
    username: string;
    password: string;
}
/**
 * Browser connection options for Scraping Browser
 */
interface BrowserOptions {
    /** 2-letter country code for geo-targeting */
    proxyCountry?: string;
}
/**
 * ClearScrape API error
 */
declare class ClearScrapeError extends Error {
    readonly statusCode: number;
    readonly response?: ScrapeErrorResponse;
    constructor(message: string, statusCode: number, response?: ScrapeErrorResponse);
}
/**
 * Insufficient credits error
 */
declare class InsufficientCreditsError extends ClearScrapeError {
    readonly required: number;
    constructor(message: string, required: number);
}
/**
 * Rate limit error
 */
declare class RateLimitError extends ClearScrapeError {
    constructor(message: string);
}
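Since both specific errors subclass `ClearScrapeError`, `instanceof` checks let callers handle the narrow cases before the general one (order matters: test subclasses first). A sketch:

```typescript
import {
  ClearScrape,
  ClearScrapeError,
  InsufficientCreditsError,
  RateLimitError,
} from 'clearscrape';

const client = new ClearScrape({ apiKey: 'your-api-key' });

try {
  const result = await client.scrape({ url: 'https://example.com', antibot: true });
  console.log(result.metadata.cost);
} catch (err) {
  if (err instanceof InsufficientCreditsError) {
    console.error(`Top up your account: ${err.required} credits required`);
  } else if (err instanceof RateLimitError) {
    console.error('Rate limited; back off and retry later');
  } else if (err instanceof ClearScrapeError) {
    console.error(`API error ${err.statusCode}: ${err.message}`);
  } else {
    throw err; // not an SDK error
  }
}
```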
/**
 * ClearScrape API Client
 *
 * @example
 * ```typescript
 * import { ClearScrape } from 'clearscrape';
 *
 * const client = new ClearScrape({ apiKey: 'your-api-key' });
 *
 * const result = await client.scrape({
 *   url: 'https://example.com',
 *   jsRender: true
 * });
 *
 * console.log(result.data.html);
 * ```
 */
declare class ClearScrape {
    private readonly apiKey;
    private readonly baseUrl;
    private readonly timeout;
    private readonly retries;
    constructor(config: ClearScrapeConfig);
    /**
     * Scrape a URL and return the HTML content
     *
     * @param options - Scraping options
     * @returns Promise resolving to scrape response
     *
     * @example
     * ```typescript
     * // Basic scrape
     * const result = await client.scrape({ url: 'https://example.com' });
     *
     * // With JavaScript rendering
     * const result = await client.scrape({
     *   url: 'https://example.com',
     *   jsRender: true,
     *   waitFor: '.content'
     * });
     *
     * // With premium proxy and country targeting
     * const result = await client.scrape({
     *   url: 'https://example.com',
     *   premiumProxy: true,
     *   proxyCountry: 'us'
     * });
     * ```
     */
    scrape(options: ScrapeOptions): Promise<ScrapeResponse>;
    /**
     * Scrape a URL and return only the HTML content
     *
     * @param url - URL to scrape
     * @param options - Additional scraping options
     * @returns Promise resolving to HTML string
     */
    getHtml(url: string, options?: Omit<ScrapeOptions, 'url'>): Promise<string>;
    /**
     * Scrape a URL and return only the text content
     *
     * @param url - URL to scrape
     * @param options - Additional scraping options
     * @returns Promise resolving to text string
     */
    getText(url: string, options?: Omit<ScrapeOptions, 'url'>): Promise<string>;
    /**
     * Take a screenshot of a URL
     *
     * @param url - URL to screenshot
     * @param options - Additional options
     * @returns Promise resolving to base64 encoded screenshot
     *
     * @example
     * ```typescript
     * const screenshot = await client.screenshot('https://example.com');
     * // Save to file
     * fs.writeFileSync('screenshot.png', Buffer.from(screenshot, 'base64'));
     * ```
     */
    screenshot(url: string, options?: Omit<ScrapeOptions, 'url' | 'screenshot'>): Promise<string>;
    /**
     * Scrape using a domain-specific extractor (Amazon, Walmart, Google, etc.)
     *
     * @param url - URL to scrape
     * @param domain - Domain extractor to use
     * @returns Promise resolving to extracted data
     *
     * @example
     * ```typescript
     * // Extract Amazon product data
     * const product = await client.extract(
     *   'https://www.amazon.com/dp/B09V3KXJPB',
     *   'amazon'
     * );
     * console.log(product.title, product.price);
     *
     * // Extract Google SERP data
     * const serp = await client.extract(
     *   'https://www.google.com/search?q=best+laptops',
     *   'google'
     * );
     * console.log(serp.organicResults);
     * ```
     */
    extract<T = Record<string, unknown>>(url: string, domain: ScrapeOptions['domain']): Promise<T>;
    /**
     * Get proxy configuration for the residential proxy service
     *
     * @param options - Proxy options
     * @returns Proxy configuration object
     *
     * @example
     * ```typescript
     * // Basic proxy config
     * const proxy = client.getProxyConfig();
     * // { host: 'proxy.clearscrape.io', port: 8000, username: '...', password: '...' }
     *
     * // With country targeting
     * const proxy = client.getProxyConfig({ country: 'us' });
     *
     * // With session sticky IP
     * const proxy = client.getProxyConfig({ session: 'my-session-123' });
     * ```
     */
    getProxyConfig(options?: {
        country?: string;
        session?: string;
    }): ProxyConfig;
    /**
     * Get proxy URL string for use with HTTP clients
     *
     * @param options - Proxy options
     * @returns Proxy URL string
     *
     * @example
     * ```typescript
     * const proxyUrl = client.getProxyUrl({ country: 'us' });
     * // 'http://apikey-country-us:apikey@proxy.clearscrape.io:8000'
     *
     * // Use with axios
     * const HttpsProxyAgent = require('https-proxy-agent');
     * const agent = new HttpsProxyAgent(client.getProxyUrl());
     * axios.get(url, { httpsAgent: agent });
     * ```
     */
    getProxyUrl(options?: {
        country?: string;
        session?: string;
    }): string;
    /**
     * Get WebSocket URL for Scraping Browser (Playwright/Puppeteer)
     *
     * @param options - Browser options
     * @returns WebSocket URL string
     *
     * @example
     * ```typescript
     * // Use with Playwright
     * const { chromium } = require('playwright');
     * const browser = await chromium.connectOverCDP(client.getBrowserWsUrl());
     *
     * // Use with Puppeteer
     * const puppeteer = require('puppeteer-core');
     * const browser = await puppeteer.connect({
     *   browserWSEndpoint: client.getBrowserWsUrl()
     * });
     *
     * // With country targeting
     * const wsUrl = client.getBrowserWsUrl({ proxyCountry: 'gb' });
     * ```
     */
    getBrowserWsUrl(options?: BrowserOptions): string;
    /**
     * Build the API request payload
     */
    private buildPayload;
    /**
     * Make an API request with retries
     */
    private makeRequest;
    /**
     * Handle API errors
     */
    private handleError;
    /**
     * Sleep for a specified duration
     */
    private sleep;
}

export { type AmazonProduct, type BrowserOptions, ClearScrape, type ClearScrapeConfig, ClearScrapeError, type DomainType, type GoogleSerpResult, InsufficientCreditsError, type ProxyConfig, RateLimitError, type ScrapeErrorResponse, type ScrapeOptions, type ScrapeResponse, ClearScrape as default };
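The `ProxyConfig` fields returned by `getProxyConfig()` also map onto Playwright's `proxy` launch option, as an alternative to the `getBrowserWsUrl()` remote-browser route shown in the JSDoc. A sketch under the assumption that Playwright is installed separately (it is not a dependency of this package) and that the proxy service accepts locally launched browsers:

```typescript
import ClearScrape from 'clearscrape'; // default export, per the export line above
import { chromium } from 'playwright';

const client = new ClearScrape({ apiKey: 'your-api-key' });

// Route a locally launched browser through the residential proxy service.
const { host, port, username, password } = client.getProxyConfig({ country: 'us' });
const browser = await chromium.launch({
  proxy: { server: `http://${host}:${port}`, username, password },
});

const page = await browser.newPage();
await page.goto('https://example.com');
console.log(await page.title());
await browser.close();
```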