clearscrape 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +379 -0
- package/dist/index.d.mts +374 -0
- package/dist/index.d.ts +374 -0
- package/dist/index.js +356 -0
- package/dist/index.mjs +326 -0
- package/package.json +59 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
// Public CommonJS surface: the client class (also exposed as the default
// export) and the error hierarchy, registered as live getters.
var index_exports = {};
__export(index_exports, {
  ClearScrape: () => ClearScrape,
  ClearScrapeError: () => ClearScrapeError,
  InsufficientCreditsError: () => InsufficientCreditsError,
  RateLimitError: () => RateLimitError,
  default: () => ClearScrape
});
module.exports = __toCommonJS(index_exports);
|
|
30
|
+
|
|
31
|
+
// src/types.ts
|
|
32
|
+
var ClearScrapeError = class extends Error {
  /**
   * Base error for all ClearScrape API failures.
   *
   * @param {string} message - Human-readable description.
   * @param {number} [statusCode] - HTTP status associated with the failure.
   * @param {object} [response] - Raw API error body, when available.
   */
  constructor(message, statusCode, response) {
    super(message);
    Object.assign(this, {
      name: "ClearScrapeError",
      statusCode,
      response
    });
  }
};
|
|
40
|
+
var InsufficientCreditsError = class extends ClearScrapeError {
  /**
   * Raised when the account lacks the credits needed for a request
   * (always HTTP 402).
   *
   * @param {string} message - Human-readable description.
   * @param {number} [required] - Credits the request would have needed.
   */
  constructor(message, required) {
    super(message, 402);
    this.name = "InsufficientCreditsError";
    this.required = required;
  }
};
|
|
47
|
+
var RateLimitError = class extends ClearScrapeError {
  /**
   * Raised when the API reports too many requests (always HTTP 429).
   *
   * @param {string} message - Human-readable description.
   */
  constructor(message) {
    super(message, 429);
    this.name = "RateLimitError";
  }
};
|
|
53
|
+
|
|
54
|
+
// src/client.ts
|
|
55
|
+
// Client-wide defaults; each is overridable via the constructor config.
var DEFAULT_BASE_URL = "https://api.clearscrape.io";
var DEFAULT_TIMEOUT = 60000; // per-request timeout, in milliseconds
var DEFAULT_RETRIES = 3; // max attempts for retryable failures
|
|
58
|
+
var ClearScrape = class {
  /**
   * Create a ClearScrape client.
   *
   * @param {object} config - Client configuration.
   * @param {string} config.apiKey - ClearScrape API key (required).
   * @param {string} [config.baseUrl] - API base URL override.
   * @param {number} [config.timeout] - Per-request timeout in ms (default 60000).
   * @param {number} [config.retries] - Max request attempts (default 3).
   * @throws {Error} When no API key is supplied.
   */
  constructor(config) {
    if (!config.apiKey) {
      throw new Error("API key is required");
    }
    this.apiKey = config.apiKey;
    this.baseUrl = config.baseUrl || DEFAULT_BASE_URL;
    this.timeout = config.timeout || DEFAULT_TIMEOUT;
    // `??` (not `||`) so an explicit `retries: 0` disables retrying.
    this.retries = config.retries ?? DEFAULT_RETRIES;
  }
  /**
   * Scrape a URL and return the full API response.
   *
   * @param {object} options - Scraping options; `url` is required. See
   *   buildPayload for supported flags (jsRender, premiumProxy,
   *   proxyCountry, waitFor, screenshot, domain, ...).
   * @returns {Promise<object>} Parsed scrape response.
   *
   * @example
   * ```typescript
   * const result = await client.scrape({ url: 'https://example.com' });
   * const rendered = await client.scrape({
   *   url: 'https://example.com',
   *   jsRender: true,
   *   waitFor: '.content'
   * });
   * ```
   */
  async scrape(options) {
    const payload = this.buildPayload(options);
    return this.makeRequest("/api/scrape", payload);
  }
  /**
   * Scrape a URL and return only the HTML content.
   *
   * @param {string} url - URL to scrape.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} HTML string from the scrape response.
   */
  async getHtml(url, options) {
    const result = await this.scrape({ url, ...options });
    return result.data.html;
  }
  /**
   * Scrape a URL and return only the text content.
   *
   * @param {string} url - URL to scrape.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} Text string ("" when the API returns none).
   */
  async getText(url, options) {
    const result = await this.scrape({ url, ...options });
    return result.data.text || "";
  }
  /**
   * Take a screenshot of a URL.
   *
   * jsRender and screenshot are forced on after spreading `options`, so
   * callers cannot accidentally disable them.
   *
   * @param {string} url - URL to screenshot.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} Base64-encoded image (data-URI prefix stripped).
   * @throws {ClearScrapeError} If the response contains no screenshot.
   *
   * @example
   * ```typescript
   * const screenshot = await client.screenshot('https://example.com');
   * fs.writeFileSync('screenshot.png', Buffer.from(screenshot, 'base64'));
   * ```
   */
  async screenshot(url, options) {
    const result = await this.scrape({
      url,
      ...options,
      jsRender: true,
      screenshot: true
    });
    if (!result.data.screenshot) {
      throw new ClearScrapeError("Screenshot not returned", 500);
    }
    // The API may return a data URI; keep only the raw base64 payload.
    const base64 = result.data.screenshot.replace(/^data:image\/\w+;base64,/, "");
    return base64;
  }
  /**
   * Scrape using a domain-specific extractor (Amazon, Walmart, Google, etc.).
   *
   * @param {string} url - URL to scrape.
   * @param {string} domain - Domain extractor to use (e.g. 'amazon', 'google').
   * @returns {Promise<object>} Extracted structured data.
   * @throws {ClearScrapeError} If the response contains no extracted data.
   *
   * @example
   * ```typescript
   * const product = await client.extract(
   *   'https://www.amazon.com/dp/B09V3KXJPB',
   *   'amazon'
   * );
   * console.log(product.title, product.price);
   * ```
   */
  async extract(url, domain) {
    const result = await this.scrape({ url, domain });
    if (!result.data.extracted) {
      throw new ClearScrapeError("No extracted data returned", 500);
    }
    return result.data.extracted;
  }
  /**
   * Get credentials for the residential proxy service.
   *
   * Country and session routing are encoded into the proxy username
   * (`<apiKey>-country-<cc>-session-<id>`), which is the format the
   * proxy gateway parses.
   *
   * @param {object} [options]
   * @param {string} [options.country] - Two-letter country code to route through.
   * @param {string} [options.session] - Sticky-session identifier.
   * @returns {{host: string, port: number, username: string, password: string}}
   */
  getProxyConfig(options) {
    let username = this.apiKey;
    if (options?.country) {
      username += `-country-${options.country}`;
    }
    if (options?.session) {
      username += `-session-${options.session}`;
    }
    return {
      host: "proxy.clearscrape.io",
      port: 8000,
      username,
      password: this.apiKey
    };
  }
  /**
   * Get a proxy URL string for use with HTTP clients / proxy agents.
   *
   * @param {object} [options] - Same options as getProxyConfig.
   * @returns {string} e.g. 'http://key-country-us:key@proxy.clearscrape.io:8000'
   *
   * @example
   * ```typescript
   * const HttpsProxyAgent = require('https-proxy-agent');
   * const agent = new HttpsProxyAgent(client.getProxyUrl());
   * axios.get(url, { httpsAgent: agent });
   * ```
   */
  getProxyUrl(options) {
    const config = this.getProxyConfig(options);
    return `http://${config.username}:${config.password}@${config.host}:${config.port}`;
  }
  /**
   * Get the WebSocket URL for the Scraping Browser (Playwright
   * `connectOverCDP` / Puppeteer `browserWSEndpoint`).
   *
   * @param {object} [options]
   * @param {string} [options.proxyCountry] - Two-letter country code.
   * @returns {string} wss:// URL carrying the API key as a query parameter.
   *
   * @example
   * ```typescript
   * const { chromium } = require('playwright');
   * const browser = await chromium.connectOverCDP(client.getBrowserWsUrl());
   * ```
   */
  getBrowserWsUrl(options) {
    // Encode values so keys/countries containing reserved characters
    // cannot corrupt the query string.
    let url = `wss://browser.clearscrape.io?apiKey=${encodeURIComponent(this.apiKey)}`;
    if (options?.proxyCountry) {
      url += `&proxy_country=${encodeURIComponent(options.proxyCountry)}`;
    }
    return url;
  }
  /**
   * Translate camelCase client options into the snake_case API payload.
   * Falsy/absent options are omitted so the API applies its own defaults.
   *
   * @param {object} options - Scraping options; `url` is required.
   * @returns {object} JSON-serializable request body.
   */
  buildPayload(options) {
    const payload = {
      url: options.url
    };
    if (options.method) payload.method = options.method;
    if (options.jsRender) payload.js_render = options.jsRender;
    if (options.premiumProxy) payload.premium_proxy = options.premiumProxy;
    if (options.antibot) payload.antibot = options.antibot;
    if (options.proxyCountry) payload.proxy_country = options.proxyCountry;
    if (options.waitFor) payload.wait_for = options.waitFor;
    if (options.wait) payload.wait = options.wait;
    if (options.autoScroll) payload.auto_scroll = options.autoScroll;
    if (options.screenshot) payload.screenshot = options.screenshot;
    if (options.screenshotSelector) payload.screenshot_selector = options.screenshotSelector;
    if (options.headers) payload.headers = options.headers;
    if (options.body) payload.body = options.body;
    if (options.domain) payload.domain = options.domain;
    return payload;
  }
  /**
   * POST a payload to the API, retrying transient network failures with
   * exponential backoff.
   *
   * @param {string} endpoint - API path, e.g. '/api/scrape'.
   * @param {object} payload - JSON request body.
   * @param {number} [attempt] - 1-based attempt counter (internal).
   * @returns {Promise<object>} Parsed JSON response.
   * @throws {ClearScrapeError|InsufficientCreditsError|RateLimitError}
   */
  async makeRequest(endpoint, payload, attempt = 1) {
    const url = `${this.baseUrl}${endpoint}`;
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: {
          "X-API-Key": this.apiKey,
          "Content-Type": "application/json"
        },
        body: JSON.stringify(payload),
        signal: controller.signal
      });
      clearTimeout(timeoutId);
      const data = await response.json();
      if (!response.ok) {
        return this.handleError(response.status, data, payload, attempt, endpoint);
      }
      return data;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof Error && error.name === "AbortError") {
        // Timeouts surface immediately; they are not retried.
        throw new ClearScrapeError("Request timeout", 408);
      }
      if (attempt < this.retries) {
        // Exponential backoff: 2s, 4s, 8s, ...
        const delay = Math.pow(2, attempt) * 1000;
        await this.sleep(delay);
        return this.makeRequest(endpoint, payload, attempt + 1);
      }
      throw new ClearScrapeError(
        error instanceof Error ? error.message : "Unknown error",
        500
      );
    }
  }
  /**
   * Map a non-2xx API response to a typed error, or retry when the status
   * is transient (429 or 5xx).
   *
   * Fix: the retry previously hard-coded '/api/scrape'; it now replays
   * the endpoint that originally failed (defaulting to '/api/scrape' for
   * backward compatibility).
   *
   * @param {number} statusCode - HTTP status of the failed response.
   * @param {object} response - Parsed error body.
   * @param {object} payload - Original request body (used for retries).
   * @param {number} attempt - 1-based attempt counter.
   * @param {string} [endpoint] - Endpoint to retry against.
   * @throws {ClearScrapeError|InsufficientCreditsError|RateLimitError}
   */
  async handleError(statusCode, response, payload, attempt, endpoint = "/api/scrape") {
    // Client errors other than 429 are not retryable.
    if (statusCode >= 400 && statusCode < 500 && statusCode !== 429) {
      if (statusCode === 402 && response.required) {
        throw new InsufficientCreditsError(response.message, response.required);
      }
      throw new ClearScrapeError(
        response.message || response.error || `Request failed with status ${statusCode}`,
        statusCode,
        response
      );
    }
    if (attempt < this.retries) {
      // Rate limits pause a fixed 5s; 5xx use exponential backoff.
      const delay = statusCode === 429 ? 5000 : Math.pow(2, attempt) * 1000;
      await this.sleep(delay);
      return this.makeRequest(endpoint, payload, attempt + 1);
    }
    if (statusCode === 429) {
      throw new RateLimitError(response.message || "Rate limit exceeded");
    }
    throw new ClearScrapeError(
      response.message || response.error || `Request failed with status ${statusCode}`,
      statusCode,
      response
    );
  }
  /**
   * Resolve after `ms` milliseconds.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
};
|
|
350
|
+
// Annotate the CommonJS export names for ESM import in node:
// (dead code: `0 &&` never executes; Node's cjs-module-lexer parses it
// statically so `import { ClearScrape } from ...` works against this
// CommonJS build)
0 && (module.exports = {
  ClearScrape,
  ClearScrapeError,
  InsufficientCreditsError,
  RateLimitError
});
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
// src/types.ts
|
|
2
|
+
var ClearScrapeError = class extends Error {
  /**
   * Base error for all ClearScrape API failures.
   *
   * @param {string} message - Human-readable description.
   * @param {number} [statusCode] - HTTP status associated with the failure.
   * @param {object} [response] - Raw API error body, when available.
   */
  constructor(message, statusCode, response) {
    super(message);
    Object.assign(this, {
      name: "ClearScrapeError",
      statusCode,
      response
    });
  }
};
|
|
10
|
+
var InsufficientCreditsError = class extends ClearScrapeError {
  /**
   * Raised when the account lacks the credits needed for a request
   * (always HTTP 402).
   *
   * @param {string} message - Human-readable description.
   * @param {number} [required] - Credits the request would have needed.
   */
  constructor(message, required) {
    super(message, 402);
    this.name = "InsufficientCreditsError";
    this.required = required;
  }
};
|
|
17
|
+
var RateLimitError = class extends ClearScrapeError {
  /**
   * Raised when the API reports too many requests (always HTTP 429).
   *
   * @param {string} message - Human-readable description.
   */
  constructor(message) {
    super(message, 429);
    this.name = "RateLimitError";
  }
};
|
|
23
|
+
|
|
24
|
+
// src/client.ts
|
|
25
|
+
// Client-wide defaults; each is overridable via the constructor config.
var DEFAULT_BASE_URL = "https://api.clearscrape.io";
var DEFAULT_TIMEOUT = 60000; // per-request timeout, in milliseconds
var DEFAULT_RETRIES = 3; // max attempts for retryable failures
|
|
28
|
+
var ClearScrape = class {
  /**
   * Create a ClearScrape client.
   *
   * @param {object} config - Client configuration.
   * @param {string} config.apiKey - ClearScrape API key (required).
   * @param {string} [config.baseUrl] - API base URL override.
   * @param {number} [config.timeout] - Per-request timeout in ms (default 60000).
   * @param {number} [config.retries] - Max request attempts (default 3).
   * @throws {Error} When no API key is supplied.
   */
  constructor(config) {
    if (!config.apiKey) {
      throw new Error("API key is required");
    }
    this.apiKey = config.apiKey;
    this.baseUrl = config.baseUrl || DEFAULT_BASE_URL;
    this.timeout = config.timeout || DEFAULT_TIMEOUT;
    // `??` (not `||`) so an explicit `retries: 0` disables retrying.
    this.retries = config.retries ?? DEFAULT_RETRIES;
  }
  /**
   * Scrape a URL and return the full API response.
   *
   * @param {object} options - Scraping options; `url` is required. See
   *   buildPayload for supported flags (jsRender, premiumProxy,
   *   proxyCountry, waitFor, screenshot, domain, ...).
   * @returns {Promise<object>} Parsed scrape response.
   *
   * @example
   * ```typescript
   * const result = await client.scrape({ url: 'https://example.com' });
   * const rendered = await client.scrape({
   *   url: 'https://example.com',
   *   jsRender: true,
   *   waitFor: '.content'
   * });
   * ```
   */
  async scrape(options) {
    const payload = this.buildPayload(options);
    return this.makeRequest("/api/scrape", payload);
  }
  /**
   * Scrape a URL and return only the HTML content.
   *
   * @param {string} url - URL to scrape.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} HTML string from the scrape response.
   */
  async getHtml(url, options) {
    const result = await this.scrape({ url, ...options });
    return result.data.html;
  }
  /**
   * Scrape a URL and return only the text content.
   *
   * @param {string} url - URL to scrape.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} Text string ("" when the API returns none).
   */
  async getText(url, options) {
    const result = await this.scrape({ url, ...options });
    return result.data.text || "";
  }
  /**
   * Take a screenshot of a URL.
   *
   * jsRender and screenshot are forced on after spreading `options`, so
   * callers cannot accidentally disable them.
   *
   * @param {string} url - URL to screenshot.
   * @param {object} [options] - Additional scraping options.
   * @returns {Promise<string>} Base64-encoded image (data-URI prefix stripped).
   * @throws {ClearScrapeError} If the response contains no screenshot.
   *
   * @example
   * ```typescript
   * const screenshot = await client.screenshot('https://example.com');
   * fs.writeFileSync('screenshot.png', Buffer.from(screenshot, 'base64'));
   * ```
   */
  async screenshot(url, options) {
    const result = await this.scrape({
      url,
      ...options,
      jsRender: true,
      screenshot: true
    });
    if (!result.data.screenshot) {
      throw new ClearScrapeError("Screenshot not returned", 500);
    }
    // The API may return a data URI; keep only the raw base64 payload.
    const base64 = result.data.screenshot.replace(/^data:image\/\w+;base64,/, "");
    return base64;
  }
  /**
   * Scrape using a domain-specific extractor (Amazon, Walmart, Google, etc.).
   *
   * @param {string} url - URL to scrape.
   * @param {string} domain - Domain extractor to use (e.g. 'amazon', 'google').
   * @returns {Promise<object>} Extracted structured data.
   * @throws {ClearScrapeError} If the response contains no extracted data.
   *
   * @example
   * ```typescript
   * const product = await client.extract(
   *   'https://www.amazon.com/dp/B09V3KXJPB',
   *   'amazon'
   * );
   * console.log(product.title, product.price);
   * ```
   */
  async extract(url, domain) {
    const result = await this.scrape({ url, domain });
    if (!result.data.extracted) {
      throw new ClearScrapeError("No extracted data returned", 500);
    }
    return result.data.extracted;
  }
  /**
   * Get credentials for the residential proxy service.
   *
   * Country and session routing are encoded into the proxy username
   * (`<apiKey>-country-<cc>-session-<id>`), which is the format the
   * proxy gateway parses.
   *
   * @param {object} [options]
   * @param {string} [options.country] - Two-letter country code to route through.
   * @param {string} [options.session] - Sticky-session identifier.
   * @returns {{host: string, port: number, username: string, password: string}}
   */
  getProxyConfig(options) {
    let username = this.apiKey;
    if (options?.country) {
      username += `-country-${options.country}`;
    }
    if (options?.session) {
      username += `-session-${options.session}`;
    }
    return {
      host: "proxy.clearscrape.io",
      port: 8000,
      username,
      password: this.apiKey
    };
  }
  /**
   * Get a proxy URL string for use with HTTP clients / proxy agents.
   *
   * @param {object} [options] - Same options as getProxyConfig.
   * @returns {string} e.g. 'http://key-country-us:key@proxy.clearscrape.io:8000'
   *
   * @example
   * ```typescript
   * const HttpsProxyAgent = require('https-proxy-agent');
   * const agent = new HttpsProxyAgent(client.getProxyUrl());
   * axios.get(url, { httpsAgent: agent });
   * ```
   */
  getProxyUrl(options) {
    const config = this.getProxyConfig(options);
    return `http://${config.username}:${config.password}@${config.host}:${config.port}`;
  }
  /**
   * Get the WebSocket URL for the Scraping Browser (Playwright
   * `connectOverCDP` / Puppeteer `browserWSEndpoint`).
   *
   * @param {object} [options]
   * @param {string} [options.proxyCountry] - Two-letter country code.
   * @returns {string} wss:// URL carrying the API key as a query parameter.
   *
   * @example
   * ```typescript
   * const { chromium } = require('playwright');
   * const browser = await chromium.connectOverCDP(client.getBrowserWsUrl());
   * ```
   */
  getBrowserWsUrl(options) {
    // Encode values so keys/countries containing reserved characters
    // cannot corrupt the query string.
    let url = `wss://browser.clearscrape.io?apiKey=${encodeURIComponent(this.apiKey)}`;
    if (options?.proxyCountry) {
      url += `&proxy_country=${encodeURIComponent(options.proxyCountry)}`;
    }
    return url;
  }
  /**
   * Translate camelCase client options into the snake_case API payload.
   * Falsy/absent options are omitted so the API applies its own defaults.
   *
   * @param {object} options - Scraping options; `url` is required.
   * @returns {object} JSON-serializable request body.
   */
  buildPayload(options) {
    const payload = {
      url: options.url
    };
    if (options.method) payload.method = options.method;
    if (options.jsRender) payload.js_render = options.jsRender;
    if (options.premiumProxy) payload.premium_proxy = options.premiumProxy;
    if (options.antibot) payload.antibot = options.antibot;
    if (options.proxyCountry) payload.proxy_country = options.proxyCountry;
    if (options.waitFor) payload.wait_for = options.waitFor;
    if (options.wait) payload.wait = options.wait;
    if (options.autoScroll) payload.auto_scroll = options.autoScroll;
    if (options.screenshot) payload.screenshot = options.screenshot;
    if (options.screenshotSelector) payload.screenshot_selector = options.screenshotSelector;
    if (options.headers) payload.headers = options.headers;
    if (options.body) payload.body = options.body;
    if (options.domain) payload.domain = options.domain;
    return payload;
  }
  /**
   * POST a payload to the API, retrying transient network failures with
   * exponential backoff.
   *
   * @param {string} endpoint - API path, e.g. '/api/scrape'.
   * @param {object} payload - JSON request body.
   * @param {number} [attempt] - 1-based attempt counter (internal).
   * @returns {Promise<object>} Parsed JSON response.
   * @throws {ClearScrapeError|InsufficientCreditsError|RateLimitError}
   */
  async makeRequest(endpoint, payload, attempt = 1) {
    const url = `${this.baseUrl}${endpoint}`;
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: {
          "X-API-Key": this.apiKey,
          "Content-Type": "application/json"
        },
        body: JSON.stringify(payload),
        signal: controller.signal
      });
      clearTimeout(timeoutId);
      const data = await response.json();
      if (!response.ok) {
        return this.handleError(response.status, data, payload, attempt, endpoint);
      }
      return data;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof Error && error.name === "AbortError") {
        // Timeouts surface immediately; they are not retried.
        throw new ClearScrapeError("Request timeout", 408);
      }
      if (attempt < this.retries) {
        // Exponential backoff: 2s, 4s, 8s, ...
        const delay = Math.pow(2, attempt) * 1000;
        await this.sleep(delay);
        return this.makeRequest(endpoint, payload, attempt + 1);
      }
      throw new ClearScrapeError(
        error instanceof Error ? error.message : "Unknown error",
        500
      );
    }
  }
  /**
   * Map a non-2xx API response to a typed error, or retry when the status
   * is transient (429 or 5xx).
   *
   * Fix: the retry previously hard-coded '/api/scrape'; it now replays
   * the endpoint that originally failed (defaulting to '/api/scrape' for
   * backward compatibility).
   *
   * @param {number} statusCode - HTTP status of the failed response.
   * @param {object} response - Parsed error body.
   * @param {object} payload - Original request body (used for retries).
   * @param {number} attempt - 1-based attempt counter.
   * @param {string} [endpoint] - Endpoint to retry against.
   * @throws {ClearScrapeError|InsufficientCreditsError|RateLimitError}
   */
  async handleError(statusCode, response, payload, attempt, endpoint = "/api/scrape") {
    // Client errors other than 429 are not retryable.
    if (statusCode >= 400 && statusCode < 500 && statusCode !== 429) {
      if (statusCode === 402 && response.required) {
        throw new InsufficientCreditsError(response.message, response.required);
      }
      throw new ClearScrapeError(
        response.message || response.error || `Request failed with status ${statusCode}`,
        statusCode,
        response
      );
    }
    if (attempt < this.retries) {
      // Rate limits pause a fixed 5s; 5xx use exponential backoff.
      const delay = statusCode === 429 ? 5000 : Math.pow(2, attempt) * 1000;
      await this.sleep(delay);
      return this.makeRequest(endpoint, payload, attempt + 1);
    }
    if (statusCode === 429) {
      throw new RateLimitError(response.message || "Rate limit exceeded");
    }
    throw new ClearScrapeError(
      response.message || response.error || `Request failed with status ${statusCode}`,
      statusCode,
      response
    );
  }
  /**
   * Resolve after `ms` milliseconds.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
};
|
|
320
|
+
// Public ESM surface: the client class (also exposed as the default
// export) and the error hierarchy.
export {
  ClearScrape,
  ClearScrapeError,
  InsufficientCreditsError,
  RateLimitError,
  ClearScrape as default
};
|