@crawlgate/sdk 1.0.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/README.md +397 -0
- package/dist/index.cjs +1299 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1356 -0
- package/dist/index.d.ts +1356 -0
- package/dist/index.js +1255 -0
- package/dist/index.js.map +1 -0
- package/package.json +60 -0
package/dist/index.js
ADDED
@@ -0,0 +1,1255 @@
// src/utils/httpClient.ts
import axios from "axios";

// src/errors.ts
var CrawlGateError = class _CrawlGateError extends Error {
  /**
   * HTTP status code (if applicable)
   */
  statusCode;
  /**
   * Error code for programmatic handling
   */
  code;
  /**
   * Additional error details
   */
  details;
  constructor(message, statusCode, code, details) {
    super(message);
    this.name = "CrawlGateError";
    this.statusCode = statusCode;
    this.code = code;
    this.details = details;
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _CrawlGateError);
    }
  }
};
var AuthenticationError = class extends CrawlGateError {
  constructor(message = "Invalid API Key") {
    super(message, 401, "AUTHENTICATION_ERROR");
    this.name = "AuthenticationError";
  }
};
var ValidationError = class extends CrawlGateError {
  constructor(message, details) {
    super(message, 400, "VALIDATION_ERROR", details);
    this.name = "ValidationError";
  }
};
var JobTimeoutError = class extends CrawlGateError {
  /**
   * Job ID that timed out
   */
  jobId;
  /**
   * Timeout duration in seconds
   */
  timeoutSeconds;
  constructor(jobId, timeoutSeconds) {
    super(
      `Crawl job ${jobId} did not complete within ${timeoutSeconds} seconds`,
      void 0,
      "JOB_TIMEOUT"
    );
    this.name = "JobTimeoutError";
    this.jobId = jobId;
    this.timeoutSeconds = timeoutSeconds;
  }
};
var ServiceUnavailableError = class extends CrawlGateError {
  constructor(message = "Service temporarily unavailable") {
    super(message, 503, "SERVICE_UNAVAILABLE");
    this.name = "ServiceUnavailableError";
  }
};
var RateLimitError = class extends CrawlGateError {
  /**
   * Time to wait before retrying (in seconds)
   */
  retryAfter;
  constructor(message = "Rate limit exceeded", retryAfter) {
    super(message, 429, "RATE_LIMIT_EXCEEDED");
    this.name = "RateLimitError";
    this.retryAfter = retryAfter;
  }
};
var ExtractionError = class extends CrawlGateError {
  /**
   * Provider that failed
   */
  provider;
  constructor(message, provider) {
    super(message, void 0, "EXTRACTION_ERROR");
    this.name = "ExtractionError";
    this.provider = provider;
  }
};
function parseApiError(status, data) {
  const message = data.error || data.message || "Unknown error";
  switch (status) {
    case 400:
      throw new ValidationError(message, data.details);
    case 401:
      throw new AuthenticationError(message);
    case 429:
      throw new RateLimitError(message);
    case 502:
    case 503:
      throw new ServiceUnavailableError(message);
    default:
      throw new CrawlGateError(message, status, void 0, data.details);
  }
}

// src/utils/httpClient.ts
var HttpClient = class {
  instance;
  apiKey;
  apiUrl;
  maxRetries;
  backoffFactor;
  constructor(options) {
    this.apiKey = options.apiKey;
    this.apiUrl = options.apiUrl.replace(/\/$/, "");
    this.maxRetries = options.maxRetries ?? 3;
    this.backoffFactor = options.backoffFactor ?? 0.5;
    this.instance = axios.create({
      baseURL: this.apiUrl,
      timeout: options.timeoutMs ?? 9e4,
      headers: {
        "Content-Type": "application/json",
        "x-api-key": this.apiKey
      }
    });
  }
  /**
   * Get the configured API URL
   */
  getApiUrl() {
    return this.apiUrl;
  }
  /**
   * Get the configured API key
   */
  getApiKey() {
    return this.apiKey;
  }
  /**
   * Sleep for specified seconds
   */
  sleep(seconds) {
    return new Promise((resolve) => setTimeout(resolve, seconds * 1e3));
  }
  /**
   * Check if error is retryable
   */
  isRetryableError(status) {
    return status === 502 || status === 503 || status === 429;
  }
  /**
   * Make HTTP request with retry logic
   */
  async request(config) {
    let lastError;
    for (let attempt = 0; attempt < this.maxRetries; attempt++) {
      try {
        if (config.method && ["post", "put", "patch"].includes(config.method.toLowerCase())) {
          const data = config.data ?? {};
          config.data = { ...data, origin: "crawlgate-sdk" };
          if (typeof data.timeout === "number") {
            config.timeout = data.timeout + 5e3;
          }
        }
        const response = await this.instance.request(config);
        if (this.isRetryableError(response.status) && attempt < this.maxRetries - 1) {
          await this.sleep(this.backoffFactor * Math.pow(2, attempt));
          continue;
        }
        return response;
      } catch (err) {
        const axiosError = err;
        lastError = err instanceof Error ? err : new Error(String(axiosError?.message));
        const status = axiosError?.response?.status;
        if (this.isRetryableError(status) && attempt < this.maxRetries - 1) {
          await this.sleep(this.backoffFactor * Math.pow(2, attempt));
          continue;
        }
        if (axiosError?.response) {
          parseApiError(
            axiosError.response.status,
            axiosError.response.data
          );
        }
        throw lastError;
      }
    }
    throw lastError ?? new CrawlGateError("Unexpected HTTP client error");
  }
  /**
   * Make POST request
   */
  async post(endpoint, body, headers) {
    return this.request({
      method: "post",
      url: endpoint,
      data: body,
      headers
    });
  }
  /**
   * Make GET request
   */
  async get(endpoint, headers) {
    return this.request({
      method: "get",
      url: endpoint,
      headers
    });
  }
  /**
   * Make DELETE request
   */
  async delete(endpoint, headers) {
    return this.request({
      method: "delete",
      url: endpoint,
      headers
    });
  }
};

// src/methods/scrape.ts
import { zodToJsonSchema } from "zod-to-json-schema";
function convertSchema(schema) {
  if (schema && typeof schema === "object" && "_def" in schema) {
    return zodToJsonSchema(schema);
  }
  return schema;
}
function buildScrapeBody(url, options) {
  const body = { url };
  if (options?.engine) {
    body.engine = options.engine;
  }
  if (options?.formats) {
    body.formats = options.formats;
  }
  if (options?.onlyMainContent !== void 0) {
    body.onlyMainContent = options.onlyMainContent;
  }
  if (options?.excludeTags) {
    body.excludeTags = options.excludeTags;
  }
  if (options?.waitFor !== void 0) {
    body.waitFor = options.waitFor;
  }
  if (options?.timeout !== void 0) {
    body.timeout = options.timeout;
  }
  if (options?.proxy) {
    body.proxy = options.proxy;
  }
  if (options?.projectId) {
    body.project_id = options.projectId;
  }
  if (options?.extract) {
    body.extract = {
      schema: convertSchema(options.extract.schema),
      systemPrompt: options.extract.systemPrompt,
      provider: options.extract.provider,
      enableFallback: options.extract.enableFallback
    };
    Object.keys(body.extract).forEach((key) => {
      if (body.extract[key] === void 0) {
        delete body.extract[key];
      }
    });
  }
  return body;
}
async function scrape(http, url, options) {
  const body = buildScrapeBody(url, options);
  const response = await http.post("/v1/scrape", body);
  if (!response.data.success) {
    throw new CrawlGateError(
      response.data.error || "Scrape failed",
      void 0,
      "SCRAPE_ERROR"
    );
  }
  if (!response.data.data) {
    throw new CrawlGateError("No data returned from scrape", void 0, "NO_DATA");
  }
  const document = {
    ...response.data.data
  };
  return document;
}

// src/methods/crawl.ts
function buildCrawlBody(url, options) {
  const body = { url };
  if (options?.engine) {
    body.engine = options.engine;
  }
  if (options?.limit !== void 0) {
    body.limit = options.limit;
  }
  if (options?.formats) {
    body.formats = options.formats;
  }
  if (options?.onlyMainContent !== void 0) {
    body.onlyMainContent = options.onlyMainContent;
  }
  if (options?.excludeTags) {
    body.excludeTags = options.excludeTags;
  }
  if (options?.proxy) {
    body.proxy = options.proxy;
  }
  if (options?.projectId) {
    body.project_id = options.projectId;
  }
  return body;
}
async function startCrawl(http, url, options) {
  const body = buildCrawlBody(url, options);
  const response = await http.post("/v1/crawl", body);
  if (!response.data.success && !response.data.id) {
    throw new CrawlGateError(
      "Failed to start crawl job",
      void 0,
      "CRAWL_START_ERROR"
    );
  }
  return {
    success: true,
    id: response.data.id,
    jobId: response.data.id,
    status: response.data.status || "scraping",
    engine: response.data.engine
  };
}
async function getCrawlStatus(http, jobId) {
  const response = await http.get(`/v1/crawl/${jobId}`);
  return {
    id: response.data.id || jobId,
    status: response.data.status,
    total: response.data.total || 0,
    completed: response.data.completed || 0,
    data: response.data.data || [],
    engine: response.data.engine,
    error: response.data.error
  };
}
async function cancelCrawl(http, jobId) {
  const response = await http.delete(
    `/v1/crawl/${jobId}`
  );
  return response.data.success !== false;
}
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
async function crawl(http, url, options) {
  const pollInterval = options?.pollInterval ?? 2e3;
  const timeout = options?.timeout ?? 300;
  const { id: jobId } = await startCrawl(http, url, options);
  const startTime = Date.now();
  const timeoutMs = timeout * 1e3;
  while (true) {
    const status = await getCrawlStatus(http, jobId);
    if (status.status === "completed") {
      return status;
    }
    if (status.status === "failed") {
      throw new CrawlGateError(
        status.error || "Crawl job failed",
        void 0,
        "CRAWL_FAILED"
      );
    }
    if (status.status === "cancelled") {
      throw new CrawlGateError(
        "Crawl job was cancelled",
        void 0,
        "CRAWL_CANCELLED"
      );
    }
    if (Date.now() - startTime > timeoutMs) {
      throw new JobTimeoutError(jobId, timeout);
    }
    await sleep(pollInterval);
  }
}
async function getCrawlErrors(http, jobId) {
  const response = await http.get(`/v1/crawl/${jobId}/errors`);
  const payload = response.data.data ?? response.data;
  return {
    errors: (payload.errors || []).map((e) => ({
      id: e.id || "",
      timestamp: e.timestamp,
      url: e.url || "",
      code: e.code,
      error: e.error || e.message || "Unknown error"
    })),
    robotsBlocked: payload.robotsBlocked || []
  };
}

// src/methods/map.ts
function buildMapBody(url, options) {
  const body = { url };
  if (options?.engine) {
    body.engine = options.engine;
  }
  if (options?.proxy) {
    body.proxy = options.proxy;
  }
  if (options?.projectId) {
    body.project_id = options.projectId;
  }
  return body;
}
async function map(http, url, options) {
  const body = buildMapBody(url, options);
  const response = await http.post("/v1/map", body);
  if (!response.data.success) {
    throw new CrawlGateError(
      response.data.error || "Map failed",
      void 0,
      "MAP_ERROR"
    );
  }
  return {
    success: true,
    links: response.data.links || [],
    count: response.data.count || response.data.links?.length || 0,
    engine: response.data.engine
  };
}

// src/methods/search.ts
import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
function convertSchema2(schema) {
  if (schema && typeof schema === "object" && "_def" in schema) {
    return zodToJsonSchema2(schema);
  }
  return schema;
}
function buildSearchBody(query, options) {
  const body = { query };
  if (options?.limit !== void 0) {
    body.limit = options.limit;
  }
  if (options?.lang) {
    body.lang = options.lang;
  }
  if (options?.country) {
    body.country = options.country;
  }
  if (options?.engines) {
    body.engines = options.engines;
  }
  if (options?.scrapeOptions) {
    body.scrapeOptions = options.scrapeOptions;
  }
  if (options?.engine) {
    body.engine = options.engine;
  }
  if (options?.projectId) {
    body.project_id = options.projectId;
  }
  if (options?.extract) {
    body.extract = {
      schema: convertSchema2(options.extract.schema),
      systemPrompt: options.extract.systemPrompt,
      provider: options.extract.provider,
      enableFallback: options.extract.enableFallback
    };
    Object.keys(body.extract).forEach((key) => {
      if (body.extract[key] === void 0) {
        delete body.extract[key];
      }
    });
  }
  return body;
}
async function search(http, query, options) {
  const body = buildSearchBody(query, options);
  const response = await http.post("/v1/search", body);
  if (!response.data.success) {
    throw new CrawlGateError(
      response.data.error || "Search failed",
      void 0,
      "SEARCH_ERROR"
    );
  }
  return {
    success: true,
    data: response.data.data || [],
    query: response.data.query || query,
    totalResults: response.data.totalResults,
    searchTime: response.data.searchTime,
    extract: response.data.extract
  };
}

// src/methods/batch.ts
function buildBatchBody(urls, options) {
  const body = { urls };
  if (options?.options) {
    const scrapeOpts = options.options;
    if (scrapeOpts.engine) body.engine = scrapeOpts.engine;
    if (scrapeOpts.formats) body.formats = scrapeOpts.formats;
    if (scrapeOpts.onlyMainContent !== void 0) body.onlyMainContent = scrapeOpts.onlyMainContent;
    if (scrapeOpts.excludeTags) body.excludeTags = scrapeOpts.excludeTags;
    if (scrapeOpts.waitFor !== void 0) body.waitFor = scrapeOpts.waitFor;
    if (scrapeOpts.timeout !== void 0) body.timeout = scrapeOpts.timeout;
    if (scrapeOpts.proxy) body.proxy = scrapeOpts.proxy;
  }
  if (options?.webhook != null) {
    body.webhook = options.webhook;
  }
  if (options?.appendToId != null) {
    body.appendToId = options.appendToId;
  }
  if (options?.ignoreInvalidURLs != null) {
    body.ignoreInvalidURLs = options.ignoreInvalidURLs;
  }
  if (options?.maxConcurrency != null) {
    body.maxConcurrency = options.maxConcurrency;
  }
  if (options?.projectId) {
    body.project_id = options.projectId;
  }
  return body;
}
async function startBatchScrape(http, urls, options) {
  if (!Array.isArray(urls) || urls.length === 0) {
    throw new CrawlGateError("URLs array cannot be empty", 400, "VALIDATION_ERROR");
  }
  const body = buildBatchBody(urls, options);
  const headers = {};
  if (options?.idempotencyKey) {
    headers["Idempotency-Key"] = options.idempotencyKey;
  }
  const response = await http.post(
    "/v1/batch/scrape",
    body,
    Object.keys(headers).length > 0 ? headers : void 0
  );
  if (!response.data.success && !response.data.id) {
    throw new CrawlGateError(
      response.data.error || "Failed to start batch scrape job",
      void 0,
      "BATCH_START_ERROR"
    );
  }
  return {
    success: true,
    id: response.data.id,
    url: response.data.url,
    invalidURLs: response.data.invalidURLs
  };
}
async function getBatchScrapeStatus(http, jobId) {
  const response = await http.get(
    `/v1/batch/scrape/${jobId}`
  );
  return {
    id: response.data.id || jobId,
    status: response.data.status,
    total: response.data.total || 0,
    completed: response.data.completed || 0,
    creditsUsed: response.data.creditsUsed,
    expiresAt: response.data.expiresAt,
    next: response.data.next ?? null,
    data: response.data.data || [],
    error: response.data.error
  };
}
async function cancelBatchScrape(http, jobId) {
  const response = await http.delete(
    `/v1/batch/scrape/${jobId}`
  );
  return response.data.status === "cancelled" || response.data.success !== false;
}
async function getBatchScrapeErrors(http, jobId) {
  const response = await http.get(`/v1/batch/scrape/${jobId}/errors`);
  const payload = response.data.data ?? response.data;
  return {
    errors: (payload.errors || []).map((e) => ({
      id: e.id || "",
      timestamp: e.timestamp,
      url: e.url || "",
      code: e.code,
      error: e.error || e.message || "Unknown error"
    })),
    robotsBlocked: payload.robotsBlocked || []
  };
}
function sleep2(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
async function waitForBatchCompletion(http, jobId, pollInterval = 2e3, timeout) {
  const startTime = Date.now();
  const timeoutMs = timeout ? timeout * 1e3 : void 0;
  while (true) {
    const status = await getBatchScrapeStatus(http, jobId);
    if (status.status === "completed") {
      return status;
    }
    if (status.status === "failed") {
      throw new CrawlGateError(
        status.error || "Batch scrape job failed",
        void 0,
        "BATCH_FAILED"
      );
    }
    if (status.status === "cancelled") {
      throw new CrawlGateError(
        "Batch scrape job was cancelled",
        void 0,
        "BATCH_CANCELLED"
      );
    }
    if (timeoutMs && Date.now() - startTime > timeoutMs) {
      throw new JobTimeoutError(jobId, timeout);
    }
    await sleep2(Math.max(1e3, pollInterval));
  }
}
async function batchScrape(http, urls, options) {
  const pollInterval = options?.pollInterval ?? 2e3;
  const timeout = options?.timeout;
  const { id: jobId } = await startBatchScrape(http, urls, options);
  return waitForBatchCompletion(http, jobId, pollInterval, timeout);
}

// src/methods/extract.ts
import { zodToJsonSchema as zodToJsonSchema3 } from "zod-to-json-schema";
function isZodSchema(value) {
  return value !== null && typeof value === "object" && "_def" in value && (typeof value.safeParse === "function" || typeof value.parse === "function");
}
function convertSchema3(schema) {
  if (isZodSchema(schema)) {
    return zodToJsonSchema3(schema);
  }
  return schema;
}
function buildExtractBody(options) {
  const body = {};
  if (options.urls) {
    body.urls = options.urls;
  }
  if (options.prompt != null) {
    body.prompt = options.prompt;
  }
  if (options.schema != null) {
    body.schema = convertSchema3(options.schema);
  }
  if (options.systemPrompt != null) {
    body.systemPrompt = options.systemPrompt;
  }
  if (options.allowExternalLinks != null) {
    body.allowExternalLinks = options.allowExternalLinks;
  }
  if (options.enableWebSearch != null) {
    body.enableWebSearch = options.enableWebSearch;
  }
  if (options.showSources != null) {
    body.showSources = options.showSources;
  }
  if (options.ignoreInvalidURLs != null) {
    body.ignoreInvalidURLs = options.ignoreInvalidURLs;
  }
  if (options.provider) {
    body.provider = options.provider;
  }
  if (options.projectId) {
    body.project_id = options.projectId;
  }
  if (options.scrapeOptions) {
    body.scrapeOptions = options.scrapeOptions;
  }
  return body;
}
async function startExtract(http, options) {
  const body = buildExtractBody(options);
  const response = await http.post("/v1/extract", body);
  if (response.data.success === false && response.data.error) {
    throw new CrawlGateError(
      response.data.error,
      void 0,
      "EXTRACT_ERROR"
    );
  }
  return response.data;
}
async function getExtractStatus(http, jobId) {
  const response = await http.get(`/v1/extract/${jobId}`);
  if (response.data.success === false && response.data.error) {
    throw new CrawlGateError(
      response.data.error,
      void 0,
      "EXTRACT_STATUS_ERROR"
    );
  }
  return response.data;
}
function sleep3(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
async function waitForExtractCompletion(http, jobId, pollInterval = 2e3, timeout) {
  const startTime = Date.now();
  const timeoutMs = timeout ? timeout * 1e3 : void 0;
  while (true) {
    const status = await getExtractStatus(http, jobId);
    if (status.status === "completed") {
      return status;
    }
    if (status.status === "failed") {
      throw new CrawlGateError(
        status.error || "Extract job failed",
        void 0,
        "EXTRACT_FAILED"
      );
    }
    if (status.status === "cancelled") {
      throw new CrawlGateError(
        "Extract job was cancelled",
        void 0,
        "EXTRACT_CANCELLED"
      );
    }
    if (timeoutMs && Date.now() - startTime > timeoutMs) {
      throw new JobTimeoutError(jobId, timeout);
    }
    await sleep3(Math.max(1e3, pollInterval));
  }
}
async function extract(http, options) {
  const pollInterval = options.pollInterval ?? 2e3;
  const timeout = options.timeout;
  const started = await startExtract(http, options);
  if (!started.id) {
    return started;
  }
  if (started.status === "completed") {
    return started;
  }
  return waitForExtractCompletion(http, started.id, pollInterval, timeout);
}

// src/methods/usage.ts
async function getConcurrency(http) {
  const response = await http.get("/v1/concurrency");
  return {
    concurrency: response.data.concurrency ?? 0,
    maxConcurrency: response.data.maxConcurrency ?? 0
  };
}
async function getCreditUsage(http) {
  const response = await http.get("/v1/credits");
  return {
    remainingCredits: response.data.remainingCredits ?? 0,
    planCredits: response.data.planCredits,
    billingPeriodStart: response.data.billingPeriodStart,
    billingPeriodEnd: response.data.billingPeriodEnd
  };
}
async function getTokenUsage(http) {
  const response = await http.get("/v1/tokens");
  return {
    remainingTokens: response.data.remainingTokens ?? 0,
    planTokens: response.data.planTokens,
    billingPeriodStart: response.data.billingPeriodStart,
    billingPeriodEnd: response.data.billingPeriodEnd
  };
}
async function getQueueStatus(http) {
  const response = await http.get("/v1/queue");
  return {
    success: response.data.success ?? true,
    jobsInQueue: response.data.jobsInQueue ?? 0,
    activeJobsInQueue: response.data.activeJobsInQueue ?? 0,
    waitingJobsInQueue: response.data.waitingJobsInQueue ?? 0,
    maxConcurrency: response.data.maxConcurrency ?? 0,
    mostRecentSuccess: response.data.mostRecentSuccess
  };
}

// src/client.ts
var CrawlGateClient = class {
  http;
  /**
   * Create a new CrawlGate client
   *
   * @param options - Client configuration options
   * @throws {CrawlGateError} If API key is not provided
   */
  constructor(options = {}) {
    const apiKey = options.apiKey ?? process.env.CRAWLGATE_API_KEY ?? "";
    const apiUrl = (options.apiUrl ?? process.env.CRAWLGATE_API_URL ?? "https://api.crawlgate.io").replace(/\/$/, "");
    if (!apiKey) {
      throw new CrawlGateError(
        "API key is required. Set CRAWLGATE_API_KEY env variable or pass apiKey option.",
        void 0,
        "MISSING_API_KEY"
      );
    }
    this.http = new HttpClient({
      apiKey,
      apiUrl,
      timeoutMs: options.timeoutMs,
      maxRetries: options.maxRetries,
      backoffFactor: options.backoffFactor
    });
  }
  // ==========================================================================
  // Scrape Methods
  // ==========================================================================
  /**
   * Scrape a single URL
   *
   * @param url - URL to scrape
   * @param options - Scrape options
   * @returns Scraped document with requested formats
   *
   * @example
   * ```typescript
   * const doc = await client.scrape('https://example.com', {
   *   engine: 'smart',
   *   formats: ['markdown', 'html'],
   *   onlyMainContent: true
   * });
   * console.log(doc.markdown);
   * ```
   *
   * @example With LLM extraction
   * ```typescript
   * import { z } from 'zod';
   *
   * const schema = z.object({
   *   title: z.string(),
   *   price: z.number(),
   *   inStock: z.boolean()
   * });
   *
   * const doc = await client.scrape('https://example.com/product', {
   *   engine: 'smart',
   *   extract: {
   *     schema,
   *     systemPrompt: 'Extract product details',
   *     provider: 'openai'
   *   }
   * });
   * console.log(doc.extract?.data);
   * ```
   */
  async scrape(url, options) {
    return scrape(this.http, url, options);
  }
  // ==========================================================================
  // Batch Scrape Methods
  // ==========================================================================
  /**
   * Start a batch scrape job (async)
   *
   * @param urls - Array of URLs to scrape
   * @param options - Batch scrape options
   * @returns Batch job ID and initial status
   *
   * @example
   * ```typescript
   * const { id } = await client.startBatchScrape(
   *   ['https://a.com', 'https://b.com', 'https://c.com'],
   *   { options: { formats: ['markdown'] } }
   * );
   *
   * // Poll manually
   * let status = await client.getBatchScrapeStatus(id);
   * while (status.status === 'scraping') {
   *   await new Promise(r => setTimeout(r, 2000));
   *   status = await client.getBatchScrapeStatus(id);
   * }
   * ```
   */
  async startBatchScrape(urls, options) {
    return startBatchScrape(this.http, urls, options);
  }
  /**
   * Get batch scrape job status and data
   *
   * @param jobId - Batch job ID
   * @returns Current job status and scraped data
   */
  async getBatchScrapeStatus(jobId) {
    return getBatchScrapeStatus(this.http, jobId);
  }
  /**
   * Cancel a batch scrape job
   *
   * @param jobId - Batch job ID
   * @returns True if cancelled successfully
   */
  async cancelBatchScrape(jobId) {
    return cancelBatchScrape(this.http, jobId);
  }
  /**
   * Get batch scrape job errors
   *
   * @param jobId - Batch job ID
   * @returns Errors and robots.txt blocked URLs
   */
  async getBatchScrapeErrors(jobId) {
    return getBatchScrapeErrors(this.http, jobId);
  }
  /**
   * Batch scrape multiple URLs and wait for completion
   *
   * @param urls - Array of URLs to scrape
   * @param options - Batch options including pollInterval and timeout
   * @returns Final job with all scraped data
   *
   * @example
   * ```typescript
   * const job = await client.batchScrape(
   *   ['https://a.com', 'https://b.com', 'https://c.com'],
   *   {
   *     options: { formats: ['markdown'], engine: 'smart' },
   *     pollInterval: 2000,
   *     timeout: 300
   *   }
   * );
   *
   * console.log(`Scraped ${job.completed} URLs`);
   * job.data.forEach(doc => console.log(doc.url, doc.markdown?.length));
   * ```
   */
  async batchScrape(urls, options) {
    return batchScrape(this.http, urls, options);
  }
  // ==========================================================================
  // Crawl Methods
  // ==========================================================================
  /**
   * Start a crawl job (async)
   *
   * Use this method when you want to start a crawl and manage polling yourself.
   * For automatic polling, use the `crawl()` method instead.
   *
   * @param url - Root URL to crawl
   * @param options - Crawl options
   * @returns Crawl job ID and initial status
   *
   * @example
   * ```typescript
   * const { id } = await client.startCrawl('https://example.com', {
   *   limit: 10,
   *   engine: 'dynamic'
   * });
   *
   * // Poll for status manually
   * let status = await client.getCrawlStatus(id);
   * while (status.status === 'scraping') {
   *   await new Promise(r => setTimeout(r, 2000));
   *   status = await client.getCrawlStatus(id);
   * }
   * ```
   */
  async startCrawl(url, options) {
    return startCrawl(this.http, url, options);
  }
  /**
   * Get crawl job status and data
   *
   * @param jobId - Crawl job ID
   * @returns Current job status and scraped data
   */
  async getCrawlStatus(jobId) {
    return getCrawlStatus(this.http, jobId);
  }
  /**
   * Cancel a crawl job
   *
   * @param jobId - Crawl job ID
   * @returns True if cancelled successfully
   */
  async cancelCrawl(jobId) {
    return cancelCrawl(this.http, jobId);
  }
  /**
   * Get crawl job errors and robots.txt blocks
   *
   * @param jobId - Crawl job ID
   * @returns Errors and robots.txt blocked URLs
   */
  async getCrawlErrors(jobId) {
    return getCrawlErrors(this.http, jobId);
  }
  /**
   * Crawl a website and wait for completion
   *
   * This method starts a crawl job and automatically polls until completion.
   *
   * @param url - Root URL to crawl
   * @param options - Crawl options including pollInterval and timeout
   * @returns Final crawl job with all scraped data
   *
   * @example
   * ```typescript
   * const job = await client.crawl('https://example.com', {
   *   limit: 10,
   *   engine: 'dynamic',
   *   formats: ['markdown'],
   *   pollInterval: 2000, // Poll every 2 seconds
   *   timeout: 300 // 5 minute timeout
   * });
   *
   * console.log(`Crawled ${job.completed} pages`);
   * job.data.forEach(doc => console.log(doc.url));
   * ```
   */
  async crawl(url, options) {
    return crawl(this.http, url, options);
  }
  // ==========================================================================
  // Extract Methods (Standalone LLM Extraction)
  // ==========================================================================
  /**
   * Start an extract job (async)
   *
   * @param options - Extract request options
   * @returns Extract job ID or immediate result
   *
   * @example
   * ```typescript
   * const { id } = await client.startExtract({
   *   urls: ['https://example.com/product'],
   *   schema: { name: 'string', price: 'number' },
   *   provider: 'openai'
   * });
   *
   * // Poll manually
   * let status = await client.getExtractStatus(id);
   * while (status.status === 'processing') {
   *   await new Promise(r => setTimeout(r, 2000));
   *   status = await client.getExtractStatus(id);
   * }
   * console.log(status.data);
   * ```
   */
  async startExtract(options) {
    return startExtract(this.http, options);
  }
  /**
   * Get extract job status and data
   *
   * @param jobId - Extract job ID
   * @returns Current job status and extracted data
   */
  async getExtractStatus(jobId) {
    return getExtractStatus(this.http, jobId);
  }
  /**
   * Extract structured data from URLs using LLM and wait for completion
   *
   * @param options - Extract options including schema, prompt, and timeout
   * @returns Final extract result with structured data
   *
   * @example With Zod schema
   * ```typescript
   * import { z } from 'zod';
   *
   * const result = await client.extract({
   *   urls: ['https://example.com/product'],
   *   schema: z.object({
   *     name: z.string(),
   *     price: z.number(),
   *     inStock: z.boolean(),
   *     features: z.array(z.string())
   *   }),
   *   systemPrompt: 'Extract product information from the page',
   *   provider: 'openai',
   *   timeout: 60
   * });
   *
   * console.log(result.data);
   * ```
   *
   * @example With natural language prompt
   * ```typescript
   * const result = await client.extract({
   *   urls: ['https://example.com/about'],
   *   prompt: 'Extract the company name, founding year, and list of team members',
   *   enableWebSearch: true
   * });
   *
   * console.log(result.data);
   * ```
   */
  async extract(options) {
    return extract(this.http, options);
  }
  // ==========================================================================
  // Map Methods
  // ==========================================================================
  /**
   * Map a website to discover all URLs
   *
   * @param url - Root URL to map
   * @param options - Map options
   * @returns List of discovered URLs
   *
   * @example
   * ```typescript
   * const result = await client.map('https://example.com', {
   *   engine: 'dynamic'
   * });
   *
   * console.log(`Found ${result.count} URLs:`);
   * result.links.forEach(url => console.log(url));
   * ```
   */
  async map(url, options) {
    return map(this.http, url, options);
  }
  // ==========================================================================
  // Search Methods
  // ==========================================================================
  /**
   * Search the web and optionally scrape results
   *
   * @param query - Search query
   * @param options - Search options
   * @returns Search results with optional scraped content
   *
   * @example Basic search
   * ```typescript
   * const results = await client.search('best restaurants in NYC', {
   *   limit: 10,
   *   lang: 'en',
   *   country: 'us'
   * });
   *
   * results.data.forEach(r => {
   *   console.log(`${r.title}: ${r.url}`);
   * });
   * ```
   *
   * @example Search with scraping
   * ```typescript
   * const results = await client.search('best laptops 2024', {
   *   limit: 5,
   *   scrapeOptions: {
   *     formats: ['markdown']
   *   },
   *   engine: 'smart'
   * });
   *
   * results.data.forEach(r => {
   *   console.log(r.title);
   *   console.log(r.markdown?.substring(0, 200));
   * });
   * ```
   *
   * @example Search with LLM extraction
   * ```typescript
   * import { z } from 'zod';
   *
   * const results = await client.search('iPhone 15 Pro reviews', {
   *   limit: 5,
   *   scrapeOptions: { formats: ['markdown'] },
   *   extract: {
   *     schema: z.object({
   *       pros: z.array(z.string()),
   *       cons: z.array(z.string()),
   *       rating: z.number()
   *     }),
   *     systemPrompt: 'Extract review summary from the content'
   *   }
   * });
   *
   * console.log(results.extract?.data);
   * ```
   */
  async search(query, options) {
    return search(this.http, query, options);
  }
  // ==========================================================================
  // Usage & Monitoring Methods
  // ==========================================================================
  /**
   * Get current concurrency usage
   *
   * @returns Current and max concurrency
   *
   * @example
   * ```typescript
   * const { concurrency, maxConcurrency } = await client.getConcurrency();
   * console.log(`Using ${concurrency}/${maxConcurrency} concurrent requests`);
   * ```
   */
  async getConcurrency() {
    return getConcurrency(this.http);
  }
  /**
   * Get current credit usage
   *
   * @returns Credit usage information
   *
   * @example
   * ```typescript
   * const credits = await client.getCreditUsage();
   * console.log(`Remaining credits: ${credits.remainingCredits}`);
   * ```
   */
  async getCreditUsage() {
    return getCreditUsage(this.http);
  }
  /**
   * Get current token usage (for LLM extraction)
   *
   * @returns Token usage information
   *
   * @example
   * ```typescript
   * const tokens = await client.getTokenUsage();
   * console.log(`Remaining tokens: ${tokens.remainingTokens}`);
   * ```
   */
  async getTokenUsage() {
    return getTokenUsage(this.http);
  }
  /**
   * Get queue status information
   *
   * @returns Queue status metrics
   *
   * @example
   * ```typescript
   * const queue = await client.getQueueStatus();
   * console.log(`Jobs in queue: ${queue.jobsInQueue}`);
   * console.log(`Active: ${queue.activeJobsInQueue}, Waiting: ${queue.waitingJobsInQueue}`);
   * ```
   */
  async getQueueStatus() {
    return getQueueStatus(this.http);
  }
};
export {
  AuthenticationError,
  CrawlGateClient,
  CrawlGateError,
  ExtractionError,
  JobTimeoutError,
  RateLimitError,
  ServiceUnavailableError,
  ValidationError,
  CrawlGateClient as default
};
//# sourceMappingURL=index.js.map
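For reference, here is a minimal usage sketch of the client bundled above. It is not part of the published package: the target URL is a placeholder, and it assumes CRAWLGATE_API_KEY is set in the environment (which the constructor reads when no apiKey option is passed).

```typescript
import { CrawlGateClient, CrawlGateError, RateLimitError } from "@crawlgate/sdk";

// Constructor falls back to process.env.CRAWLGATE_API_KEY / CRAWLGATE_API_URL.
const client = new CrawlGateClient();

async function run(): Promise<void> {
  try {
    // Single-page scrape; option names mirror the JSDoc examples in the bundle.
    const doc = await client.scrape("https://example.com", {
      formats: ["markdown"],
      onlyMainContent: true
    });
    console.log(doc.markdown);
  } catch (err) {
    if (err instanceof RateLimitError) {
      // retryAfter may be undefined when the API does not supply it.
      console.error(`Rate limited; retry after ${err.retryAfter ?? "?"}s`);
    } else if (err instanceof CrawlGateError) {
      console.error(`${err.code}: ${err.message}`);
    } else {
      throw err;
    }
  }
}

run();
```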