firecrawl 1.6.1 → 1.7.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- package/README.md +40 -0
- package/dist/index.cjs +118 -0
- package/dist/index.d.cts +50 -1
- package/dist/index.d.ts +50 -1
- package/dist/index.js +118 -0
- package/dump.rdb +0 -0
- package/package.json +1 -1
- package/src/index.ts +164 -0
package/README.md
CHANGED
@@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
 });
 ```
 
+### Batch scraping multiple URLs
+
+To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
+
+```js
+const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
+  formats: ['markdown', 'html'],
+})
+```
+
+#### Asynchronous batch scrape
+
+To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument lets you define settings for the scrape, such as the output formats. Upon successful initiation, it returns an ID, which you use to subsequently check the status of the batch scrape.
+
+```js
+const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
+```
+
+#### Batch scrape with WebSockets
+
+To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
+
+```js
+// Batch scrape multiple URLs with WebSockets:
+const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
+
+watch.addEventListener("document", doc => {
+  console.log("DOC", doc.detail);
+});
+
+watch.addEventListener("error", err => {
+  console.error("ERR", err.detail.error);
+});
+
+watch.addEventListener("done", state => {
+  console.log("DONE", state.detail.status);
+});
+```
+
 ## Error Handling
 
 The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
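The new README section points to `try/catch`, but its snippets only show the happy path. A minimal sketch of the wrapped call, assuming an existing `FirecrawlApp` instance named `app` as in the README; the logging is illustrative:

```ts
// Sketch: the README's batch scrape call with the error handling the
// surrounding text describes. `app` is assumed to exist already.
try {
  const batchScrapeResponse = await app.batchScrapeUrls(
    ["https://firecrawl.dev", "https://mendable.ai"],
    { formats: ["markdown", "html"] }
  );
  if (!batchScrapeResponse.success) {
    // The SDK may also resolve with { success: false, error } instead of throwing.
    console.error("Batch scrape failed:", batchScrapeResponse.error);
  }
} catch (err: any) {
  // FirecrawlError carries the API's error message and HTTP status code.
  console.error("Request error:", err.message);
}
```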
package/dist/index.cjs
CHANGED
@@ -286,6 +286,124 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
+  /**
+   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds between job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the batch scrape operation.
+   */
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey) {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData = { urls, ...params ?? {} };
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData = { urls, ...params ?? {} };
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+  /**
+   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the batch scrape job.
+   */
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+    throw new FirecrawlError("Batch scrape job failed to start", 400);
+  }
+  /**
+   * Checks the status of a batch scrape job using the Firecrawl API.
+   * @param id - The ID of the batch scrape operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @returns The response containing the job status.
+   */
+  async checkBatchScrapeStatus(id, getAllData = false) {
+    if (!id) {
+      throw new FirecrawlError("No batch scrape ID provided", 400);
+    }
+    const headers = this.prepareHeaders();
+    try {
+      const response = await this.getRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}`,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data;
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while ("next" in statusData) {
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+        return {
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          expiresAt: new Date(response.data.expiresAt),
+          next: response.data.next,
+          data: allData,
+          error: response.data.error
+        };
+      } else {
+        this.handleError(response, "check batch scrape status");
+      }
+    } catch (error) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.
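The split between `batchScrapeUrls`, which polls internally via `monitorJobStatus`, and `asyncBatchScrapeUrls`, which returns immediately with an `id`, enables a start-then-poll workflow through `checkBatchScrapeStatus`. A minimal sketch of that pattern; the import uses the package's default export, and the two-second delay mirrors the default `pollInterval`:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Start the batch scrape without blocking on completion.
const started = await app.asyncBatchScrapeUrls(
  ["https://firecrawl.dev", "https://mendable.ai"],
  { formats: ["markdown"] }
);

if (started.success && started.id) {
  let status = await app.checkBatchScrapeStatus(started.id);
  // "status" only exists on BatchScrapeStatusResponse, so this condition
  // also filters out ErrorResponse results.
  while ("status" in status && status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    status = await app.checkBatchScrapeStatus(started.id);
  }
  console.log(status);
}
```

Both start methods also accept an optional `idempotencyKey` as their final argument, which lets the server deduplicate retried start requests.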
package/dist/index.d.cts
CHANGED
@@ -142,6 +142,16 @@ interface CrawlResponse {
     success: true;
     error?: string;
 }
+/**
+ * Response interface for batch scrape operations.
+ * Defines the structure of the response received after initiating a batch scrape.
+ */
+interface BatchScrapeResponse {
+    id?: string;
+    url?: string;
+    success: true;
+    error?: string;
+}
 /**
  * Response interface for job status checks.
  * Provides detailed status of a crawl job including progress and results.
@@ -156,6 +166,20 @@ interface CrawlStatusResponse {
     next?: string;
     data: FirecrawlDocument<undefined>[];
 }
+/**
+ * Response interface for batch scrape job status checks.
+ * Provides detailed status of a batch scrape job including progress and results.
+ */
+interface BatchScrapeStatusResponse {
+    success: true;
+    status: "scraping" | "completed" | "failed" | "cancelled";
+    completed: number;
+    total: number;
+    creditsUsed: number;
+    expiresAt: Date;
+    next?: string;
+    data: FirecrawlDocument<undefined>[];
+}
 /**
  * Parameters for mapping operations.
  * Defines options for mapping URLs during a crawl.
@@ -255,6 +279,31 @@ declare class FirecrawlApp {
      * @returns The response from the map operation.
      */
    mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse>;
+    /**
+     * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+     * @param urls - The URLs to scrape.
+     * @param params - Additional parameters for the scrape request.
+     * @param pollInterval - Time in seconds between job status checks.
+     * @param idempotencyKey - Optional idempotency key for the request.
+     * @returns The response from the batch scrape operation.
+     */
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
+    /**
+     * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+     * @param urls - The URLs to scrape.
+     * @param params - Additional parameters for the scrape request.
+     * @param idempotencyKey - Optional idempotency key for the request.
+     * @returns A CrawlWatcher instance to monitor the batch scrape job.
+     */
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
+    /**
+     * Checks the status of a batch scrape job using the Firecrawl API.
+     * @param id - The ID of the batch scrape operation.
+     * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+     * @returns The response containing the job status.
+     */
+    checkBatchScrapeStatus(id?: string, getAllData?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
     /**
      * Prepares the headers for an API request.
      * @param idempotencyKey - Optional key to ensure idempotency.
@@ -319,4 +368,4 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     close(): void;
 }
 
-export { type Action, type ActionsResult, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
+export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
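Because `BatchScrapeResponse` declares `success: true`, while `ErrorResponse` is the failure shape used throughout these typings, checking `success` narrows the union that `asyncBatchScrapeUrls` returns. A short consumer-side sketch, not part of the package, assuming `ErrorResponse` carries `success: false`:

```ts
import FirecrawlApp, {
  type BatchScrapeResponse,
  type ErrorResponse,
} from "firecrawl";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

const res: BatchScrapeResponse | ErrorResponse =
  await app.asyncBatchScrapeUrls(["https://firecrawl.dev"]);

if (res.success && res.id) {
  // Narrowed to BatchScrapeResponse; the id feeds checkBatchScrapeStatus.
  console.log("batch scrape job started:", res.id);
} else {
  console.error("could not start batch scrape:", res.error);
}
```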
package/dist/index.d.ts
CHANGED
(Identical to the package/dist/index.d.cts changes above: the same `BatchScrapeResponse` and `BatchScrapeStatusResponse` interfaces, the same four batch scrape method declarations on `FirecrawlApp`, and the same two additions to the export list.)
package/dist/index.js
CHANGED
(Identical to the package/dist/index.cjs changes above, applied to the ESM build in a single hunk at @@ -250,6 +250,124 @@: adds `batchScrapeUrls`, `asyncBatchScrapeUrls`, `batchScrapeUrlsAndWatch`, and `checkBatchScrapeStatus` to `FirecrawlApp`.)
package/dump.rdb
ADDED
Binary file
package/package.json
CHANGED
package/src/index.ts
CHANGED
@@ -154,6 +154,17 @@ export interface CrawlResponse {
   error?: string;
 }
 
+/**
+ * Response interface for batch scrape operations.
+ * Defines the structure of the response received after initiating a batch scrape.
+ */
+export interface BatchScrapeResponse {
+  id?: string;
+  url?: string;
+  success: true;
+  error?: string;
+}
+
 /**
  * Response interface for job status checks.
  * Provides detailed status of a crawl job including progress and results.
@@ -169,6 +180,21 @@ export interface CrawlStatusResponse {
   data: FirecrawlDocument<undefined>[];
 };
 
+/**
+ * Response interface for batch scrape job status checks.
+ * Provides detailed status of a batch scrape job including progress and results.
+ */
+export interface BatchScrapeStatusResponse {
+  success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
+  completed: number;
+  total: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};
+
 /**
  * Parameters for mapping operations.
  * Defines options for mapping URLs during a crawl.
@@ -493,6 +519,144 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds between job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the batch scrape operation.
+   */
+  async batchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string
+  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  async asyncBatchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string
+  ): Promise<BatchScrapeResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the batch scrape job.
+   */
+  async batchScrapeUrlsAndWatch(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+  ) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+
+    throw new FirecrawlError("Batch scrape job failed to start", 400);
+  }
+
+  /**
+   * Checks the status of a batch scrape job using the Firecrawl API.
+   * @param id - The ID of the batch scrape operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @returns The response containing the job status.
+   */
+  async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new FirecrawlError("No batch scrape ID provided", 400);
+    }
+
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}`,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data;
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while ('next' in statusData) {
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+        return ({
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          expiresAt: new Date(response.data.expiresAt),
+          next: response.data.next,
+          data: allData,
+          error: response.data.error,
+        })
+      } else {
+        this.handleError(response, "check batch scrape status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.
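The `getAllData` branch of `checkBatchScrapeStatus` is the one subtle part of this hunk: a completed job's documents can span several pages, each response carrying a `next` URL, and the loop keeps fetching and concatenating until no `next` remains. The same cursor-following pattern in isolation, as a sketch; `fetchPage` is a hypothetical stand-in for the SDK's authenticated `getRequest`:

```ts
// Cursor-style pagination, mirroring the loop in checkBatchScrapeStatus.
interface Page<T> {
  data: T[];
  next?: string; // absolute URL of the next page; absent on the last page
}

async function collectAllPages<T>(
  firstPage: Page<T>,
  fetchPage: (url: string) => Promise<Page<T>>
): Promise<T[]> {
  let page = firstPage;
  let all = page.data;
  while (page.next !== undefined) {
    page = await fetchPage(page.next);
    all = all.concat(page.data);
  }
  return all;
}
```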