firecrawl 1.2.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +410 -0
- package/dist/index.d.cts +264 -0
- package/{types → dist}/index.d.ts +39 -34
- package/dist/index.js +375 -0
- package/package.json +12 -14
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +111 -88
- package/src/index.ts +67 -40
- package/tsconfig.json +19 -105
- package/tsup.config.ts +9 -0
- package/build/cjs/index.js +0 -347
- package/build/cjs/package.json +0 -1
- package/build/esm/index.js +0 -339
- package/build/esm/package.json +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse,
|
|
1
|
+
import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index';
|
|
2
2
|
import { v4 as uuidv4 } from 'uuid';
|
|
3
3
|
import dotenv from 'dotenv';
|
|
4
4
|
import { describe, test, expect } from '@jest/globals';
|
|
@@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals';
|
|
|
6
6
|
dotenv.config();
|
|
7
7
|
|
|
8
8
|
const TEST_API_KEY = process.env.TEST_API_KEY;
|
|
9
|
-
const API_URL = "
|
|
9
|
+
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
|
|
10
10
|
|
|
11
11
|
describe('FirecrawlApp E2E Tests', () => {
|
|
12
12
|
test.concurrent('should throw error for no API key', async () => {
|
|
@@ -71,6 +71,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
71
71
|
expect(response.links?.length).toBeGreaterThan(0);
|
|
72
72
|
expect(response.links?.[0]).toContain("https://");
|
|
73
73
|
expect(response.metadata).not.toBeNull();
|
|
74
|
+
expect(response.metadata).not.toBeUndefined();
|
|
74
75
|
expect(response.metadata).toHaveProperty("title");
|
|
75
76
|
expect(response.metadata).toHaveProperty("description");
|
|
76
77
|
expect(response.metadata).toHaveProperty("keywords");
|
|
@@ -85,19 +86,21 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
85
86
|
expect(response.metadata).not.toHaveProperty("pageStatusCode");
|
|
86
87
|
expect(response.metadata).toHaveProperty("statusCode");
|
|
87
88
|
expect(response.metadata).not.toHaveProperty("pageError");
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
89
|
+
if (response.metadata !== undefined) {
|
|
90
|
+
expect(response.metadata.error).toBeUndefined();
|
|
91
|
+
expect(response.metadata.title).toBe("Roast My Website");
|
|
92
|
+
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
|
93
|
+
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
|
|
94
|
+
expect(response.metadata.robots).toBe("follow, index");
|
|
95
|
+
expect(response.metadata.ogTitle).toBe("Roast My Website");
|
|
96
|
+
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
|
97
|
+
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
|
|
98
|
+
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
|
|
99
|
+
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
|
|
100
|
+
expect(response.metadata.ogSiteName).toBe("Roast My Website");
|
|
101
|
+
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
|
|
102
|
+
expect(response.metadata.statusCode).toBe(200);
|
|
103
|
+
}
|
|
101
104
|
}, 30000); // 30 seconds timeout
|
|
102
105
|
|
|
103
106
|
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
|
@@ -127,7 +130,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
127
130
|
|
|
128
131
|
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
|
129
132
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
130
|
-
const response = await app.crawlUrl('https://roastmywebsite.ai', {},
|
|
133
|
+
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse;
|
|
131
134
|
expect(response).not.toBeNull();
|
|
132
135
|
expect(response).toHaveProperty("total");
|
|
133
136
|
expect(response.total).toBeGreaterThan(0);
|
|
@@ -138,21 +141,25 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
138
141
|
expect(response).toHaveProperty("status");
|
|
139
142
|
expect(response.status).toBe("completed");
|
|
140
143
|
expect(response).not.toHaveProperty("next"); // wait until done
|
|
141
|
-
expect(response.data
|
|
142
|
-
expect(response.data
|
|
143
|
-
expect(response.data
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
144
|
+
expect(response.data.length).toBeGreaterThan(0);
|
|
145
|
+
expect(response.data[0]).not.toBeNull();
|
|
146
|
+
expect(response.data[0]).not.toBeUndefined();
|
|
147
|
+
if (response.data[0]) {
|
|
148
|
+
expect(response.data[0]).toHaveProperty("markdown");
|
|
149
|
+
expect(response.data[0].markdown).toContain("_Roast_");
|
|
150
|
+
expect(response.data[0]).not.toHaveProperty('content'); // v0
|
|
151
|
+
expect(response.data[0]).not.toHaveProperty("html");
|
|
152
|
+
expect(response.data[0]).not.toHaveProperty("rawHtml");
|
|
153
|
+
expect(response.data[0]).not.toHaveProperty("screenshot");
|
|
154
|
+
expect(response.data[0]).not.toHaveProperty("links");
|
|
155
|
+
expect(response.data[0]).toHaveProperty("metadata");
|
|
156
|
+
expect(response.data[0].metadata).toHaveProperty("title");
|
|
157
|
+
expect(response.data[0].metadata).toHaveProperty("description");
|
|
158
|
+
expect(response.data[0].metadata).toHaveProperty("language");
|
|
159
|
+
expect(response.data[0].metadata).toHaveProperty("sourceURL");
|
|
160
|
+
expect(response.data[0].metadata).toHaveProperty("statusCode");
|
|
161
|
+
expect(response.data[0].metadata).not.toHaveProperty("error");
|
|
162
|
+
}
|
|
156
163
|
}, 60000); // 60 seconds timeout
|
|
157
164
|
|
|
158
165
|
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
|
|
@@ -173,7 +180,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
173
180
|
onlyMainContent: true,
|
|
174
181
|
waitFor: 1000
|
|
175
182
|
}
|
|
176
|
-
} as CrawlParams,
|
|
183
|
+
} as CrawlParams, 30) as CrawlStatusResponse;
|
|
177
184
|
expect(response).not.toBeNull();
|
|
178
185
|
expect(response).toHaveProperty("total");
|
|
179
186
|
expect(response.total).toBeGreaterThan(0);
|
|
@@ -184,41 +191,45 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
184
191
|
expect(response).toHaveProperty("status");
|
|
185
192
|
expect(response.status).toBe("completed");
|
|
186
193
|
expect(response).not.toHaveProperty("next");
|
|
187
|
-
expect(response.data
|
|
188
|
-
expect(response.data
|
|
189
|
-
expect(response.data
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
194
|
+
expect(response.data.length).toBeGreaterThan(0);
|
|
195
|
+
expect(response.data[0]).not.toBeNull();
|
|
196
|
+
expect(response.data[0]).not.toBeUndefined();
|
|
197
|
+
if (response.data[0]) {
|
|
198
|
+
expect(response.data[0]).toHaveProperty("markdown");
|
|
199
|
+
expect(response.data[0].markdown).toContain("_Roast_");
|
|
200
|
+
expect(response.data[0]).not.toHaveProperty('content'); // v0
|
|
201
|
+
expect(response.data[0]).toHaveProperty("html");
|
|
202
|
+
expect(response.data[0].html).toContain("<h1");
|
|
203
|
+
expect(response.data[0]).toHaveProperty("rawHtml");
|
|
204
|
+
expect(response.data[0].rawHtml).toContain("<h1");
|
|
205
|
+
expect(response.data[0]).toHaveProperty("screenshot");
|
|
206
|
+
expect(response.data[0].screenshot).toContain("https://");
|
|
207
|
+
expect(response.data[0]).toHaveProperty("links");
|
|
208
|
+
expect(response.data[0].links).not.toBeNull();
|
|
209
|
+
expect(response.data[0].links?.length).toBeGreaterThan(0);
|
|
210
|
+
expect(response.data[0]).toHaveProperty("metadata");
|
|
211
|
+
expect(response.data[0].metadata).toHaveProperty("title");
|
|
212
|
+
expect(response.data[0].metadata).toHaveProperty("description");
|
|
213
|
+
expect(response.data[0].metadata).toHaveProperty("language");
|
|
214
|
+
expect(response.data[0].metadata).toHaveProperty("sourceURL");
|
|
215
|
+
expect(response.data[0].metadata).toHaveProperty("statusCode");
|
|
216
|
+
expect(response.data[0].metadata).not.toHaveProperty("error");
|
|
217
|
+
}
|
|
207
218
|
}, 60000); // 60 seconds timeout
|
|
208
219
|
|
|
209
220
|
test.concurrent('should handle idempotency key for crawl', async () => {
|
|
210
221
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
211
222
|
const uniqueIdempotencyKey = uuidv4();
|
|
212
|
-
const response = await app.
|
|
223
|
+
const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse;
|
|
213
224
|
expect(response).not.toBeNull();
|
|
214
225
|
expect(response.id).toBeDefined();
|
|
215
226
|
|
|
216
|
-
await expect(app.crawlUrl('https://roastmywebsite.ai', {},
|
|
227
|
+
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
|
217
228
|
});
|
|
218
229
|
|
|
219
230
|
test.concurrent('should check crawl status', async () => {
|
|
220
231
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
221
|
-
const response = await app.
|
|
232
|
+
const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse;
|
|
222
233
|
expect(response).not.toBeNull();
|
|
223
234
|
expect(response.id).toBeDefined();
|
|
224
235
|
|
|
@@ -226,7 +237,8 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
226
237
|
const maxChecks = 15;
|
|
227
238
|
let checks = 0;
|
|
228
239
|
|
|
229
|
-
|
|
240
|
+
expect(statusResponse.success).toBe(true);
|
|
241
|
+
while ((statusResponse as any).status === 'scraping' && checks < maxChecks) {
|
|
230
242
|
await new Promise(resolve => setTimeout(resolve, 5000));
|
|
231
243
|
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
|
|
232
244
|
expect(statusResponse).not.toHaveProperty("current"); // v0
|
|
@@ -236,44 +248,55 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
236
248
|
expect(statusResponse).toHaveProperty("expiresAt");
|
|
237
249
|
expect(statusResponse).toHaveProperty("status");
|
|
238
250
|
expect(statusResponse).toHaveProperty("next");
|
|
239
|
-
expect(statusResponse.
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
251
|
+
expect(statusResponse.success).toBe(true);
|
|
252
|
+
if (statusResponse.success === true) {
|
|
253
|
+
expect(statusResponse.total).toBeGreaterThan(0);
|
|
254
|
+
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
|
255
|
+
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
|
256
|
+
expect(statusResponse.status).toBe("scraping");
|
|
257
|
+
expect(statusResponse.next).toContain("/v1/crawl/");
|
|
258
|
+
}
|
|
244
259
|
statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
|
|
260
|
+
expect(statusResponse.success).toBe(true);
|
|
245
261
|
checks++;
|
|
246
262
|
}
|
|
247
263
|
|
|
248
264
|
expect(statusResponse).not.toBeNull();
|
|
249
265
|
expect(statusResponse).toHaveProperty("total");
|
|
250
|
-
expect(statusResponse.
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
266
|
+
expect(statusResponse.success).toBe(true);
|
|
267
|
+
if (statusResponse.success === true) {
|
|
268
|
+
expect(statusResponse.total).toBeGreaterThan(0);
|
|
269
|
+
expect(statusResponse).toHaveProperty("creditsUsed");
|
|
270
|
+
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
|
271
|
+
expect(statusResponse).toHaveProperty("expiresAt");
|
|
272
|
+
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
|
273
|
+
expect(statusResponse).toHaveProperty("status");
|
|
274
|
+
expect(statusResponse.status).toBe("completed");
|
|
275
|
+
expect(statusResponse.data.length).toBeGreaterThan(0);
|
|
276
|
+
expect(statusResponse.data[0]).not.toBeNull();
|
|
277
|
+
expect(statusResponse.data[0]).not.toBeUndefined();
|
|
278
|
+
if (statusResponse.data[0]) {
|
|
279
|
+
expect(statusResponse.data[0]).toHaveProperty("markdown");
|
|
280
|
+
expect(statusResponse.data[0].markdown?.length).toBeGreaterThan(10);
|
|
281
|
+
expect(statusResponse.data[0]).not.toHaveProperty('content'); // v0
|
|
282
|
+
expect(statusResponse.data[0]).toHaveProperty("html");
|
|
283
|
+
expect(statusResponse.data[0].html).toContain("<div");
|
|
284
|
+
expect(statusResponse.data[0]).toHaveProperty("rawHtml");
|
|
285
|
+
expect(statusResponse.data[0].rawHtml).toContain("<div");
|
|
286
|
+
expect(statusResponse.data[0]).toHaveProperty("screenshot");
|
|
287
|
+
expect(statusResponse.data[0].screenshot).toContain("https://");
|
|
288
|
+
expect(statusResponse.data[0]).toHaveProperty("links");
|
|
289
|
+
expect(statusResponse.data[0].links).not.toBeNull();
|
|
290
|
+
expect(statusResponse.data[0].links?.length).toBeGreaterThan(0);
|
|
291
|
+
expect(statusResponse.data[0]).toHaveProperty("metadata");
|
|
292
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("title");
|
|
293
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("description");
|
|
294
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("language");
|
|
295
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("sourceURL");
|
|
296
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("statusCode");
|
|
297
|
+
expect(statusResponse.data[0].metadata).not.toHaveProperty("error");
|
|
298
|
+
}
|
|
299
|
+
}
|
|
277
300
|
}, 60000); // 60 seconds timeout
|
|
278
301
|
|
|
279
302
|
test.concurrent('should throw error for invalid API key on map', async () => {
|
package/src/index.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
|
|
2
|
-
import
|
|
1
|
+
import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios";
|
|
2
|
+
import type * as zt from "zod";
|
|
3
3
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
4
4
|
import { WebSocket } from "isows";
|
|
5
5
|
import { TypedEventTarget } from "typescript-event-target";
|
|
@@ -58,13 +58,13 @@ export interface FirecrawlDocumentMetadata {
|
|
|
58
58
|
* Document interface for Firecrawl.
|
|
59
59
|
* Represents a document retrieved or processed by Firecrawl.
|
|
60
60
|
*/
|
|
61
|
-
export interface FirecrawlDocument {
|
|
61
|
+
export interface FirecrawlDocument<T> {
|
|
62
62
|
url?: string;
|
|
63
63
|
markdown?: string;
|
|
64
64
|
html?: string;
|
|
65
65
|
rawHtml?: string;
|
|
66
66
|
links?: string[];
|
|
67
|
-
extract?:
|
|
67
|
+
extract?: T;
|
|
68
68
|
screenshot?: string;
|
|
69
69
|
metadata?: FirecrawlDocumentMetadata;
|
|
70
70
|
}
|
|
@@ -73,26 +73,29 @@ export interface FirecrawlDocument {
|
|
|
73
73
|
* Parameters for scraping operations.
|
|
74
74
|
* Defines the options and configurations available for scraping web content.
|
|
75
75
|
*/
|
|
76
|
-
export interface
|
|
76
|
+
export interface CrawlScrapeOptions {
|
|
77
77
|
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
|
|
78
78
|
headers?: Record<string, string>;
|
|
79
79
|
includeTags?: string[];
|
|
80
80
|
excludeTags?: string[];
|
|
81
81
|
onlyMainContent?: boolean;
|
|
82
|
+
waitFor?: number;
|
|
83
|
+
timeout?: number;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
export interface ScrapeParams<LLMSchema extends zt.ZodSchema> extends CrawlScrapeOptions {
|
|
82
87
|
extract?: {
|
|
83
88
|
prompt?: string;
|
|
84
|
-
schema?:
|
|
89
|
+
schema?: LLMSchema;
|
|
85
90
|
systemPrompt?: string;
|
|
86
91
|
};
|
|
87
|
-
waitFor?: number;
|
|
88
|
-
timeout?: number;
|
|
89
92
|
}
|
|
90
93
|
|
|
91
94
|
/**
|
|
92
95
|
* Response interface for scraping operations.
|
|
93
96
|
* Defines the structure of the response received after a scraping operation.
|
|
94
97
|
*/
|
|
95
|
-
export interface ScrapeResponse extends FirecrawlDocument {
|
|
98
|
+
export interface ScrapeResponse<LLMResult> extends FirecrawlDocument<LLMResult> {
|
|
96
99
|
success: true;
|
|
97
100
|
warning?: string;
|
|
98
101
|
error?: string;
|
|
@@ -110,7 +113,8 @@ export interface CrawlParams {
|
|
|
110
113
|
allowBackwardLinks?: boolean;
|
|
111
114
|
allowExternalLinks?: boolean;
|
|
112
115
|
ignoreSitemap?: boolean;
|
|
113
|
-
scrapeOptions?:
|
|
116
|
+
scrapeOptions?: CrawlScrapeOptions;
|
|
117
|
+
webhook?: string;
|
|
114
118
|
}
|
|
115
119
|
|
|
116
120
|
/**
|
|
@@ -130,15 +134,14 @@ export interface CrawlResponse {
|
|
|
130
134
|
*/
|
|
131
135
|
export interface CrawlStatusResponse {
|
|
132
136
|
success: true;
|
|
133
|
-
|
|
137
|
+
status: "scraping" | "completed" | "failed" | "cancelled";
|
|
134
138
|
completed: number;
|
|
139
|
+
total: number;
|
|
135
140
|
creditsUsed: number;
|
|
136
141
|
expiresAt: Date;
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
error?: string;
|
|
141
|
-
}
|
|
142
|
+
next?: string;
|
|
143
|
+
data: FirecrawlDocument<undefined>[];
|
|
144
|
+
};
|
|
142
145
|
|
|
143
146
|
/**
|
|
144
147
|
* Parameters for mapping operations.
|
|
@@ -183,7 +186,11 @@ export default class FirecrawlApp {
|
|
|
183
186
|
* @param config - Configuration options for the FirecrawlApp instance.
|
|
184
187
|
*/
|
|
185
188
|
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
|
186
|
-
|
|
189
|
+
if (typeof apiKey !== "string") {
|
|
190
|
+
throw new Error("No API key provided");
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
this.apiKey = apiKey;
|
|
187
194
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
188
195
|
}
|
|
189
196
|
|
|
@@ -193,10 +200,10 @@ export default class FirecrawlApp {
|
|
|
193
200
|
* @param params - Additional parameters for the scrape request.
|
|
194
201
|
* @returns The response from the scrape operation.
|
|
195
202
|
*/
|
|
196
|
-
async scrapeUrl(
|
|
203
|
+
async scrapeUrl<T extends zt.ZodSchema>(
|
|
197
204
|
url: string,
|
|
198
|
-
params?: ScrapeParams
|
|
199
|
-
): Promise<ScrapeResponse | ErrorResponse> {
|
|
205
|
+
params?: ScrapeParams<T>
|
|
206
|
+
): Promise<ScrapeResponse<zt.infer<T>> | ErrorResponse> {
|
|
200
207
|
const headers: AxiosRequestHeaders = {
|
|
201
208
|
"Content-Type": "application/json",
|
|
202
209
|
Authorization: `Bearer ${this.apiKey}`,
|
|
@@ -328,9 +335,10 @@ export default class FirecrawlApp {
|
|
|
328
335
|
/**
|
|
329
336
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
330
337
|
* @param id - The ID of the crawl operation.
|
|
338
|
+
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
331
339
|
* @returns The response containing the job status.
|
|
332
340
|
*/
|
|
333
|
-
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
341
|
+
async checkCrawlStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
334
342
|
if (!id) {
|
|
335
343
|
throw new Error("No crawl ID provided");
|
|
336
344
|
}
|
|
@@ -342,16 +350,28 @@ export default class FirecrawlApp {
|
|
|
342
350
|
headers
|
|
343
351
|
);
|
|
344
352
|
if (response.status === 200) {
|
|
353
|
+
let allData = response.data.data;
|
|
354
|
+
if (getAllData && response.data.status === "completed") {
|
|
355
|
+
let statusData = response.data
|
|
356
|
+
if ("data" in statusData) {
|
|
357
|
+
let data = statusData.data;
|
|
358
|
+
while ('next' in statusData) {
|
|
359
|
+
statusData = (await this.getRequest(statusData.next, headers)).data;
|
|
360
|
+
data = data.concat(statusData.data);
|
|
361
|
+
}
|
|
362
|
+
allData = data;
|
|
363
|
+
}
|
|
364
|
+
}
|
|
345
365
|
return ({
|
|
346
|
-
success:
|
|
366
|
+
success: response.data.success,
|
|
347
367
|
status: response.data.status,
|
|
348
368
|
total: response.data.total,
|
|
349
369
|
completed: response.data.completed,
|
|
350
370
|
creditsUsed: response.data.creditsUsed,
|
|
351
371
|
expiresAt: new Date(response.data.expiresAt),
|
|
352
372
|
next: response.data.next,
|
|
353
|
-
data:
|
|
354
|
-
error: response.data.error
|
|
373
|
+
data: allData,
|
|
374
|
+
error: response.data.error,
|
|
355
375
|
})
|
|
356
376
|
} else {
|
|
357
377
|
this.handleError(response, "check crawl status");
|
|
@@ -451,22 +471,29 @@ export default class FirecrawlApp {
|
|
|
451
471
|
id: string,
|
|
452
472
|
headers: AxiosRequestHeaders,
|
|
453
473
|
checkInterval: number
|
|
454
|
-
): Promise<CrawlStatusResponse> {
|
|
474
|
+
): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
455
475
|
while (true) {
|
|
456
|
-
|
|
476
|
+
let statusResponse: AxiosResponse = await this.getRequest(
|
|
457
477
|
`${this.apiUrl}/v1/crawl/${id}`,
|
|
458
478
|
headers
|
|
459
479
|
);
|
|
460
480
|
if (statusResponse.status === 200) {
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
481
|
+
let statusData = statusResponse.data;
|
|
482
|
+
if (statusData.status === "completed") {
|
|
483
|
+
if ("data" in statusData) {
|
|
484
|
+
let data = statusData.data;
|
|
485
|
+
while ('next' in statusData) {
|
|
486
|
+
statusResponse = await this.getRequest(statusData.next, headers);
|
|
487
|
+
statusData = statusResponse.data;
|
|
488
|
+
data = data.concat(statusData.data);
|
|
489
|
+
}
|
|
490
|
+
statusData.data = data;
|
|
491
|
+
return statusData;
|
|
492
|
+
} else {
|
|
493
|
+
throw new Error("Crawl job completed but no data was returned");
|
|
494
|
+
}
|
|
495
|
+
} else if (
|
|
496
|
+
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
|
|
470
497
|
) {
|
|
471
498
|
checkInterval = Math.max(checkInterval, 2);
|
|
472
499
|
await new Promise((resolve) =>
|
|
@@ -504,21 +531,21 @@ export default class FirecrawlApp {
|
|
|
504
531
|
}
|
|
505
532
|
|
|
506
533
|
interface CrawlWatcherEvents {
|
|
507
|
-
document: CustomEvent<FirecrawlDocument
|
|
534
|
+
document: CustomEvent<FirecrawlDocument<undefined>>,
|
|
508
535
|
done: CustomEvent<{
|
|
509
536
|
status: CrawlStatusResponse["status"];
|
|
510
|
-
data: FirecrawlDocument[];
|
|
537
|
+
data: FirecrawlDocument<undefined>[];
|
|
511
538
|
}>,
|
|
512
539
|
error: CustomEvent<{
|
|
513
540
|
status: CrawlStatusResponse["status"],
|
|
514
|
-
data: FirecrawlDocument[],
|
|
541
|
+
data: FirecrawlDocument<undefined>[],
|
|
515
542
|
error: string,
|
|
516
543
|
}>,
|
|
517
544
|
}
|
|
518
545
|
|
|
519
546
|
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
520
547
|
private ws: WebSocket;
|
|
521
|
-
public data: FirecrawlDocument[];
|
|
548
|
+
public data: FirecrawlDocument<undefined>[];
|
|
522
549
|
public status: CrawlStatusResponse["status"];
|
|
523
550
|
|
|
524
551
|
constructor(id: string, app: FirecrawlApp) {
|
|
@@ -539,7 +566,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
|
539
566
|
|
|
540
567
|
type DocumentMessage = {
|
|
541
568
|
type: "document",
|
|
542
|
-
data: FirecrawlDocument
|
|
569
|
+
data: FirecrawlDocument<undefined>,
|
|
543
570
|
}
|
|
544
571
|
|
|
545
572
|
type DoneMessage = { type: "done" }
|