@mendable/firecrawl-js 1.2.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +410 -0
- package/dist/index.d.cts +264 -0
- package/{types → dist}/index.d.ts +38 -34
- package/dist/index.js +375 -0
- package/package.json +12 -14
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +111 -88
- package/src/index.ts +62 -43
- package/tsconfig.json +19 -105
- package/tsup.config.ts +9 -0
- package/build/cjs/index.js +0 -354
- package/build/cjs/package.json +0 -1
- package/build/esm/index.js +0 -346
- package/build/esm/package.json +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse,
|
|
1
|
+
import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index';
|
|
2
2
|
import { v4 as uuidv4 } from 'uuid';
|
|
3
3
|
import dotenv from 'dotenv';
|
|
4
4
|
import { describe, test, expect } from '@jest/globals';
|
|
@@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals';
|
|
|
6
6
|
dotenv.config();
|
|
7
7
|
|
|
8
8
|
const TEST_API_KEY = process.env.TEST_API_KEY;
|
|
9
|
-
const API_URL = "
|
|
9
|
+
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
|
|
10
10
|
|
|
11
11
|
describe('FirecrawlApp E2E Tests', () => {
|
|
12
12
|
test.concurrent('should throw error for no API key', async () => {
|
|
@@ -71,6 +71,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
71
71
|
expect(response.links?.length).toBeGreaterThan(0);
|
|
72
72
|
expect(response.links?.[0]).toContain("https://");
|
|
73
73
|
expect(response.metadata).not.toBeNull();
|
|
74
|
+
expect(response.metadata).not.toBeUndefined();
|
|
74
75
|
expect(response.metadata).toHaveProperty("title");
|
|
75
76
|
expect(response.metadata).toHaveProperty("description");
|
|
76
77
|
expect(response.metadata).toHaveProperty("keywords");
|
|
@@ -85,19 +86,21 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
85
86
|
expect(response.metadata).not.toHaveProperty("pageStatusCode");
|
|
86
87
|
expect(response.metadata).toHaveProperty("statusCode");
|
|
87
88
|
expect(response.metadata).not.toHaveProperty("pageError");
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
89
|
+
if (response.metadata !== undefined) {
|
|
90
|
+
expect(response.metadata.error).toBeUndefined();
|
|
91
|
+
expect(response.metadata.title).toBe("Roast My Website");
|
|
92
|
+
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
|
93
|
+
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
|
|
94
|
+
expect(response.metadata.robots).toBe("follow, index");
|
|
95
|
+
expect(response.metadata.ogTitle).toBe("Roast My Website");
|
|
96
|
+
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
|
97
|
+
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
|
|
98
|
+
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
|
|
99
|
+
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
|
|
100
|
+
expect(response.metadata.ogSiteName).toBe("Roast My Website");
|
|
101
|
+
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
|
|
102
|
+
expect(response.metadata.statusCode).toBe(200);
|
|
103
|
+
}
|
|
101
104
|
}, 30000); // 30 seconds timeout
|
|
102
105
|
|
|
103
106
|
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
|
@@ -127,7 +130,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
127
130
|
|
|
128
131
|
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
|
129
132
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
130
|
-
const response = await app.crawlUrl('https://roastmywebsite.ai', {},
|
|
133
|
+
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse;
|
|
131
134
|
expect(response).not.toBeNull();
|
|
132
135
|
expect(response).toHaveProperty("total");
|
|
133
136
|
expect(response.total).toBeGreaterThan(0);
|
|
@@ -138,21 +141,25 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
138
141
|
expect(response).toHaveProperty("status");
|
|
139
142
|
expect(response.status).toBe("completed");
|
|
140
143
|
expect(response).not.toHaveProperty("next"); // wait until done
|
|
141
|
-
expect(response.data
|
|
142
|
-
expect(response.data
|
|
143
|
-
expect(response.data
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
144
|
+
expect(response.data.length).toBeGreaterThan(0);
|
|
145
|
+
expect(response.data[0]).not.toBeNull();
|
|
146
|
+
expect(response.data[0]).not.toBeUndefined();
|
|
147
|
+
if (response.data[0]) {
|
|
148
|
+
expect(response.data[0]).toHaveProperty("markdown");
|
|
149
|
+
expect(response.data[0].markdown).toContain("_Roast_");
|
|
150
|
+
expect(response.data[0]).not.toHaveProperty('content'); // v0
|
|
151
|
+
expect(response.data[0]).not.toHaveProperty("html");
|
|
152
|
+
expect(response.data[0]).not.toHaveProperty("rawHtml");
|
|
153
|
+
expect(response.data[0]).not.toHaveProperty("screenshot");
|
|
154
|
+
expect(response.data[0]).not.toHaveProperty("links");
|
|
155
|
+
expect(response.data[0]).toHaveProperty("metadata");
|
|
156
|
+
expect(response.data[0].metadata).toHaveProperty("title");
|
|
157
|
+
expect(response.data[0].metadata).toHaveProperty("description");
|
|
158
|
+
expect(response.data[0].metadata).toHaveProperty("language");
|
|
159
|
+
expect(response.data[0].metadata).toHaveProperty("sourceURL");
|
|
160
|
+
expect(response.data[0].metadata).toHaveProperty("statusCode");
|
|
161
|
+
expect(response.data[0].metadata).not.toHaveProperty("error");
|
|
162
|
+
}
|
|
156
163
|
}, 60000); // 60 seconds timeout
|
|
157
164
|
|
|
158
165
|
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
|
|
@@ -173,7 +180,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
173
180
|
onlyMainContent: true,
|
|
174
181
|
waitFor: 1000
|
|
175
182
|
}
|
|
176
|
-
} as CrawlParams,
|
|
183
|
+
} as CrawlParams, 30) as CrawlStatusResponse;
|
|
177
184
|
expect(response).not.toBeNull();
|
|
178
185
|
expect(response).toHaveProperty("total");
|
|
179
186
|
expect(response.total).toBeGreaterThan(0);
|
|
@@ -184,41 +191,45 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
184
191
|
expect(response).toHaveProperty("status");
|
|
185
192
|
expect(response.status).toBe("completed");
|
|
186
193
|
expect(response).not.toHaveProperty("next");
|
|
187
|
-
expect(response.data
|
|
188
|
-
expect(response.data
|
|
189
|
-
expect(response.data
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
194
|
+
expect(response.data.length).toBeGreaterThan(0);
|
|
195
|
+
expect(response.data[0]).not.toBeNull();
|
|
196
|
+
expect(response.data[0]).not.toBeUndefined();
|
|
197
|
+
if (response.data[0]) {
|
|
198
|
+
expect(response.data[0]).toHaveProperty("markdown");
|
|
199
|
+
expect(response.data[0].markdown).toContain("_Roast_");
|
|
200
|
+
expect(response.data[0]).not.toHaveProperty('content'); // v0
|
|
201
|
+
expect(response.data[0]).toHaveProperty("html");
|
|
202
|
+
expect(response.data[0].html).toContain("<h1");
|
|
203
|
+
expect(response.data[0]).toHaveProperty("rawHtml");
|
|
204
|
+
expect(response.data[0].rawHtml).toContain("<h1");
|
|
205
|
+
expect(response.data[0]).toHaveProperty("screenshot");
|
|
206
|
+
expect(response.data[0].screenshot).toContain("https://");
|
|
207
|
+
expect(response.data[0]).toHaveProperty("links");
|
|
208
|
+
expect(response.data[0].links).not.toBeNull();
|
|
209
|
+
expect(response.data[0].links?.length).toBeGreaterThan(0);
|
|
210
|
+
expect(response.data[0]).toHaveProperty("metadata");
|
|
211
|
+
expect(response.data[0].metadata).toHaveProperty("title");
|
|
212
|
+
expect(response.data[0].metadata).toHaveProperty("description");
|
|
213
|
+
expect(response.data[0].metadata).toHaveProperty("language");
|
|
214
|
+
expect(response.data[0].metadata).toHaveProperty("sourceURL");
|
|
215
|
+
expect(response.data[0].metadata).toHaveProperty("statusCode");
|
|
216
|
+
expect(response.data[0].metadata).not.toHaveProperty("error");
|
|
217
|
+
}
|
|
207
218
|
}, 60000); // 60 seconds timeout
|
|
208
219
|
|
|
209
220
|
test.concurrent('should handle idempotency key for crawl', async () => {
|
|
210
221
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
211
222
|
const uniqueIdempotencyKey = uuidv4();
|
|
212
|
-
const response = await app.
|
|
223
|
+
const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse;
|
|
213
224
|
expect(response).not.toBeNull();
|
|
214
225
|
expect(response.id).toBeDefined();
|
|
215
226
|
|
|
216
|
-
await expect(app.crawlUrl('https://roastmywebsite.ai', {},
|
|
227
|
+
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
|
217
228
|
});
|
|
218
229
|
|
|
219
230
|
test.concurrent('should check crawl status', async () => {
|
|
220
231
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
221
|
-
const response = await app.
|
|
232
|
+
const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse;
|
|
222
233
|
expect(response).not.toBeNull();
|
|
223
234
|
expect(response.id).toBeDefined();
|
|
224
235
|
|
|
@@ -226,7 +237,8 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
226
237
|
const maxChecks = 15;
|
|
227
238
|
let checks = 0;
|
|
228
239
|
|
|
229
|
-
|
|
240
|
+
expect(statusResponse.success).toBe(true);
|
|
241
|
+
while ((statusResponse as any).status === 'scraping' && checks < maxChecks) {
|
|
230
242
|
await new Promise(resolve => setTimeout(resolve, 5000));
|
|
231
243
|
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
|
|
232
244
|
expect(statusResponse).not.toHaveProperty("current"); // v0
|
|
@@ -236,44 +248,55 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
236
248
|
expect(statusResponse).toHaveProperty("expiresAt");
|
|
237
249
|
expect(statusResponse).toHaveProperty("status");
|
|
238
250
|
expect(statusResponse).toHaveProperty("next");
|
|
239
|
-
expect(statusResponse.
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
251
|
+
expect(statusResponse.success).toBe(true);
|
|
252
|
+
if (statusResponse.success === true) {
|
|
253
|
+
expect(statusResponse.total).toBeGreaterThan(0);
|
|
254
|
+
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
|
255
|
+
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
|
256
|
+
expect(statusResponse.status).toBe("scraping");
|
|
257
|
+
expect(statusResponse.next).toContain("/v1/crawl/");
|
|
258
|
+
}
|
|
244
259
|
statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
|
|
260
|
+
expect(statusResponse.success).toBe(true);
|
|
245
261
|
checks++;
|
|
246
262
|
}
|
|
247
263
|
|
|
248
264
|
expect(statusResponse).not.toBeNull();
|
|
249
265
|
expect(statusResponse).toHaveProperty("total");
|
|
250
|
-
expect(statusResponse.
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
266
|
+
expect(statusResponse.success).toBe(true);
|
|
267
|
+
if (statusResponse.success === true) {
|
|
268
|
+
expect(statusResponse.total).toBeGreaterThan(0);
|
|
269
|
+
expect(statusResponse).toHaveProperty("creditsUsed");
|
|
270
|
+
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
|
271
|
+
expect(statusResponse).toHaveProperty("expiresAt");
|
|
272
|
+
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
|
273
|
+
expect(statusResponse).toHaveProperty("status");
|
|
274
|
+
expect(statusResponse.status).toBe("completed");
|
|
275
|
+
expect(statusResponse.data.length).toBeGreaterThan(0);
|
|
276
|
+
expect(statusResponse.data[0]).not.toBeNull();
|
|
277
|
+
expect(statusResponse.data[0]).not.toBeUndefined();
|
|
278
|
+
if (statusResponse.data[0]) {
|
|
279
|
+
expect(statusResponse.data[0]).toHaveProperty("markdown");
|
|
280
|
+
expect(statusResponse.data[0].markdown?.length).toBeGreaterThan(10);
|
|
281
|
+
expect(statusResponse.data[0]).not.toHaveProperty('content'); // v0
|
|
282
|
+
expect(statusResponse.data[0]).toHaveProperty("html");
|
|
283
|
+
expect(statusResponse.data[0].html).toContain("<div");
|
|
284
|
+
expect(statusResponse.data[0]).toHaveProperty("rawHtml");
|
|
285
|
+
expect(statusResponse.data[0].rawHtml).toContain("<div");
|
|
286
|
+
expect(statusResponse.data[0]).toHaveProperty("screenshot");
|
|
287
|
+
expect(statusResponse.data[0].screenshot).toContain("https://");
|
|
288
|
+
expect(statusResponse.data[0]).toHaveProperty("links");
|
|
289
|
+
expect(statusResponse.data[0].links).not.toBeNull();
|
|
290
|
+
expect(statusResponse.data[0].links?.length).toBeGreaterThan(0);
|
|
291
|
+
expect(statusResponse.data[0]).toHaveProperty("metadata");
|
|
292
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("title");
|
|
293
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("description");
|
|
294
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("language");
|
|
295
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("sourceURL");
|
|
296
|
+
expect(statusResponse.data[0].metadata).toHaveProperty("statusCode");
|
|
297
|
+
expect(statusResponse.data[0].metadata).not.toHaveProperty("error");
|
|
298
|
+
}
|
|
299
|
+
}
|
|
277
300
|
}, 60000); // 60 seconds timeout
|
|
278
301
|
|
|
279
302
|
test.concurrent('should throw error for invalid API key on map', async () => {
|
package/src/index.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
|
|
2
|
-
import {
|
|
1
|
+
import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios";
|
|
2
|
+
import type { infer as ZodInfer, ZodSchema } from "zod";
|
|
3
3
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
4
4
|
import { WebSocket } from "isows";
|
|
5
5
|
import { TypedEventTarget } from "typescript-event-target";
|
|
@@ -58,13 +58,13 @@ export interface FirecrawlDocumentMetadata {
|
|
|
58
58
|
* Document interface for Firecrawl.
|
|
59
59
|
* Represents a document retrieved or processed by Firecrawl.
|
|
60
60
|
*/
|
|
61
|
-
export interface FirecrawlDocument {
|
|
61
|
+
export interface FirecrawlDocument<T> {
|
|
62
62
|
url?: string;
|
|
63
63
|
markdown?: string;
|
|
64
64
|
html?: string;
|
|
65
65
|
rawHtml?: string;
|
|
66
66
|
links?: string[];
|
|
67
|
-
extract?:
|
|
67
|
+
extract?: T;
|
|
68
68
|
screenshot?: string;
|
|
69
69
|
metadata?: FirecrawlDocumentMetadata;
|
|
70
70
|
}
|
|
@@ -73,26 +73,29 @@ export interface FirecrawlDocument {
|
|
|
73
73
|
* Parameters for scraping operations.
|
|
74
74
|
* Defines the options and configurations available for scraping web content.
|
|
75
75
|
*/
|
|
76
|
-
export interface
|
|
76
|
+
export interface CrawlScrapeOptions {
|
|
77
77
|
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
|
|
78
78
|
headers?: Record<string, string>;
|
|
79
79
|
includeTags?: string[];
|
|
80
80
|
excludeTags?: string[];
|
|
81
81
|
onlyMainContent?: boolean;
|
|
82
|
+
waitFor?: number;
|
|
83
|
+
timeout?: number;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
export interface ScrapeParams<LLMSchema extends ZodSchema> extends CrawlScrapeOptions {
|
|
82
87
|
extract?: {
|
|
83
88
|
prompt?: string;
|
|
84
|
-
schema?:
|
|
89
|
+
schema?: LLMSchema;
|
|
85
90
|
systemPrompt?: string;
|
|
86
91
|
};
|
|
87
|
-
waitFor?: number;
|
|
88
|
-
timeout?: number;
|
|
89
92
|
}
|
|
90
93
|
|
|
91
94
|
/**
|
|
92
95
|
* Response interface for scraping operations.
|
|
93
96
|
* Defines the structure of the response received after a scraping operation.
|
|
94
97
|
*/
|
|
95
|
-
export interface ScrapeResponse extends FirecrawlDocument {
|
|
98
|
+
export interface ScrapeResponse<LLMResult> extends FirecrawlDocument<LLMResult> {
|
|
96
99
|
success: true;
|
|
97
100
|
warning?: string;
|
|
98
101
|
error?: string;
|
|
@@ -110,7 +113,7 @@ export interface CrawlParams {
|
|
|
110
113
|
allowBackwardLinks?: boolean;
|
|
111
114
|
allowExternalLinks?: boolean;
|
|
112
115
|
ignoreSitemap?: boolean;
|
|
113
|
-
scrapeOptions?:
|
|
116
|
+
scrapeOptions?: CrawlScrapeOptions;
|
|
114
117
|
webhook?: string;
|
|
115
118
|
}
|
|
116
119
|
|
|
@@ -131,15 +134,14 @@ export interface CrawlResponse {
|
|
|
131
134
|
*/
|
|
132
135
|
export interface CrawlStatusResponse {
|
|
133
136
|
success: true;
|
|
134
|
-
|
|
137
|
+
status: "scraping" | "completed" | "failed" | "cancelled";
|
|
135
138
|
completed: number;
|
|
139
|
+
total: number;
|
|
136
140
|
creditsUsed: number;
|
|
137
141
|
expiresAt: Date;
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
error?: string;
|
|
142
|
-
}
|
|
142
|
+
next?: string;
|
|
143
|
+
data: FirecrawlDocument<undefined>[];
|
|
144
|
+
};
|
|
143
145
|
|
|
144
146
|
/**
|
|
145
147
|
* Parameters for mapping operations.
|
|
@@ -184,7 +186,11 @@ export default class FirecrawlApp {
|
|
|
184
186
|
* @param config - Configuration options for the FirecrawlApp instance.
|
|
185
187
|
*/
|
|
186
188
|
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
|
187
|
-
|
|
189
|
+
if (typeof apiKey !== "string") {
|
|
190
|
+
throw new Error("No API key provided");
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
this.apiKey = apiKey;
|
|
188
194
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
189
195
|
}
|
|
190
196
|
|
|
@@ -194,10 +200,10 @@ export default class FirecrawlApp {
|
|
|
194
200
|
* @param params - Additional parameters for the scrape request.
|
|
195
201
|
* @returns The response from the scrape operation.
|
|
196
202
|
*/
|
|
197
|
-
async scrapeUrl(
|
|
203
|
+
async scrapeUrl<T extends ZodSchema>(
|
|
198
204
|
url: string,
|
|
199
|
-
params?: ScrapeParams
|
|
200
|
-
): Promise<ScrapeResponse | ErrorResponse> {
|
|
205
|
+
params?: ScrapeParams<T>
|
|
206
|
+
): Promise<ScrapeResponse<ZodInfer<T>> | ErrorResponse> {
|
|
201
207
|
const headers: AxiosRequestHeaders = {
|
|
202
208
|
"Content-Type": "application/json",
|
|
203
209
|
Authorization: `Bearer ${this.apiKey}`,
|
|
@@ -329,9 +335,10 @@ export default class FirecrawlApp {
|
|
|
329
335
|
/**
|
|
330
336
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
331
337
|
* @param id - The ID of the crawl operation.
|
|
338
|
+
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
332
339
|
* @returns The response containing the job status.
|
|
333
340
|
*/
|
|
334
|
-
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
341
|
+
async checkCrawlStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
335
342
|
if (!id) {
|
|
336
343
|
throw new Error("No crawl ID provided");
|
|
337
344
|
}
|
|
@@ -343,16 +350,28 @@ export default class FirecrawlApp {
|
|
|
343
350
|
headers
|
|
344
351
|
);
|
|
345
352
|
if (response.status === 200) {
|
|
353
|
+
let allData = response.data.data;
|
|
354
|
+
if (getAllData && response.data.status === "completed") {
|
|
355
|
+
let statusData = response.data
|
|
356
|
+
if ("data" in statusData) {
|
|
357
|
+
let data = statusData.data;
|
|
358
|
+
while ('next' in statusData) {
|
|
359
|
+
statusData = (await this.getRequest(statusData.next, headers)).data;
|
|
360
|
+
data = data.concat(statusData.data);
|
|
361
|
+
}
|
|
362
|
+
allData = data;
|
|
363
|
+
}
|
|
364
|
+
}
|
|
346
365
|
return ({
|
|
347
|
-
success:
|
|
366
|
+
success: response.data.success,
|
|
348
367
|
status: response.data.status,
|
|
349
368
|
total: response.data.total,
|
|
350
369
|
completed: response.data.completed,
|
|
351
370
|
creditsUsed: response.data.creditsUsed,
|
|
352
371
|
expiresAt: new Date(response.data.expiresAt),
|
|
353
372
|
next: response.data.next,
|
|
354
|
-
data:
|
|
355
|
-
error: response.data.error
|
|
373
|
+
data: allData,
|
|
374
|
+
error: response.data.error,
|
|
356
375
|
})
|
|
357
376
|
} else {
|
|
358
377
|
this.handleError(response, "check crawl status");
|
|
@@ -452,7 +471,7 @@ export default class FirecrawlApp {
|
|
|
452
471
|
id: string,
|
|
453
472
|
headers: AxiosRequestHeaders,
|
|
454
473
|
checkInterval: number
|
|
455
|
-
): Promise<CrawlStatusResponse> {
|
|
474
|
+
): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
456
475
|
while (true) {
|
|
457
476
|
let statusResponse: AxiosResponse = await this.getRequest(
|
|
458
477
|
`${this.apiUrl}/v1/crawl/${id}`,
|
|
@@ -460,20 +479,20 @@ export default class FirecrawlApp {
|
|
|
460
479
|
);
|
|
461
480
|
if (statusResponse.status === 200) {
|
|
462
481
|
let statusData = statusResponse.data;
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
482
|
+
if (statusData.status === "completed") {
|
|
483
|
+
if ("data" in statusData) {
|
|
484
|
+
let data = statusData.data;
|
|
485
|
+
while ('next' in statusData) {
|
|
486
|
+
statusResponse = await this.getRequest(statusData.next, headers);
|
|
487
|
+
statusData = statusResponse.data;
|
|
488
|
+
data = data.concat(statusData.data);
|
|
489
|
+
}
|
|
490
|
+
statusData.data = data;
|
|
491
|
+
return statusData;
|
|
492
|
+
} else {
|
|
493
|
+
throw new Error("Crawl job completed but no data was returned");
|
|
470
494
|
}
|
|
471
|
-
|
|
472
|
-
return statusData;
|
|
473
|
-
} else {
|
|
474
|
-
throw new Error("Crawl job completed but no data was returned");
|
|
475
|
-
}
|
|
476
|
-
} else if (
|
|
495
|
+
} else if (
|
|
477
496
|
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
|
|
478
497
|
) {
|
|
479
498
|
checkInterval = Math.max(checkInterval, 2);
|
|
@@ -512,21 +531,21 @@ export default class FirecrawlApp {
|
|
|
512
531
|
}
|
|
513
532
|
|
|
514
533
|
interface CrawlWatcherEvents {
|
|
515
|
-
document: CustomEvent<FirecrawlDocument
|
|
534
|
+
document: CustomEvent<FirecrawlDocument<undefined>>,
|
|
516
535
|
done: CustomEvent<{
|
|
517
536
|
status: CrawlStatusResponse["status"];
|
|
518
|
-
data: FirecrawlDocument[];
|
|
537
|
+
data: FirecrawlDocument<undefined>[];
|
|
519
538
|
}>,
|
|
520
539
|
error: CustomEvent<{
|
|
521
540
|
status: CrawlStatusResponse["status"],
|
|
522
|
-
data: FirecrawlDocument[],
|
|
541
|
+
data: FirecrawlDocument<undefined>[],
|
|
523
542
|
error: string,
|
|
524
543
|
}>,
|
|
525
544
|
}
|
|
526
545
|
|
|
527
546
|
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
528
547
|
private ws: WebSocket;
|
|
529
|
-
public data: FirecrawlDocument[];
|
|
548
|
+
public data: FirecrawlDocument<undefined>[];
|
|
530
549
|
public status: CrawlStatusResponse["status"];
|
|
531
550
|
|
|
532
551
|
constructor(id: string, app: FirecrawlApp) {
|
|
@@ -547,7 +566,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
|
547
566
|
|
|
548
567
|
type DocumentMessage = {
|
|
549
568
|
type: "document",
|
|
550
|
-
data: FirecrawlDocument
|
|
569
|
+
data: FirecrawlDocument<undefined>,
|
|
551
570
|
}
|
|
552
571
|
|
|
553
572
|
type DoneMessage = { type: "done" }
|