firecrawl 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
-import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
+import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index';
 import { v4 as uuidv4 } from 'uuid';
 import dotenv from 'dotenv';
 import { describe, test, expect } from '@jest/globals';
@@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals';
 dotenv.config();
 
 const TEST_API_KEY = process.env.TEST_API_KEY;
-const API_URL = "http://127.0.0.1:3002";
+const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
 
 describe('FirecrawlApp E2E Tests', () => {
   test.concurrent('should throw error for no API key', async () => {
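The suite now defaults to the hosted API and only targets a local instance when `API_URL` is set. A minimal sketch of the same client setup outside the tests; the package import name, environment variable, and placeholder key are assumptions:

```ts
import FirecrawlApp from "firecrawl";

// Fall back to the hosted endpoint when API_URL is not set,
// mirroring the test configuration above.
const app = new FirecrawlApp({
  apiKey: process.env.FIRECRAWL_API_KEY ?? "fc-YOUR-API-KEY", // placeholder key
  apiUrl: process.env.API_URL ?? "https://api.firecrawl.dev",
});
```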
@@ -71,6 +71,7 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.links?.length).toBeGreaterThan(0);
     expect(response.links?.[0]).toContain("https://");
     expect(response.metadata).not.toBeNull();
+    expect(response.metadata).not.toBeUndefined();
     expect(response.metadata).toHaveProperty("title");
     expect(response.metadata).toHaveProperty("description");
     expect(response.metadata).toHaveProperty("keywords");
@@ -85,19 +86,21 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.metadata).not.toHaveProperty("pageStatusCode");
     expect(response.metadata).toHaveProperty("statusCode");
     expect(response.metadata).not.toHaveProperty("pageError");
-    expect(response.metadata.error).toBeUndefined();
-    expect(response.metadata.title).toBe("Roast My Website");
-    expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
-    expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
-    expect(response.metadata.robots).toBe("follow, index");
-    expect(response.metadata.ogTitle).toBe("Roast My Website");
-    expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
-    expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
-    expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
-    expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
-    expect(response.metadata.ogSiteName).toBe("Roast My Website");
-    expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
-    expect(response.metadata.statusCode).toBe(200);
+    if (response.metadata !== undefined) {
+      expect(response.metadata.error).toBeUndefined();
+      expect(response.metadata.title).toBe("Roast My Website");
+      expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
+      expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
+      expect(response.metadata.robots).toBe("follow, index");
+      expect(response.metadata.ogTitle).toBe("Roast My Website");
+      expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
+      expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
+      expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
+      expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
+      expect(response.metadata.ogSiteName).toBe("Roast My Website");
+      expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
+      expect(response.metadata.statusCode).toBe(200);
+    }
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file', async () => {
@@ -127,7 +130,7 @@ describe('FirecrawlApp E2E Tests', () => {
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse;
     expect(response).not.toBeNull();
     expect(response).toHaveProperty("total");
     expect(response.total).toBeGreaterThan(0);
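`crawlUrl` no longer takes a `waitUntilDone` boolean: it always waits for completion, and the third argument is now the poll interval in seconds (an optional idempotency key follows it). A minimal sketch of the new call shape; the URL and interval are illustrative:

```ts
import FirecrawlApp, { type CrawlStatusResponse } from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function crawlAndWait(url: string) {
  // Empty crawl params; poll every 30 seconds until the crawl completes.
  const response = await app.crawlUrl(url, {}, 30) as CrawlStatusResponse;
  console.log(response.status, `${response.completed}/${response.total} pages`);
  return response.data;
}
```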
@@ -138,21 +141,25 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response).toHaveProperty("status");
     expect(response.status).toBe("completed");
     expect(response).not.toHaveProperty("next"); // wait until done
-    expect(response.data?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("markdown");
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
-    expect(response.data?.[0]).not.toHaveProperty("html");
-    expect(response.data?.[0]).not.toHaveProperty("rawHtml");
-    expect(response.data?.[0]).not.toHaveProperty("screenshot");
-    expect(response.data?.[0]).not.toHaveProperty("links");
-    expect(response.data?.[0]).toHaveProperty("metadata");
-    expect(response.data?.[0].metadata).toHaveProperty("title");
-    expect(response.data?.[0].metadata).toHaveProperty("description");
-    expect(response.data?.[0].metadata).toHaveProperty("language");
-    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
-    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).not.toHaveProperty("error");
+    expect(response.data.length).toBeGreaterThan(0);
+    expect(response.data[0]).not.toBeNull();
+    expect(response.data[0]).not.toBeUndefined();
+    if (response.data[0]) {
+      expect(response.data[0]).toHaveProperty("markdown");
+      expect(response.data[0].markdown).toContain("_Roast_");
+      expect(response.data[0]).not.toHaveProperty('content'); // v0
+      expect(response.data[0]).not.toHaveProperty("html");
+      expect(response.data[0]).not.toHaveProperty("rawHtml");
+      expect(response.data[0]).not.toHaveProperty("screenshot");
+      expect(response.data[0]).not.toHaveProperty("links");
+      expect(response.data[0]).toHaveProperty("metadata");
+      expect(response.data[0].metadata).toHaveProperty("title");
+      expect(response.data[0].metadata).toHaveProperty("description");
+      expect(response.data[0].metadata).toHaveProperty("language");
+      expect(response.data[0].metadata).toHaveProperty("sourceURL");
+      expect(response.data[0].metadata).toHaveProperty("statusCode");
+      expect(response.data[0].metadata).not.toHaveProperty("error");
+    }
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@@ -173,7 +180,7 @@ describe('FirecrawlApp E2E Tests', () => {
         onlyMainContent: true,
         waitFor: 1000
       }
-    } as CrawlParams, true, 30) as CrawlStatusResponse;
+    } as CrawlParams, 30) as CrawlStatusResponse;
     expect(response).not.toBeNull();
     expect(response).toHaveProperty("total");
     expect(response.total).toBeGreaterThan(0);
@@ -184,41 +191,45 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response).toHaveProperty("status");
     expect(response.status).toBe("completed");
     expect(response).not.toHaveProperty("next");
-    expect(response.data?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("markdown");
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
-    expect(response.data?.[0]).toHaveProperty("html");
-    expect(response.data?.[0].html).toContain("<h1");
-    expect(response.data?.[0]).toHaveProperty("rawHtml");
-    expect(response.data?.[0].rawHtml).toContain("<h1");
-    expect(response.data?.[0]).toHaveProperty("screenshot");
-    expect(response.data?.[0].screenshot).toContain("https://");
-    expect(response.data?.[0]).toHaveProperty("links");
-    expect(response.data?.[0].links).not.toBeNull();
-    expect(response.data?.[0].links?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("metadata");
-    expect(response.data?.[0].metadata).toHaveProperty("title");
-    expect(response.data?.[0].metadata).toHaveProperty("description");
-    expect(response.data?.[0].metadata).toHaveProperty("language");
-    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
-    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).not.toHaveProperty("error");
+    expect(response.data.length).toBeGreaterThan(0);
+    expect(response.data[0]).not.toBeNull();
+    expect(response.data[0]).not.toBeUndefined();
+    if (response.data[0]) {
+      expect(response.data[0]).toHaveProperty("markdown");
+      expect(response.data[0].markdown).toContain("_Roast_");
+      expect(response.data[0]).not.toHaveProperty('content'); // v0
+      expect(response.data[0]).toHaveProperty("html");
+      expect(response.data[0].html).toContain("<h1");
+      expect(response.data[0]).toHaveProperty("rawHtml");
+      expect(response.data[0].rawHtml).toContain("<h1");
+      expect(response.data[0]).toHaveProperty("screenshot");
+      expect(response.data[0].screenshot).toContain("https://");
+      expect(response.data[0]).toHaveProperty("links");
+      expect(response.data[0].links).not.toBeNull();
+      expect(response.data[0].links?.length).toBeGreaterThan(0);
+      expect(response.data[0]).toHaveProperty("metadata");
+      expect(response.data[0].metadata).toHaveProperty("title");
+      expect(response.data[0].metadata).toHaveProperty("description");
+      expect(response.data[0].metadata).toHaveProperty("language");
+      expect(response.data[0].metadata).toHaveProperty("sourceURL");
+      expect(response.data[0].metadata).toHaveProperty("statusCode");
+      expect(response.data[0].metadata).not.toHaveProperty("error");
+    }
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should handle idempotency key for crawl', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const uniqueIdempotencyKey = uuidv4();
-    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
+    const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse;
     expect(response).not.toBeNull();
     expect(response.id).toBeDefined();
 
-    await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
+    await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
   });
 
   test.concurrent('should check crawl status', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
+    const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse;
     expect(response).not.toBeNull();
     expect(response.id).toBeDefined();
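Starting a crawl without waiting is now a separate method, `asyncCrawlUrl`, which takes the idempotency key as its third argument; the blocking `crawlUrl` takes the key as its fourth. A minimal sketch, with a placeholder URL:

```ts
import { v4 as uuidv4 } from "uuid";
import FirecrawlApp, { type CrawlResponse } from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function startCrawl(url: string) {
  const idempotencyKey = uuidv4();
  // Returns immediately with the job id; reusing the same key is rejected with a 409.
  const job = await app.asyncCrawlUrl(url, {}, idempotencyKey) as CrawlResponse;
  return job.id;
}
```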
 
@@ -226,7 +237,8 @@ describe('FirecrawlApp E2E Tests', () => {
     const maxChecks = 15;
     let checks = 0;
 
-    while (statusResponse.status === 'scraping' && checks < maxChecks) {
+    expect(statusResponse.success).toBe(true);
+    while ((statusResponse as any).status === 'scraping' && checks < maxChecks) {
       await new Promise(resolve => setTimeout(resolve, 5000));
       expect(statusResponse).not.toHaveProperty("partial_data"); // v0
       expect(statusResponse).not.toHaveProperty("current"); // v0
@@ -236,44 +248,55 @@ describe('FirecrawlApp E2E Tests', () => {
       expect(statusResponse).toHaveProperty("expiresAt");
       expect(statusResponse).toHaveProperty("status");
       expect(statusResponse).toHaveProperty("next");
-      expect(statusResponse.total).toBeGreaterThan(0);
-      expect(statusResponse.creditsUsed).toBeGreaterThan(0);
-      expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
-      expect(statusResponse.status).toBe("scraping");
-      expect(statusResponse.next).toContain("/v1/crawl/");
+      expect(statusResponse.success).toBe(true);
+      if (statusResponse.success === true) {
+        expect(statusResponse.total).toBeGreaterThan(0);
+        expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+        expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+        expect(statusResponse.status).toBe("scraping");
+        expect(statusResponse.next).toContain("/v1/crawl/");
+      }
       statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
+      expect(statusResponse.success).toBe(true);
       checks++;
     }
 
     expect(statusResponse).not.toBeNull();
     expect(statusResponse).toHaveProperty("total");
-    expect(statusResponse.total).toBeGreaterThan(0);
-    expect(statusResponse).toHaveProperty("creditsUsed");
-    expect(statusResponse.creditsUsed).toBeGreaterThan(0);
-    expect(statusResponse).toHaveProperty("expiresAt");
-    expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
-    expect(statusResponse).toHaveProperty("status");
-    expect(statusResponse.status).toBe("completed");
-    expect(statusResponse.data?.length).toBeGreaterThan(0);
-    expect(statusResponse.data?.[0]).toHaveProperty("markdown");
-    expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
-    expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
-    expect(statusResponse.data?.[0]).toHaveProperty("html");
-    expect(statusResponse.data?.[0].html).toContain("<div");
-    expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
-    expect(statusResponse.data?.[0].rawHtml).toContain("<div");
-    expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
-    expect(statusResponse.data?.[0].screenshot).toContain("https://");
-    expect(statusResponse.data?.[0]).toHaveProperty("links");
-    expect(statusResponse.data?.[0].links).not.toBeNull();
-    expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
-    expect(statusResponse.data?.[0]).toHaveProperty("metadata");
-    expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
-    expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
-    expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
-    expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
-    expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
+    expect(statusResponse.success).toBe(true);
+    if (statusResponse.success === true) {
+      expect(statusResponse.total).toBeGreaterThan(0);
+      expect(statusResponse).toHaveProperty("creditsUsed");
+      expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+      expect(statusResponse).toHaveProperty("expiresAt");
+      expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+      expect(statusResponse).toHaveProperty("status");
+      expect(statusResponse.status).toBe("completed");
+      expect(statusResponse.data.length).toBeGreaterThan(0);
+      expect(statusResponse.data[0]).not.toBeNull();
+      expect(statusResponse.data[0]).not.toBeUndefined();
+      if (statusResponse.data[0]) {
+        expect(statusResponse.data[0]).toHaveProperty("markdown");
+        expect(statusResponse.data[0].markdown?.length).toBeGreaterThan(10);
+        expect(statusResponse.data[0]).not.toHaveProperty('content'); // v0
+        expect(statusResponse.data[0]).toHaveProperty("html");
+        expect(statusResponse.data[0].html).toContain("<div");
+        expect(statusResponse.data[0]).toHaveProperty("rawHtml");
+        expect(statusResponse.data[0].rawHtml).toContain("<div");
+        expect(statusResponse.data[0]).toHaveProperty("screenshot");
+        expect(statusResponse.data[0].screenshot).toContain("https://");
+        expect(statusResponse.data[0]).toHaveProperty("links");
+        expect(statusResponse.data[0].links).not.toBeNull();
+        expect(statusResponse.data[0].links?.length).toBeGreaterThan(0);
+        expect(statusResponse.data[0]).toHaveProperty("metadata");
+        expect(statusResponse.data[0].metadata).toHaveProperty("title");
+        expect(statusResponse.data[0].metadata).toHaveProperty("description");
+        expect(statusResponse.data[0].metadata).toHaveProperty("language");
+        expect(statusResponse.data[0].metadata).toHaveProperty("sourceURL");
+        expect(statusResponse.data[0].metadata).toHaveProperty("statusCode");
+        expect(statusResponse.data[0].metadata).not.toHaveProperty("error");
+      }
+    }
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should throw error for invalid API key on map', async () => {
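Because `checkCrawlStatus` can also return an error payload, the test narrows on `success` before reading counters and treats `status === "scraping"` as in-progress. A condensed polling sketch along the same lines; the interval and retry cap are illustrative:

```ts
import FirecrawlApp, { type CrawlStatusResponse } from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function waitForCrawl(id: string): Promise<CrawlStatusResponse> {
  for (let checks = 0; checks < 15; checks++) {
    const status = await app.checkCrawlStatus(id) as CrawlStatusResponse;
    if (status.success && status.status === "completed") {
      return status; // status.data holds the crawled documents
    }
    await new Promise((resolve) => setTimeout(resolve, 5000));
  }
  throw new Error("Crawl did not complete within the polling budget");
}
```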
package/src/index.ts CHANGED
@@ -1,5 +1,5 @@
-import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
-import { z } from "zod";
+import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios";
+import type { ZodSchema } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 import { WebSocket } from "isows";
 import { TypedEventTarget } from "typescript-event-target";
@@ -64,6 +64,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
 }
@@ -73,12 +74,17 @@ export interface FirecrawlDocument {
  * Defines the options and configurations available for scraping web content.
  */
 export interface ScrapeParams {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
   onlyMainContent?: boolean;
-  waitFor?: number;
+  extract?: {
+    prompt?: string;
+    schema?: ZodSchema | any;
+    systemPrompt?: string;
+  };
+  waitFor?: number;
   timeout?: number;
 }
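`ScrapeParams` gains an `extract` block (and a matching `"extract"` format): an optional prompt, system prompt, and a schema that may be a Zod schema, which the client converts with `zodToJsonSchema` before sending. A minimal sketch assuming the SDK's `scrapeUrl` method; the schema and URL are placeholders:

```ts
import FirecrawlApp from "firecrawl";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// Placeholder schema; a plain JSON Schema object should also be accepted.
const articleSchema = z.object({
  title: z.string(),
  summary: z.string(),
});

async function scrapeWithExtract(url: string) {
  return app.scrapeUrl(url, {
    formats: ["markdown", "extract"],
    extract: {
      schema: articleSchema,
      prompt: "Extract the article title and a one-sentence summary.",
    },
  });
}
```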
 
@@ -105,6 +111,7 @@ export interface CrawlParams {
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: ScrapeParams;
+  webhook?: string;
 }
 
 /**
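`CrawlParams` now accepts an optional `webhook` string; the diff only adds the field, so the delivery semantics are not shown here. A minimal sketch with a placeholder callback URL:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function crawlWithWebhook(url: string) {
  return app.asyncCrawlUrl(url, {
    webhook: "https://example.com/firecrawl-callback", // placeholder endpoint
    scrapeOptions: { formats: ["markdown"] },
  });
}
```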
@@ -124,15 +131,14 @@ export interface CrawlResponse {
  */
 export interface CrawlStatusResponse {
   success: true;
-  total: number;
+  status: "scraping" | "completed" | "failed" | "cancelled";
   completed: number;
+  total: number;
   creditsUsed: number;
   expiresAt: Date;
-  status: "scraping" | "completed" | "failed";
-  next: string;
-  data?: FirecrawlDocument[];
-  error?: string;
-}
+  next?: string;
+  data: FirecrawlDocument[];
+};
 
 /**
  * Parameters for mapping operations.
@@ -177,7 +183,11 @@ export default class FirecrawlApp {
   * @param config - Configuration options for the FirecrawlApp instance.
   */
  constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    this.apiKey = apiKey || "";
+    if (typeof apiKey !== "string") {
+      throw new Error("No API key provided");
+    }
+
+    this.apiKey = apiKey;
     this.apiUrl = apiUrl || "https://api.firecrawl.dev";
   }
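The constructor now fails fast: a missing or non-string `apiKey` throws "No API key provided" instead of silently defaulting to an empty key. A small sketch; the environment variable name is a placeholder:

```ts
import FirecrawlApp from "firecrawl";

try {
  // Throws immediately when the key is missing, instead of failing later at request time.
  const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? null });
  void app;
} catch (error) {
  console.error("No API key provided; set FIRECRAWL_API_KEY first.", error);
}
```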
 
@@ -196,18 +206,20 @@ export default class FirecrawlApp {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
-      // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-      if (schema instanceof z.ZodSchema) {
+    if (jsonData?.extract?.schema) {
+      let schema = jsonData.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
         schema = zodToJsonSchema(schema);
+      } catch (error) {
+
       }
       jsonData = {
         ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
+        extract: {
+          ...jsonData.extract,
+          schema: schema,
         },
       };
     }
@@ -320,9 +332,10 @@ export default class FirecrawlApp {
   /**
    * Checks the status of a crawl job using the Firecrawl API.
    * @param id - The ID of the crawl operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
    * @returns The response containing the job status.
    */
-  async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
+  async checkCrawlStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
     if (!id) {
       throw new Error("No crawl ID provided");
     }
@@ -334,16 +347,28 @@ export default class FirecrawlApp {
       headers
     );
     if (response.status === 200) {
+      let allData = response.data.data;
+      if (getAllData && response.data.status === "completed") {
+        let statusData = response.data
+        if ("data" in statusData) {
+          let data = statusData.data;
+          while ('next' in statusData) {
+            statusData = (await this.getRequest(statusData.next, headers)).data;
+            data = data.concat(statusData.data);
+          }
+          allData = data;
+        }
+      }
       return ({
-        success: true,
+        success: response.data.success,
         status: response.data.status,
         total: response.data.total,
         completed: response.data.completed,
         creditsUsed: response.data.creditsUsed,
         expiresAt: new Date(response.data.expiresAt),
         next: response.data.next,
-        data: response.data.data,
-        error: response.data.error
+        data: allData,
+        error: response.data.error,
       })
     } else {
       this.handleError(response, "check crawl status");
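With `getAllData` set to `true`, `checkCrawlStatus` follows the `next` links of a completed job and returns the concatenated document list instead of a single page. A minimal sketch; the job id is a placeholder:

```ts
import FirecrawlApp, { type CrawlStatusResponse } from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function fetchAllDocuments(jobId: string) {
  // Second argument `true` pages through every `next` link of a completed crawl.
  const status = await app.checkCrawlStatus(jobId, true) as CrawlStatusResponse;
  return status.data; // full list of FirecrawlDocument entries
}
```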
@@ -443,22 +468,29 @@ export default class FirecrawlApp {
     id: string,
     headers: AxiosRequestHeaders,
     checkInterval: number
-  ): Promise<CrawlStatusResponse> {
+  ): Promise<CrawlStatusResponse | ErrorResponse> {
     while (true) {
-      const statusResponse: AxiosResponse = await this.getRequest(
+      let statusResponse: AxiosResponse = await this.getRequest(
         `${this.apiUrl}/v1/crawl/${id}`,
         headers
       );
       if (statusResponse.status === 200) {
-        const statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            return statusData;
-          } else {
-            throw new Error("Crawl job completed but no data was returned");
-          }
-        } else if (
-          ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
+        let statusData = statusResponse.data;
+        if (statusData.status === "completed") {
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while ('next' in statusData) {
+              statusResponse = await this.getRequest(statusData.next, headers);
+              statusData = statusResponse.data;
+              data = data.concat(statusData.data);
+            }
+            statusData.data = data;
+            return statusData;
+          } else {
+            throw new Error("Crawl job completed but no data was returned");
+          }
+        } else if (
+          ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
         ) {
           checkInterval = Math.max(checkInterval, 2);
           await new Promise((resolve) =>
package/tsconfig.json CHANGED
@@ -1,110 +1,24 @@
 {
   "compilerOptions": {
-    /* Visit https://aka.ms/tsconfig to read more about this file */
-
-    /* Projects */
-    // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
-    // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
-    // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
-    // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
-    // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
-    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
-
-    /* Language and Environment */
-    "target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
-    // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
-    // "jsx": "preserve", /* Specify what JSX code is generated. */
-    // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
-    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
-    // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
-    // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
-    // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
-    // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
-    // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
-    // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
-    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
-
-    /* Modules */
-    "module": "commonjs", /* Specify what module code is generated. */
-    "rootDir": "./src", /* Specify the root folder within your source files. */
-    "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
-    // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
-    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
-    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
-    // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
-    // "types": [], /* Specify type package names to be included without being referenced in a source file. */
-    // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
-    // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
-    // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
-    // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
-    // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
-    // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
-    // "resolveJsonModule": true, /* Enable importing .json files. */
-    // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
-    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
-
-    /* JavaScript Support */
-    // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
-    // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
-    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
-
-    /* Emit */
-    "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
-    // "declarationMap": true, /* Create sourcemaps for d.ts files. */
-    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
-    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
-    // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
-    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
-    "outDir": "./build", /* Specify an output folder for all emitted files. */
-    // "removeComments": true, /* Disable emitting comments. */
-    // "noEmit": true, /* Disable emitting files from a compilation. */
-    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
-    // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
-    // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
-    // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
-    // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
-    // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
-    // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
-    // "newLine": "crlf", /* Set the newline character for emitting files. */
-    // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
-    // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
-    // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
-    // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
-    "declarationDir": "./types", /* Specify the output directory for generated declaration files. */
-    // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
-
-    /* Interop Constraints */
-    // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
-    // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
-    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
-    "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
-    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
-    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
-
-    /* Type Checking */
-    "strict": true, /* Enable all strict type-checking options. */
-    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
-    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
-    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
-    // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
-    // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
-    // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
-    // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
-    // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
-    // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
-    // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
-    // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
-    // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
-    // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
-    // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
-    // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
-    // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
-    // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
-    // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
-
-    /* Completeness */
-    // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
-    "skipLibCheck": true /* Skip type checking all .d.ts files. */
+    // See https://www.totaltypescript.com/tsconfig-cheat-sheet
+    /* Base Options: */
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "target": "es2022",
+    "allowJs": true,
+    "resolveJsonModule": true,
+    "moduleDetection": "force",
+    "isolatedModules": true,
+    "verbatimModuleSyntax": true,
+
+    /* Strictness */
+    "strict": true,
+    "noUncheckedIndexedAccess": true,
+    "noImplicitOverride": true,
+
+    /* If NOT transpiling with TypeScript: */
+    "module": "NodeNext",
+    "noEmit": true,
   },
   "include": ["src/**/*"],
   "exclude": ["node_modules", "dist", "**/__tests__/*"]
package/tsup.config.ts ADDED
@@ -0,0 +1,9 @@
+import { defineConfig } from "tsup";
+
+export default defineConfig({
+  entryPoints: ["src/index.ts"],
+  format: ["cjs", "esm"],
+  dts: true,
+  outDir: "dist",
+  clean: true,
+});