firecrawl 0.0.30 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +88 -123
- package/build/cjs/index.js +347 -0
- package/build/cjs/package.json +1 -0
- package/build/esm/index.js +339 -0
- package/build/esm/package.json +1 -0
- package/jest.config.js +16 -0
- package/package.json +17 -5
- package/src/__tests__/e2e_withAuth/index.test.ts +298 -124
- package/src/__tests__/index.test.ts +1 -1
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +312 -0
- package/src/index.ts +337 -144
- package/tsconfig.json +3 -3
- package/types/index.d.ts +137 -74
- package/build/index.js +0 -257
- package/build_and_publish.sh +0 -34
- package/jest.config.cjs +0 -5
package/src/index.ts
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
1
|
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
|
|
2
2
|
import { z } from "zod";
|
|
3
3
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
4
|
+
import { WebSocket } from "isows";
|
|
5
|
+
import { TypedEventTarget } from "typescript-event-target";
|
|
6
|
+
|
|
4
7
|
/**
|
|
5
8
|
* Configuration interface for FirecrawlApp.
|
|
9
|
+
* @param apiKey - Optional API key for authentication.
|
|
10
|
+
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
|
|
6
11
|
*/
|
|
7
12
|
export interface FirecrawlAppConfig {
|
|
8
13
|
apiKey?: string | null;
|
|
@@ -11,6 +16,7 @@ export interface FirecrawlAppConfig {
|
|
|
11
16
|
|
|
12
17
|
/**
|
|
13
18
|
* Metadata for a Firecrawl document.
|
|
19
|
+
* Includes various optional properties for document metadata.
|
|
14
20
|
*/
|
|
15
21
|
export interface FirecrawlDocumentMetadata {
|
|
16
22
|
title?: string;
|
|
@@ -43,115 +49,155 @@ export interface FirecrawlDocumentMetadata {
|
|
|
43
49
|
articleTag?: string;
|
|
44
50
|
articleSection?: string;
|
|
45
51
|
sourceURL?: string;
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
[key: string]: any;
|
|
52
|
+
statusCode?: number;
|
|
53
|
+
error?: string;
|
|
54
|
+
[key: string]: any; // Allows for additional metadata properties not explicitly defined.
|
|
49
55
|
}
|
|
50
56
|
|
|
51
57
|
/**
|
|
52
58
|
* Document interface for Firecrawl.
|
|
59
|
+
* Represents a document retrieved or processed by Firecrawl.
|
|
53
60
|
*/
|
|
54
61
|
export interface FirecrawlDocument {
|
|
55
|
-
id?: string;
|
|
56
62
|
url?: string;
|
|
57
|
-
content: string;
|
|
58
63
|
markdown?: string;
|
|
59
64
|
html?: string;
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
childrenLinks?: string[];
|
|
66
|
-
provider?: string;
|
|
67
|
-
warning?: string;
|
|
65
|
+
rawHtml?: string;
|
|
66
|
+
links?: string[];
|
|
67
|
+
screenshot?: string;
|
|
68
|
+
metadata?: FirecrawlDocumentMetadata;
|
|
69
|
+
}
|
|
68
70
|
|
|
69
|
-
|
|
71
|
+
/**
|
|
72
|
+
* Parameters for scraping operations.
|
|
73
|
+
* Defines the options and configurations available for scraping web content.
|
|
74
|
+
*/
|
|
75
|
+
export interface ScrapeParams {
|
|
76
|
+
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
|
|
77
|
+
headers?: Record<string, string>;
|
|
78
|
+
includeTags?: string[];
|
|
79
|
+
excludeTags?: string[];
|
|
80
|
+
onlyMainContent?: boolean;
|
|
81
|
+
waitFor?: number;
|
|
82
|
+
timeout?: number;
|
|
70
83
|
}
|
|
71
84
|
|
|
72
85
|
/**
|
|
73
86
|
* Response interface for scraping operations.
|
|
87
|
+
* Defines the structure of the response received after a scraping operation.
|
|
74
88
|
*/
|
|
75
|
-
export interface ScrapeResponse {
|
|
76
|
-
success:
|
|
77
|
-
|
|
89
|
+
export interface ScrapeResponse extends FirecrawlDocument {
|
|
90
|
+
success: true;
|
|
91
|
+
warning?: string;
|
|
78
92
|
error?: string;
|
|
79
93
|
}
|
|
94
|
+
|
|
80
95
|
/**
|
|
81
|
-
*
|
|
96
|
+
* Parameters for crawling operations.
|
|
97
|
+
* Includes options for both scraping and mapping during a crawl.
|
|
82
98
|
*/
|
|
83
|
-
export interface
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
99
|
+
export interface CrawlParams {
|
|
100
|
+
includePaths?: string[];
|
|
101
|
+
excludePaths?: string[];
|
|
102
|
+
maxDepth?: number;
|
|
103
|
+
limit?: number;
|
|
104
|
+
allowBackwardLinks?: boolean;
|
|
105
|
+
allowExternalLinks?: boolean;
|
|
106
|
+
ignoreSitemap?: boolean;
|
|
107
|
+
scrapeOptions?: ScrapeParams;
|
|
87
108
|
}
|
|
109
|
+
|
|
88
110
|
/**
|
|
89
111
|
* Response interface for crawling operations.
|
|
112
|
+
* Defines the structure of the response received after initiating a crawl.
|
|
90
113
|
*/
|
|
91
114
|
export interface CrawlResponse {
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
115
|
+
id?: string;
|
|
116
|
+
url?: string;
|
|
117
|
+
success: true;
|
|
95
118
|
error?: string;
|
|
96
119
|
}
|
|
120
|
+
|
|
97
121
|
/**
|
|
98
122
|
* Response interface for job status checks.
|
|
123
|
+
* Provides detailed status of a crawl job including progress and results.
|
|
99
124
|
*/
|
|
100
|
-
export interface
|
|
101
|
-
success:
|
|
102
|
-
|
|
103
|
-
|
|
125
|
+
export interface CrawlStatusResponse {
|
|
126
|
+
success: true;
|
|
127
|
+
total: number;
|
|
128
|
+
completed: number;
|
|
129
|
+
creditsUsed: number;
|
|
130
|
+
expiresAt: Date;
|
|
131
|
+
status: "scraping" | "completed" | "failed";
|
|
132
|
+
next: string;
|
|
104
133
|
data?: FirecrawlDocument[];
|
|
105
|
-
partial_data?: FirecrawlDocument[];
|
|
106
134
|
error?: string;
|
|
107
135
|
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Parameters for mapping operations.
|
|
139
|
+
* Defines options for mapping URLs during a crawl.
|
|
140
|
+
*/
|
|
141
|
+
export interface MapParams {
|
|
142
|
+
search?: string;
|
|
143
|
+
ignoreSitemap?: boolean;
|
|
144
|
+
includeSubdomains?: boolean;
|
|
145
|
+
limit?: number;
|
|
146
|
+
}
|
|
147
|
+
|
|
108
148
|
/**
|
|
109
|
-
*
|
|
149
|
+
* Response interface for mapping operations.
|
|
150
|
+
* Defines the structure of the response received after a mapping operation.
|
|
110
151
|
*/
|
|
111
|
-
export interface
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
mode?: "llm-extraction";
|
|
116
|
-
extractionPrompt?: string;
|
|
117
|
-
};
|
|
152
|
+
export interface MapResponse {
|
|
153
|
+
success: true;
|
|
154
|
+
links?: string[];
|
|
155
|
+
error?: string;
|
|
118
156
|
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Error response interface.
|
|
160
|
+
* Defines the structure of the response received when an error occurs.
|
|
161
|
+
*/
|
|
162
|
+
export interface ErrorResponse {
|
|
163
|
+
success: false;
|
|
164
|
+
error: string;
|
|
165
|
+
}
|
|
166
|
+
|
|
119
167
|
/**
|
|
120
168
|
* Main class for interacting with the Firecrawl API.
|
|
169
|
+
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
121
170
|
*/
|
|
122
171
|
export default class FirecrawlApp {
|
|
123
|
-
|
|
124
|
-
|
|
172
|
+
public apiKey: string;
|
|
173
|
+
public apiUrl: string;
|
|
125
174
|
|
|
126
175
|
/**
|
|
127
176
|
* Initializes a new instance of the FirecrawlApp class.
|
|
128
|
-
* @param
|
|
177
|
+
* @param config - Configuration options for the FirecrawlApp instance.
|
|
129
178
|
*/
|
|
130
179
|
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
|
131
180
|
this.apiKey = apiKey || "";
|
|
132
181
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
133
|
-
if (!this.apiKey) {
|
|
134
|
-
throw new Error("No API key provided");
|
|
135
|
-
}
|
|
136
182
|
}
|
|
137
183
|
|
|
138
184
|
/**
|
|
139
185
|
* Scrapes a URL using the Firecrawl API.
|
|
140
|
-
* @param
|
|
141
|
-
* @param
|
|
142
|
-
* @returns
|
|
186
|
+
* @param url - The URL to scrape.
|
|
187
|
+
* @param params - Additional parameters for the scrape request.
|
|
188
|
+
* @returns The response from the scrape operation.
|
|
143
189
|
*/
|
|
144
190
|
async scrapeUrl(
|
|
145
191
|
url: string,
|
|
146
|
-
params
|
|
147
|
-
): Promise<ScrapeResponse> {
|
|
192
|
+
params?: ScrapeParams
|
|
193
|
+
): Promise<ScrapeResponse | ErrorResponse> {
|
|
148
194
|
const headers: AxiosRequestHeaders = {
|
|
149
195
|
"Content-Type": "application/json",
|
|
150
196
|
Authorization: `Bearer ${this.apiKey}`,
|
|
151
197
|
} as AxiosRequestHeaders;
|
|
152
|
-
let jsonData:
|
|
153
|
-
if (
|
|
154
|
-
let schema =
|
|
198
|
+
let jsonData: any = { url, ...params };
|
|
199
|
+
if (jsonData?.extractorOptions?.extractionSchema) {
|
|
200
|
+
let schema = jsonData.extractorOptions.extractionSchema;
|
|
155
201
|
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
|
156
202
|
if (schema instanceof z.ZodSchema) {
|
|
157
203
|
schema = zodToJsonSchema(schema);
|
|
@@ -159,22 +205,27 @@ export default class FirecrawlApp {
|
|
|
159
205
|
jsonData = {
|
|
160
206
|
...jsonData,
|
|
161
207
|
extractorOptions: {
|
|
162
|
-
...
|
|
208
|
+
...jsonData.extractorOptions,
|
|
163
209
|
extractionSchema: schema,
|
|
164
|
-
mode:
|
|
210
|
+
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
|
165
211
|
},
|
|
166
212
|
};
|
|
167
213
|
}
|
|
168
214
|
try {
|
|
169
215
|
const response: AxiosResponse = await axios.post(
|
|
170
|
-
this.apiUrl +
|
|
216
|
+
this.apiUrl + `/v1/scrape`,
|
|
171
217
|
jsonData,
|
|
172
218
|
{ headers }
|
|
173
219
|
);
|
|
174
220
|
if (response.status === 200) {
|
|
175
221
|
const responseData = response.data;
|
|
176
222
|
if (responseData.success) {
|
|
177
|
-
return
|
|
223
|
+
return {
|
|
224
|
+
success: true,
|
|
225
|
+
warning: responseData.warning,
|
|
226
|
+
error: responseData.error,
|
|
227
|
+
...responseData.data
|
|
228
|
+
};
|
|
178
229
|
} else {
|
|
179
230
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
|
180
231
|
}
|
|
@@ -188,126 +239,161 @@ export default class FirecrawlApp {
|
|
|
188
239
|
}
|
|
189
240
|
|
|
190
241
|
/**
|
|
191
|
-
*
|
|
192
|
-
* @param
|
|
193
|
-
* @param
|
|
194
|
-
* @returns
|
|
242
|
+
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
|
|
243
|
+
* @param query - The search query string.
|
|
244
|
+
* @param params - Additional parameters for the search.
|
|
245
|
+
* @returns Throws an error advising to use version 0 of the API.
|
|
195
246
|
*/
|
|
196
247
|
async search(
|
|
197
248
|
query: string,
|
|
198
|
-
params
|
|
199
|
-
): Promise<
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
249
|
+
params?: any
|
|
250
|
+
): Promise<any> {
|
|
251
|
+
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
/**
|
|
255
|
+
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
256
|
+
* @param url - The URL to crawl.
|
|
257
|
+
* @param params - Additional parameters for the crawl request.
|
|
258
|
+
* @param pollInterval - Time in seconds for job status checks.
|
|
259
|
+
* @param idempotencyKey - Optional idempotency key for the request.
|
|
260
|
+
* @returns The response from the crawl operation.
|
|
261
|
+
*/
|
|
262
|
+
async crawlUrl(
|
|
263
|
+
url: string,
|
|
264
|
+
params?: CrawlParams,
|
|
265
|
+
pollInterval: number = 2,
|
|
266
|
+
idempotencyKey?: string
|
|
267
|
+
): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
268
|
+
const headers = this.prepareHeaders(idempotencyKey);
|
|
269
|
+
let jsonData: any = { url, ...params };
|
|
208
270
|
try {
|
|
209
|
-
const response: AxiosResponse = await
|
|
210
|
-
this.apiUrl +
|
|
271
|
+
const response: AxiosResponse = await this.postRequest(
|
|
272
|
+
this.apiUrl + `/v1/crawl`,
|
|
211
273
|
jsonData,
|
|
212
|
-
|
|
274
|
+
headers
|
|
213
275
|
);
|
|
214
276
|
if (response.status === 200) {
|
|
215
|
-
const
|
|
216
|
-
|
|
217
|
-
return responseData;
|
|
218
|
-
} else {
|
|
219
|
-
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
|
220
|
-
}
|
|
277
|
+
const id: string = response.data.id;
|
|
278
|
+
return this.monitorJobStatus(id, headers, pollInterval);
|
|
221
279
|
} else {
|
|
222
|
-
this.handleError(response, "
|
|
280
|
+
this.handleError(response, "start crawl job");
|
|
223
281
|
}
|
|
224
282
|
} catch (error: any) {
|
|
225
|
-
|
|
283
|
+
if (error.response?.data?.error) {
|
|
284
|
+
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
|
285
|
+
} else {
|
|
286
|
+
throw new Error(error.message);
|
|
287
|
+
}
|
|
226
288
|
}
|
|
227
289
|
return { success: false, error: "Internal server error." };
|
|
228
290
|
}
|
|
229
291
|
|
|
230
|
-
|
|
231
|
-
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
232
|
-
* @param {string} url - The URL to crawl.
|
|
233
|
-
* @param {Params | null} params - Additional parameters for the crawl request.
|
|
234
|
-
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
|
235
|
-
* @param {number} pollInterval - Time in seconds for job status checks.
|
|
236
|
-
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
|
237
|
-
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
|
238
|
-
*/
|
|
239
|
-
async crawlUrl(
|
|
292
|
+
async asyncCrawlUrl(
|
|
240
293
|
url: string,
|
|
241
|
-
params
|
|
242
|
-
waitUntilDone: boolean = true,
|
|
243
|
-
pollInterval: number = 2,
|
|
294
|
+
params?: CrawlParams,
|
|
244
295
|
idempotencyKey?: string
|
|
245
|
-
): Promise<CrawlResponse |
|
|
296
|
+
): Promise<CrawlResponse | ErrorResponse> {
|
|
246
297
|
const headers = this.prepareHeaders(idempotencyKey);
|
|
247
|
-
let jsonData:
|
|
248
|
-
if (params) {
|
|
249
|
-
jsonData = { ...jsonData, ...params };
|
|
250
|
-
}
|
|
298
|
+
let jsonData: any = { url, ...params };
|
|
251
299
|
try {
|
|
252
300
|
const response: AxiosResponse = await this.postRequest(
|
|
253
|
-
this.apiUrl +
|
|
301
|
+
this.apiUrl + `/v1/crawl`,
|
|
254
302
|
jsonData,
|
|
255
303
|
headers
|
|
256
304
|
);
|
|
257
305
|
if (response.status === 200) {
|
|
258
|
-
|
|
259
|
-
if (waitUntilDone) {
|
|
260
|
-
return this.monitorJobStatus(jobId, headers, pollInterval);
|
|
261
|
-
} else {
|
|
262
|
-
return { success: true, jobId };
|
|
263
|
-
}
|
|
306
|
+
return response.data;
|
|
264
307
|
} else {
|
|
265
308
|
this.handleError(response, "start crawl job");
|
|
266
309
|
}
|
|
267
310
|
} catch (error: any) {
|
|
268
|
-
|
|
269
|
-
|
|
311
|
+
if (error.response?.data?.error) {
|
|
312
|
+
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
|
313
|
+
} else {
|
|
314
|
+
throw new Error(error.message);
|
|
315
|
+
}
|
|
270
316
|
}
|
|
271
317
|
return { success: false, error: "Internal server error." };
|
|
272
318
|
}
|
|
273
319
|
|
|
274
320
|
/**
|
|
275
321
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
276
|
-
* @param
|
|
277
|
-
* @returns
|
|
322
|
+
* @param id - The ID of the crawl operation.
|
|
323
|
+
* @returns The response containing the job status.
|
|
278
324
|
*/
|
|
279
|
-
async checkCrawlStatus(
|
|
325
|
+
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
326
|
+
if (!id) {
|
|
327
|
+
throw new Error("No crawl ID provided");
|
|
328
|
+
}
|
|
329
|
+
|
|
280
330
|
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
|
281
331
|
try {
|
|
282
332
|
const response: AxiosResponse = await this.getRequest(
|
|
283
|
-
this.apiUrl
|
|
333
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
284
334
|
headers
|
|
285
335
|
);
|
|
286
336
|
if (response.status === 200) {
|
|
287
|
-
return {
|
|
337
|
+
return ({
|
|
288
338
|
success: true,
|
|
289
339
|
status: response.data.status,
|
|
340
|
+
total: response.data.total,
|
|
341
|
+
completed: response.data.completed,
|
|
342
|
+
creditsUsed: response.data.creditsUsed,
|
|
343
|
+
expiresAt: new Date(response.data.expiresAt),
|
|
344
|
+
next: response.data.next,
|
|
290
345
|
data: response.data.data,
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
: undefined,
|
|
294
|
-
};
|
|
346
|
+
error: response.data.error
|
|
347
|
+
})
|
|
295
348
|
} else {
|
|
296
349
|
this.handleError(response, "check crawl status");
|
|
297
350
|
}
|
|
298
351
|
} catch (error: any) {
|
|
299
352
|
throw new Error(error.message);
|
|
300
353
|
}
|
|
301
|
-
return {
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
354
|
+
return { success: false, error: "Internal server error." };
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
async crawlUrlAndWatch(
|
|
358
|
+
url: string,
|
|
359
|
+
params?: CrawlParams,
|
|
360
|
+
idempotencyKey?: string,
|
|
361
|
+
) {
|
|
362
|
+
const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
|
|
363
|
+
|
|
364
|
+
if (crawl.success && crawl.id) {
|
|
365
|
+
const id = crawl.id;
|
|
366
|
+
return new CrawlWatcher(id, this);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
throw new Error("Crawl job failed to start");
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
|
|
373
|
+
const headers = this.prepareHeaders();
|
|
374
|
+
let jsonData: { url: string } & MapParams = { url, ...params };
|
|
375
|
+
|
|
376
|
+
try {
|
|
377
|
+
const response: AxiosResponse = await this.postRequest(
|
|
378
|
+
this.apiUrl + `/v1/map`,
|
|
379
|
+
jsonData,
|
|
380
|
+
headers
|
|
381
|
+
);
|
|
382
|
+
if (response.status === 200) {
|
|
383
|
+
return response.data as MapResponse;
|
|
384
|
+
} else {
|
|
385
|
+
this.handleError(response, "map");
|
|
386
|
+
}
|
|
387
|
+
} catch (error: any) {
|
|
388
|
+
throw new Error(error.message);
|
|
389
|
+
}
|
|
390
|
+
return { success: false, error: "Internal server error." };
|
|
306
391
|
}
|
|
307
392
|
|
|
308
393
|
/**
|
|
309
394
|
* Prepares the headers for an API request.
|
|
310
|
-
* @
|
|
395
|
+
* @param idempotencyKey - Optional key to ensure idempotency.
|
|
396
|
+
* @returns The prepared headers.
|
|
311
397
|
*/
|
|
312
398
|
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
|
313
399
|
return {
|
|
@@ -319,14 +405,14 @@ export default class FirecrawlApp {
|
|
|
319
405
|
|
|
320
406
|
/**
|
|
321
407
|
* Sends a POST request to the specified URL.
|
|
322
|
-
* @param
|
|
323
|
-
* @param
|
|
324
|
-
* @param
|
|
325
|
-
* @returns
|
|
408
|
+
* @param url - The URL to send the request to.
|
|
409
|
+
* @param data - The data to send in the request.
|
|
410
|
+
* @param headers - The headers for the request.
|
|
411
|
+
* @returns The response from the POST request.
|
|
326
412
|
*/
|
|
327
413
|
postRequest(
|
|
328
414
|
url: string,
|
|
329
|
-
data:
|
|
415
|
+
data: any,
|
|
330
416
|
headers: AxiosRequestHeaders
|
|
331
417
|
): Promise<AxiosResponse> {
|
|
332
418
|
return axios.post(url, data, { headers });
|
|
@@ -334,9 +420,9 @@ export default class FirecrawlApp {
|
|
|
334
420
|
|
|
335
421
|
/**
|
|
336
422
|
* Sends a GET request to the specified URL.
|
|
337
|
-
* @param
|
|
338
|
-
* @param
|
|
339
|
-
* @returns
|
|
423
|
+
* @param url - The URL to send the request to.
|
|
424
|
+
* @param headers - The headers for the request.
|
|
425
|
+
* @returns The response from the GET request.
|
|
340
426
|
*/
|
|
341
427
|
getRequest(
|
|
342
428
|
url: string,
|
|
@@ -347,38 +433,37 @@ export default class FirecrawlApp {
|
|
|
347
433
|
|
|
348
434
|
/**
|
|
349
435
|
* Monitors the status of a crawl job until completion or failure.
|
|
350
|
-
* @param
|
|
351
|
-
* @param
|
|
352
|
-
* @param
|
|
353
|
-
* @
|
|
436
|
+
* @param id - The ID of the crawl operation.
|
|
437
|
+
* @param headers - The headers for the request.
|
|
438
|
+
* @param checkInterval - Interval in seconds for job status checks.
|
|
439
|
+
* @param checkUrl - Optional URL to check the status (used for v1 API)
|
|
440
|
+
* @returns The final job status or data.
|
|
354
441
|
*/
|
|
355
442
|
async monitorJobStatus(
|
|
356
|
-
|
|
443
|
+
id: string,
|
|
357
444
|
headers: AxiosRequestHeaders,
|
|
358
445
|
checkInterval: number
|
|
359
|
-
): Promise<
|
|
446
|
+
): Promise<CrawlStatusResponse> {
|
|
360
447
|
while (true) {
|
|
361
448
|
const statusResponse: AxiosResponse = await this.getRequest(
|
|
362
|
-
this.apiUrl
|
|
449
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
363
450
|
headers
|
|
364
451
|
);
|
|
365
452
|
if (statusResponse.status === 200) {
|
|
366
453
|
const statusData = statusResponse.data;
|
|
367
454
|
if (statusData.status === "completed") {
|
|
368
455
|
if ("data" in statusData) {
|
|
369
|
-
return statusData
|
|
456
|
+
return statusData;
|
|
370
457
|
} else {
|
|
371
458
|
throw new Error("Crawl job completed but no data was returned");
|
|
372
459
|
}
|
|
373
460
|
} else if (
|
|
374
|
-
["active", "paused", "pending", "queued"].includes(statusData.status)
|
|
461
|
+
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
|
|
375
462
|
) {
|
|
376
|
-
|
|
377
|
-
checkInterval = 2;
|
|
378
|
-
}
|
|
463
|
+
checkInterval = Math.max(checkInterval, 2);
|
|
379
464
|
await new Promise((resolve) =>
|
|
380
465
|
setTimeout(resolve, checkInterval * 1000)
|
|
381
|
-
);
|
|
466
|
+
);
|
|
382
467
|
} else {
|
|
383
468
|
throw new Error(
|
|
384
469
|
`Crawl job failed or was stopped. Status: ${statusData.status}`
|
|
@@ -409,3 +494,111 @@ export default class FirecrawlApp {
|
|
|
409
494
|
}
|
|
410
495
|
}
|
|
411
496
|
}
|
|
497
|
+
|
|
498
|
+
interface CrawlWatcherEvents {
|
|
499
|
+
document: CustomEvent<FirecrawlDocument>,
|
|
500
|
+
done: CustomEvent<{
|
|
501
|
+
status: CrawlStatusResponse["status"];
|
|
502
|
+
data: FirecrawlDocument[];
|
|
503
|
+
}>,
|
|
504
|
+
error: CustomEvent<{
|
|
505
|
+
status: CrawlStatusResponse["status"],
|
|
506
|
+
data: FirecrawlDocument[],
|
|
507
|
+
error: string,
|
|
508
|
+
}>,
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
512
|
+
private ws: WebSocket;
|
|
513
|
+
public data: FirecrawlDocument[];
|
|
514
|
+
public status: CrawlStatusResponse["status"];
|
|
515
|
+
|
|
516
|
+
constructor(id: string, app: FirecrawlApp) {
|
|
517
|
+
super();
|
|
518
|
+
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
|
519
|
+
this.status = "scraping";
|
|
520
|
+
this.data = [];
|
|
521
|
+
|
|
522
|
+
type ErrorMessage = {
|
|
523
|
+
type: "error",
|
|
524
|
+
error: string,
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
type CatchupMessage = {
|
|
528
|
+
type: "catchup",
|
|
529
|
+
data: CrawlStatusResponse,
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
type DocumentMessage = {
|
|
533
|
+
type: "document",
|
|
534
|
+
data: FirecrawlDocument,
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
type DoneMessage = { type: "done" }
|
|
538
|
+
|
|
539
|
+
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
|
540
|
+
|
|
541
|
+
const messageHandler = (msg: Message) => {
|
|
542
|
+
if (msg.type === "done") {
|
|
543
|
+
this.status = "completed";
|
|
544
|
+
this.dispatchTypedEvent("done", new CustomEvent("done", {
|
|
545
|
+
detail: {
|
|
546
|
+
status: this.status,
|
|
547
|
+
data: this.data,
|
|
548
|
+
},
|
|
549
|
+
}));
|
|
550
|
+
} else if (msg.type === "error") {
|
|
551
|
+
this.status = "failed";
|
|
552
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
553
|
+
detail: {
|
|
554
|
+
status: this.status,
|
|
555
|
+
data: this.data,
|
|
556
|
+
error: msg.error,
|
|
557
|
+
},
|
|
558
|
+
}));
|
|
559
|
+
} else if (msg.type === "catchup") {
|
|
560
|
+
this.status = msg.data.status;
|
|
561
|
+
this.data.push(...(msg.data.data ?? []));
|
|
562
|
+
for (const doc of this.data) {
|
|
563
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
564
|
+
detail: doc,
|
|
565
|
+
}));
|
|
566
|
+
}
|
|
567
|
+
} else if (msg.type === "document") {
|
|
568
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
569
|
+
detail: msg.data,
|
|
570
|
+
}));
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
this.ws.onmessage = ((ev: MessageEvent) => {
|
|
575
|
+
if (typeof ev.data !== "string") {
|
|
576
|
+
this.ws.close();
|
|
577
|
+
return;
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
const msg = JSON.parse(ev.data) as Message;
|
|
581
|
+
messageHandler(msg);
|
|
582
|
+
}).bind(this);
|
|
583
|
+
|
|
584
|
+
this.ws.onclose = ((ev: CloseEvent) => {
|
|
585
|
+
const msg = JSON.parse(ev.reason) as Message;
|
|
586
|
+
messageHandler(msg);
|
|
587
|
+
}).bind(this);
|
|
588
|
+
|
|
589
|
+
this.ws.onerror = ((_: Event) => {
|
|
590
|
+
this.status = "failed"
|
|
591
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
592
|
+
detail: {
|
|
593
|
+
status: this.status,
|
|
594
|
+
data: this.data,
|
|
595
|
+
error: "WebSocket error",
|
|
596
|
+
},
|
|
597
|
+
}));
|
|
598
|
+
}).bind(this);
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
close() {
|
|
602
|
+
this.ws.close();
|
|
603
|
+
}
|
|
604
|
+
}
|