firecrawl 0.0.30 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +88 -123
- package/build/cjs/index.js +347 -0
- package/build/cjs/package.json +1 -0
- package/build/esm/index.js +339 -0
- package/build/esm/package.json +1 -0
- package/jest.config.js +16 -0
- package/package.json +17 -5
- package/src/__tests__/e2e_withAuth/index.test.ts +298 -124
- package/src/__tests__/index.test.ts +1 -1
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +312 -0
- package/src/index.ts +349 -148
- package/tsconfig.json +3 -3
- package/types/index.d.ts +143 -73
- package/build/index.js +0 -257
- package/build_and_publish.sh +0 -34
- package/jest.config.cjs +0 -5
package/src/index.ts
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
1
|
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
|
|
2
2
|
import { z } from "zod";
|
|
3
3
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
4
|
+
import { WebSocket } from "isows";
|
|
5
|
+
import { TypedEventTarget } from "typescript-event-target";
|
|
6
|
+
|
|
4
7
|
/**
|
|
5
8
|
* Configuration interface for FirecrawlApp.
|
|
9
|
+
* @param apiKey - Optional API key for authentication.
|
|
10
|
+
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
|
|
6
11
|
*/
|
|
7
12
|
export interface FirecrawlAppConfig {
|
|
8
13
|
apiKey?: string | null;
|
|
@@ -11,6 +16,7 @@ export interface FirecrawlAppConfig {
|
|
|
11
16
|
|
|
12
17
|
/**
|
|
13
18
|
* Metadata for a Firecrawl document.
|
|
19
|
+
* Includes various optional properties for document metadata.
|
|
14
20
|
*/
|
|
15
21
|
export interface FirecrawlDocumentMetadata {
|
|
16
22
|
title?: string;
|
|
@@ -43,138 +49,191 @@ export interface FirecrawlDocumentMetadata {
|
|
|
43
49
|
articleTag?: string;
|
|
44
50
|
articleSection?: string;
|
|
45
51
|
sourceURL?: string;
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
[key: string]: any;
|
|
52
|
+
statusCode?: number;
|
|
53
|
+
error?: string;
|
|
54
|
+
[key: string]: any; // Allows for additional metadata properties not explicitly defined.
|
|
49
55
|
}
|
|
50
56
|
|
|
51
57
|
/**
|
|
52
58
|
* Document interface for Firecrawl.
|
|
59
|
+
* Represents a document retrieved or processed by Firecrawl.
|
|
53
60
|
*/
|
|
54
61
|
export interface FirecrawlDocument {
|
|
55
|
-
id?: string;
|
|
56
62
|
url?: string;
|
|
57
|
-
content: string;
|
|
58
63
|
markdown?: string;
|
|
59
64
|
html?: string;
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
metadata
|
|
65
|
-
|
|
66
|
-
provider?: string;
|
|
67
|
-
warning?: string;
|
|
65
|
+
rawHtml?: string;
|
|
66
|
+
links?: string[];
|
|
67
|
+
extract?: Record<any, any>;
|
|
68
|
+
screenshot?: string;
|
|
69
|
+
metadata?: FirecrawlDocumentMetadata;
|
|
70
|
+
}
|
|
68
71
|
|
|
69
|
-
|
|
72
|
+
/**
|
|
73
|
+
* Parameters for scraping operations.
|
|
74
|
+
* Defines the options and configurations available for scraping web content.
|
|
75
|
+
*/
|
|
76
|
+
export interface ScrapeParams {
|
|
77
|
+
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
|
|
78
|
+
headers?: Record<string, string>;
|
|
79
|
+
includeTags?: string[];
|
|
80
|
+
excludeTags?: string[];
|
|
81
|
+
onlyMainContent?: boolean;
|
|
82
|
+
extract?: {
|
|
83
|
+
prompt?: string;
|
|
84
|
+
schema?: z.ZodSchema | any;
|
|
85
|
+
systemPrompt?: string;
|
|
86
|
+
};
|
|
87
|
+
waitFor?: number;
|
|
88
|
+
timeout?: number;
|
|
70
89
|
}
|
|
71
90
|
|
|
72
91
|
/**
|
|
73
92
|
* Response interface for scraping operations.
|
|
93
|
+
* Defines the structure of the response received after a scraping operation.
|
|
74
94
|
*/
|
|
75
|
-
export interface ScrapeResponse {
|
|
76
|
-
success:
|
|
77
|
-
|
|
95
|
+
export interface ScrapeResponse extends FirecrawlDocument {
|
|
96
|
+
success: true;
|
|
97
|
+
warning?: string;
|
|
78
98
|
error?: string;
|
|
79
99
|
}
|
|
100
|
+
|
|
80
101
|
/**
|
|
81
|
-
*
|
|
102
|
+
* Parameters for crawling operations.
|
|
103
|
+
* Includes options for both scraping and mapping during a crawl.
|
|
82
104
|
*/
|
|
83
|
-
export interface
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
105
|
+
export interface CrawlParams {
|
|
106
|
+
includePaths?: string[];
|
|
107
|
+
excludePaths?: string[];
|
|
108
|
+
maxDepth?: number;
|
|
109
|
+
limit?: number;
|
|
110
|
+
allowBackwardLinks?: boolean;
|
|
111
|
+
allowExternalLinks?: boolean;
|
|
112
|
+
ignoreSitemap?: boolean;
|
|
113
|
+
scrapeOptions?: ScrapeParams;
|
|
87
114
|
}
|
|
115
|
+
|
|
88
116
|
/**
|
|
89
117
|
* Response interface for crawling operations.
|
|
118
|
+
* Defines the structure of the response received after initiating a crawl.
|
|
90
119
|
*/
|
|
91
120
|
export interface CrawlResponse {
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
121
|
+
id?: string;
|
|
122
|
+
url?: string;
|
|
123
|
+
success: true;
|
|
95
124
|
error?: string;
|
|
96
125
|
}
|
|
126
|
+
|
|
97
127
|
/**
|
|
98
128
|
* Response interface for job status checks.
|
|
129
|
+
* Provides detailed status of a crawl job including progress and results.
|
|
99
130
|
*/
|
|
100
|
-
export interface
|
|
101
|
-
success:
|
|
102
|
-
|
|
103
|
-
|
|
131
|
+
export interface CrawlStatusResponse {
|
|
132
|
+
success: true;
|
|
133
|
+
total: number;
|
|
134
|
+
completed: number;
|
|
135
|
+
creditsUsed: number;
|
|
136
|
+
expiresAt: Date;
|
|
137
|
+
status: "scraping" | "completed" | "failed";
|
|
138
|
+
next: string;
|
|
104
139
|
data?: FirecrawlDocument[];
|
|
105
|
-
partial_data?: FirecrawlDocument[];
|
|
106
140
|
error?: string;
|
|
107
141
|
}
|
|
142
|
+
|
|
108
143
|
/**
|
|
109
|
-
*
|
|
144
|
+
* Parameters for mapping operations.
|
|
145
|
+
* Defines options for mapping URLs during a crawl.
|
|
110
146
|
*/
|
|
111
|
-
export interface
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
extractionPrompt?: string;
|
|
117
|
-
};
|
|
147
|
+
export interface MapParams {
|
|
148
|
+
search?: string;
|
|
149
|
+
ignoreSitemap?: boolean;
|
|
150
|
+
includeSubdomains?: boolean;
|
|
151
|
+
limit?: number;
|
|
118
152
|
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Response interface for mapping operations.
|
|
156
|
+
* Defines the structure of the response received after a mapping operation.
|
|
157
|
+
*/
|
|
158
|
+
export interface MapResponse {
|
|
159
|
+
success: true;
|
|
160
|
+
links?: string[];
|
|
161
|
+
error?: string;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/**
|
|
165
|
+
* Error response interface.
|
|
166
|
+
* Defines the structure of the response received when an error occurs.
|
|
167
|
+
*/
|
|
168
|
+
export interface ErrorResponse {
|
|
169
|
+
success: false;
|
|
170
|
+
error: string;
|
|
171
|
+
}
|
|
172
|
+
|
|
119
173
|
/**
|
|
120
174
|
* Main class for interacting with the Firecrawl API.
|
|
175
|
+
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
121
176
|
*/
|
|
122
177
|
export default class FirecrawlApp {
|
|
123
|
-
|
|
124
|
-
|
|
178
|
+
public apiKey: string;
|
|
179
|
+
public apiUrl: string;
|
|
125
180
|
|
|
126
181
|
/**
|
|
127
182
|
* Initializes a new instance of the FirecrawlApp class.
|
|
128
|
-
* @param
|
|
183
|
+
* @param config - Configuration options for the FirecrawlApp instance.
|
|
129
184
|
*/
|
|
130
185
|
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
|
131
186
|
this.apiKey = apiKey || "";
|
|
132
187
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
133
|
-
if (!this.apiKey) {
|
|
134
|
-
throw new Error("No API key provided");
|
|
135
|
-
}
|
|
136
188
|
}
|
|
137
189
|
|
|
138
190
|
/**
|
|
139
191
|
* Scrapes a URL using the Firecrawl API.
|
|
140
|
-
* @param
|
|
141
|
-
* @param
|
|
142
|
-
* @returns
|
|
192
|
+
* @param url - The URL to scrape.
|
|
193
|
+
* @param params - Additional parameters for the scrape request.
|
|
194
|
+
* @returns The response from the scrape operation.
|
|
143
195
|
*/
|
|
144
196
|
async scrapeUrl(
|
|
145
197
|
url: string,
|
|
146
|
-
params
|
|
147
|
-
): Promise<ScrapeResponse> {
|
|
198
|
+
params?: ScrapeParams
|
|
199
|
+
): Promise<ScrapeResponse | ErrorResponse> {
|
|
148
200
|
const headers: AxiosRequestHeaders = {
|
|
149
201
|
"Content-Type": "application/json",
|
|
150
202
|
Authorization: `Bearer ${this.apiKey}`,
|
|
151
203
|
} as AxiosRequestHeaders;
|
|
152
|
-
let jsonData:
|
|
153
|
-
if (
|
|
154
|
-
let schema =
|
|
155
|
-
|
|
156
|
-
|
|
204
|
+
let jsonData: any = { url, ...params };
|
|
205
|
+
if (jsonData?.extract?.schema) {
|
|
206
|
+
let schema = jsonData.extract.schema;
|
|
207
|
+
|
|
208
|
+
// Try parsing the schema as a Zod schema
|
|
209
|
+
try {
|
|
157
210
|
schema = zodToJsonSchema(schema);
|
|
211
|
+
} catch (error) {
|
|
212
|
+
|
|
158
213
|
}
|
|
159
214
|
jsonData = {
|
|
160
215
|
...jsonData,
|
|
161
|
-
|
|
162
|
-
...
|
|
163
|
-
|
|
164
|
-
mode: params.extractorOptions.mode || "llm-extraction",
|
|
216
|
+
extract: {
|
|
217
|
+
...jsonData.extract,
|
|
218
|
+
schema: schema,
|
|
165
219
|
},
|
|
166
220
|
};
|
|
167
221
|
}
|
|
168
222
|
try {
|
|
169
223
|
const response: AxiosResponse = await axios.post(
|
|
170
|
-
this.apiUrl +
|
|
224
|
+
this.apiUrl + `/v1/scrape`,
|
|
171
225
|
jsonData,
|
|
172
226
|
{ headers }
|
|
173
227
|
);
|
|
174
228
|
if (response.status === 200) {
|
|
175
229
|
const responseData = response.data;
|
|
176
230
|
if (responseData.success) {
|
|
177
|
-
return
|
|
231
|
+
return {
|
|
232
|
+
success: true,
|
|
233
|
+
warning: responseData.warning,
|
|
234
|
+
error: responseData.error,
|
|
235
|
+
...responseData.data
|
|
236
|
+
};
|
|
178
237
|
} else {
|
|
179
238
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
|
180
239
|
}
|
|
@@ -188,126 +247,161 @@ export default class FirecrawlApp {
|
|
|
188
247
|
}
|
|
189
248
|
|
|
190
249
|
/**
|
|
191
|
-
*
|
|
192
|
-
* @param
|
|
193
|
-
* @param
|
|
194
|
-
* @returns
|
|
250
|
+
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
|
|
251
|
+
* @param query - The search query string.
|
|
252
|
+
* @param params - Additional parameters for the search.
|
|
253
|
+
* @throws Always throws an error advising to use version 0 of the API; no value is ever returned.
|
|
195
254
|
*/
|
|
196
255
|
async search(
|
|
197
256
|
query: string,
|
|
198
|
-
params
|
|
199
|
-
): Promise<
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
257
|
+
params?: any
|
|
258
|
+
): Promise<any> {
|
|
259
|
+
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
264
|
+
* @param url - The URL to crawl.
|
|
265
|
+
* @param params - Additional parameters for the crawl request.
|
|
266
|
+
* @param pollInterval - Interval in seconds between job status checks (defaults to 2; values below 2 are raised to 2 while polling).
|
|
267
|
+
* @param idempotencyKey - Optional idempotency key for the request.
|
|
268
|
+
* @returns The final status of the crawl job once it has completed, or an error response.
|
|
269
|
+
*/
|
|
270
|
+
async crawlUrl(
|
|
271
|
+
url: string,
|
|
272
|
+
params?: CrawlParams,
|
|
273
|
+
pollInterval: number = 2,
|
|
274
|
+
idempotencyKey?: string
|
|
275
|
+
): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
276
|
+
const headers = this.prepareHeaders(idempotencyKey);
|
|
277
|
+
let jsonData: any = { url, ...params };
|
|
208
278
|
try {
|
|
209
|
-
const response: AxiosResponse = await
|
|
210
|
-
this.apiUrl +
|
|
279
|
+
const response: AxiosResponse = await this.postRequest(
|
|
280
|
+
this.apiUrl + `/v1/crawl`,
|
|
211
281
|
jsonData,
|
|
212
|
-
|
|
282
|
+
headers
|
|
213
283
|
);
|
|
214
284
|
if (response.status === 200) {
|
|
215
|
-
const
|
|
216
|
-
|
|
217
|
-
return responseData;
|
|
218
|
-
} else {
|
|
219
|
-
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
|
220
|
-
}
|
|
285
|
+
const id: string = response.data.id;
|
|
286
|
+
return this.monitorJobStatus(id, headers, pollInterval);
|
|
221
287
|
} else {
|
|
222
|
-
this.handleError(response, "
|
|
288
|
+
this.handleError(response, "start crawl job");
|
|
223
289
|
}
|
|
224
290
|
} catch (error: any) {
|
|
225
|
-
|
|
291
|
+
if (error.response?.data?.error) {
|
|
292
|
+
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
|
293
|
+
} else {
|
|
294
|
+
throw new Error(error.message);
|
|
295
|
+
}
|
|
226
296
|
}
|
|
227
297
|
return { success: false, error: "Internal server error." };
|
|
228
298
|
}
|
|
229
299
|
|
|
230
|
-
|
|
231
|
-
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
232
|
-
* @param {string} url - The URL to crawl.
|
|
233
|
-
* @param {Params | null} params - Additional parameters for the crawl request.
|
|
234
|
-
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
|
235
|
-
* @param {number} pollInterval - Time in seconds for job status checks.
|
|
236
|
-
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
|
237
|
-
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
|
238
|
-
*/
|
|
239
|
-
async crawlUrl(
|
|
300
|
+
async asyncCrawlUrl(
|
|
240
301
|
url: string,
|
|
241
|
-
params
|
|
242
|
-
waitUntilDone: boolean = true,
|
|
243
|
-
pollInterval: number = 2,
|
|
302
|
+
params?: CrawlParams,
|
|
244
303
|
idempotencyKey?: string
|
|
245
|
-
): Promise<CrawlResponse |
|
|
304
|
+
): Promise<CrawlResponse | ErrorResponse> {
|
|
246
305
|
const headers = this.prepareHeaders(idempotencyKey);
|
|
247
|
-
let jsonData:
|
|
248
|
-
if (params) {
|
|
249
|
-
jsonData = { ...jsonData, ...params };
|
|
250
|
-
}
|
|
306
|
+
let jsonData: any = { url, ...params };
|
|
251
307
|
try {
|
|
252
308
|
const response: AxiosResponse = await this.postRequest(
|
|
253
|
-
this.apiUrl +
|
|
309
|
+
this.apiUrl + `/v1/crawl`,
|
|
254
310
|
jsonData,
|
|
255
311
|
headers
|
|
256
312
|
);
|
|
257
313
|
if (response.status === 200) {
|
|
258
|
-
|
|
259
|
-
if (waitUntilDone) {
|
|
260
|
-
return this.monitorJobStatus(jobId, headers, pollInterval);
|
|
261
|
-
} else {
|
|
262
|
-
return { success: true, jobId };
|
|
263
|
-
}
|
|
314
|
+
return response.data;
|
|
264
315
|
} else {
|
|
265
316
|
this.handleError(response, "start crawl job");
|
|
266
317
|
}
|
|
267
318
|
} catch (error: any) {
|
|
268
|
-
|
|
269
|
-
|
|
319
|
+
if (error.response?.data?.error) {
|
|
320
|
+
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
|
321
|
+
} else {
|
|
322
|
+
throw new Error(error.message);
|
|
323
|
+
}
|
|
270
324
|
}
|
|
271
325
|
return { success: false, error: "Internal server error." };
|
|
272
326
|
}
|
|
273
327
|
|
|
274
328
|
/**
|
|
275
329
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
276
|
-
* @param
|
|
277
|
-
* @returns
|
|
330
|
+
* @param id - The ID of the crawl operation.
|
|
331
|
+
* @returns The response containing the job status.
|
|
278
332
|
*/
|
|
279
|
-
async checkCrawlStatus(
|
|
333
|
+
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
|
|
334
|
+
if (!id) {
|
|
335
|
+
throw new Error("No crawl ID provided");
|
|
336
|
+
}
|
|
337
|
+
|
|
280
338
|
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
|
281
339
|
try {
|
|
282
340
|
const response: AxiosResponse = await this.getRequest(
|
|
283
|
-
this.apiUrl
|
|
341
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
284
342
|
headers
|
|
285
343
|
);
|
|
286
344
|
if (response.status === 200) {
|
|
287
|
-
return {
|
|
345
|
+
return ({
|
|
288
346
|
success: true,
|
|
289
347
|
status: response.data.status,
|
|
348
|
+
total: response.data.total,
|
|
349
|
+
completed: response.data.completed,
|
|
350
|
+
creditsUsed: response.data.creditsUsed,
|
|
351
|
+
expiresAt: new Date(response.data.expiresAt),
|
|
352
|
+
next: response.data.next,
|
|
290
353
|
data: response.data.data,
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
: undefined,
|
|
294
|
-
};
|
|
354
|
+
error: response.data.error
|
|
355
|
+
})
|
|
295
356
|
} else {
|
|
296
357
|
this.handleError(response, "check crawl status");
|
|
297
358
|
}
|
|
298
359
|
} catch (error: any) {
|
|
299
360
|
throw new Error(error.message);
|
|
300
361
|
}
|
|
301
|
-
return {
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
362
|
+
return { success: false, error: "Internal server error." };
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
async crawlUrlAndWatch(
|
|
366
|
+
url: string,
|
|
367
|
+
params?: CrawlParams,
|
|
368
|
+
idempotencyKey?: string,
|
|
369
|
+
) {
|
|
370
|
+
const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
|
|
371
|
+
|
|
372
|
+
if (crawl.success && crawl.id) {
|
|
373
|
+
const id = crawl.id;
|
|
374
|
+
return new CrawlWatcher(id, this);
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
throw new Error("Crawl job failed to start");
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
|
|
381
|
+
const headers = this.prepareHeaders();
|
|
382
|
+
let jsonData: { url: string } & MapParams = { url, ...params };
|
|
383
|
+
|
|
384
|
+
try {
|
|
385
|
+
const response: AxiosResponse = await this.postRequest(
|
|
386
|
+
this.apiUrl + `/v1/map`,
|
|
387
|
+
jsonData,
|
|
388
|
+
headers
|
|
389
|
+
);
|
|
390
|
+
if (response.status === 200) {
|
|
391
|
+
return response.data as MapResponse;
|
|
392
|
+
} else {
|
|
393
|
+
this.handleError(response, "map");
|
|
394
|
+
}
|
|
395
|
+
} catch (error: any) {
|
|
396
|
+
throw new Error(error.message);
|
|
397
|
+
}
|
|
398
|
+
return { success: false, error: "Internal server error." };
|
|
306
399
|
}
|
|
307
400
|
|
|
308
401
|
/**
|
|
309
402
|
* Prepares the headers for an API request.
|
|
310
|
-
* @
|
|
403
|
+
* @param idempotencyKey - Optional key to ensure idempotency.
|
|
404
|
+
* @returns The prepared headers.
|
|
311
405
|
*/
|
|
312
406
|
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
|
313
407
|
return {
|
|
@@ -319,14 +413,14 @@ export default class FirecrawlApp {
|
|
|
319
413
|
|
|
320
414
|
/**
|
|
321
415
|
* Sends a POST request to the specified URL.
|
|
322
|
-
* @param
|
|
323
|
-
* @param
|
|
324
|
-
* @param
|
|
325
|
-
* @returns
|
|
416
|
+
* @param url - The URL to send the request to.
|
|
417
|
+
* @param data - The data to send in the request.
|
|
418
|
+
* @param headers - The headers for the request.
|
|
419
|
+
* @returns The response from the POST request.
|
|
326
420
|
*/
|
|
327
421
|
postRequest(
|
|
328
422
|
url: string,
|
|
329
|
-
data:
|
|
423
|
+
data: any,
|
|
330
424
|
headers: AxiosRequestHeaders
|
|
331
425
|
): Promise<AxiosResponse> {
|
|
332
426
|
return axios.post(url, data, { headers });
|
|
@@ -334,9 +428,9 @@ export default class FirecrawlApp {
|
|
|
334
428
|
|
|
335
429
|
/**
|
|
336
430
|
* Sends a GET request to the specified URL.
|
|
337
|
-
* @param
|
|
338
|
-
* @param
|
|
339
|
-
* @returns
|
|
431
|
+
* @param url - The URL to send the request to.
|
|
432
|
+
* @param headers - The headers for the request.
|
|
433
|
+
* @returns The response from the GET request.
|
|
340
434
|
*/
|
|
341
435
|
getRequest(
|
|
342
436
|
url: string,
|
|
@@ -347,38 +441,37 @@ export default class FirecrawlApp {
|
|
|
347
441
|
|
|
348
442
|
/**
|
|
349
443
|
* Monitors the status of a crawl job until completion or failure.
|
|
350
|
-
* @param
|
|
351
|
-
* @param
|
|
352
|
-
* @param
|
|
353
|
-
* @
|
|
444
|
+
* @param id - The ID of the crawl operation.
|
|
445
|
+
* @param headers - The headers for the request.
|
|
446
|
+
* @param checkInterval - Interval in seconds between job status checks; values below 2 are raised to 2.
|
|
447
|
+
* NOTE(review): a `checkUrl` parameter was documented here, but the signature has none; the status is always fetched from `/v1/crawl/{id}` on the configured API URL.
|
|
448
|
+
* @returns The final job status or data.
|
|
354
449
|
*/
|
|
355
450
|
async monitorJobStatus(
|
|
356
|
-
|
|
451
|
+
id: string,
|
|
357
452
|
headers: AxiosRequestHeaders,
|
|
358
453
|
checkInterval: number
|
|
359
|
-
): Promise<
|
|
454
|
+
): Promise<CrawlStatusResponse> {
|
|
360
455
|
while (true) {
|
|
361
456
|
const statusResponse: AxiosResponse = await this.getRequest(
|
|
362
|
-
this.apiUrl
|
|
457
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
363
458
|
headers
|
|
364
459
|
);
|
|
365
460
|
if (statusResponse.status === 200) {
|
|
366
461
|
const statusData = statusResponse.data;
|
|
367
462
|
if (statusData.status === "completed") {
|
|
368
463
|
if ("data" in statusData) {
|
|
369
|
-
return statusData
|
|
464
|
+
return statusData;
|
|
370
465
|
} else {
|
|
371
466
|
throw new Error("Crawl job completed but no data was returned");
|
|
372
467
|
}
|
|
373
468
|
} else if (
|
|
374
|
-
["active", "paused", "pending", "queued"].includes(statusData.status)
|
|
469
|
+
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
|
|
375
470
|
) {
|
|
376
|
-
|
|
377
|
-
checkInterval = 2;
|
|
378
|
-
}
|
|
471
|
+
checkInterval = Math.max(checkInterval, 2);
|
|
379
472
|
await new Promise((resolve) =>
|
|
380
473
|
setTimeout(resolve, checkInterval * 1000)
|
|
381
|
-
);
|
|
474
|
+
);
|
|
382
475
|
} else {
|
|
383
476
|
throw new Error(
|
|
384
477
|
`Crawl job failed or was stopped. Status: ${statusData.status}`
|
|
@@ -409,3 +502,111 @@ export default class FirecrawlApp {
|
|
|
409
502
|
}
|
|
410
503
|
}
|
|
411
504
|
}
|
|
505
|
+
|
|
506
|
+
interface CrawlWatcherEvents {
|
|
507
|
+
document: CustomEvent<FirecrawlDocument>,
|
|
508
|
+
done: CustomEvent<{
|
|
509
|
+
status: CrawlStatusResponse["status"];
|
|
510
|
+
data: FirecrawlDocument[];
|
|
511
|
+
}>,
|
|
512
|
+
error: CustomEvent<{
|
|
513
|
+
status: CrawlStatusResponse["status"],
|
|
514
|
+
data: FirecrawlDocument[],
|
|
515
|
+
error: string,
|
|
516
|
+
}>,
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
520
|
+
private ws: WebSocket;
|
|
521
|
+
public data: FirecrawlDocument[];
|
|
522
|
+
public status: CrawlStatusResponse["status"];
|
|
523
|
+
|
|
524
|
+
constructor(id: string, app: FirecrawlApp) {
|
|
525
|
+
super();
|
|
526
|
+
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
|
527
|
+
this.status = "scraping";
|
|
528
|
+
this.data = [];
|
|
529
|
+
|
|
530
|
+
type ErrorMessage = {
|
|
531
|
+
type: "error",
|
|
532
|
+
error: string,
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
type CatchupMessage = {
|
|
536
|
+
type: "catchup",
|
|
537
|
+
data: CrawlStatusResponse,
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
type DocumentMessage = {
|
|
541
|
+
type: "document",
|
|
542
|
+
data: FirecrawlDocument,
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
type DoneMessage = { type: "done" }
|
|
546
|
+
|
|
547
|
+
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
|
548
|
+
|
|
549
|
+
const messageHandler = (msg: Message) => {
|
|
550
|
+
if (msg.type === "done") {
|
|
551
|
+
this.status = "completed";
|
|
552
|
+
this.dispatchTypedEvent("done", new CustomEvent("done", {
|
|
553
|
+
detail: {
|
|
554
|
+
status: this.status,
|
|
555
|
+
data: this.data,
|
|
556
|
+
},
|
|
557
|
+
}));
|
|
558
|
+
} else if (msg.type === "error") {
|
|
559
|
+
this.status = "failed";
|
|
560
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
561
|
+
detail: {
|
|
562
|
+
status: this.status,
|
|
563
|
+
data: this.data,
|
|
564
|
+
error: msg.error,
|
|
565
|
+
},
|
|
566
|
+
}));
|
|
567
|
+
} else if (msg.type === "catchup") {
|
|
568
|
+
this.status = msg.data.status;
|
|
569
|
+
this.data.push(...(msg.data.data ?? []));
|
|
570
|
+
for (const doc of this.data) {
|
|
571
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
572
|
+
detail: doc,
|
|
573
|
+
}));
|
|
574
|
+
}
|
|
575
|
+
} else if (msg.type === "document") {
|
|
576
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
577
|
+
detail: msg.data,
|
|
578
|
+
}));
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
this.ws.onmessage = ((ev: MessageEvent) => {
|
|
583
|
+
if (typeof ev.data !== "string") {
|
|
584
|
+
this.ws.close();
|
|
585
|
+
return;
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
const msg = JSON.parse(ev.data) as Message;
|
|
589
|
+
messageHandler(msg);
|
|
590
|
+
}).bind(this);
|
|
591
|
+
|
|
592
|
+
this.ws.onclose = ((ev: CloseEvent) => {
|
|
593
|
+
const msg = JSON.parse(ev.reason) as Message;
|
|
594
|
+
messageHandler(msg);
|
|
595
|
+
}).bind(this);
|
|
596
|
+
|
|
597
|
+
this.ws.onerror = ((_: Event) => {
|
|
598
|
+
this.status = "failed"
|
|
599
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
600
|
+
detail: {
|
|
601
|
+
status: this.status,
|
|
602
|
+
data: this.data,
|
|
603
|
+
error: "WebSocket error",
|
|
604
|
+
},
|
|
605
|
+
}));
|
|
606
|
+
}).bind(this);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
close() {
|
|
610
|
+
this.ws.close();
|
|
611
|
+
}
|
|
612
|
+
}
|