@mendable/firecrawl 1.2.2 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -0
- package/dist/index.cjs +778 -0
- package/dist/index.d.cts +452 -0
- package/dist/index.d.ts +452 -0
- package/dist/index.js +742 -0
- package/package.json +12 -14
- package/src/__tests__/index.test.ts +18 -9
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +226 -113
- package/src/index.ts +1100 -130
- package/tsconfig.json +19 -105
- package/tsup.config.ts +9 -0
- package/build/cjs/index.js +0 -354
- package/build/cjs/package.json +0 -1
- package/build/esm/index.js +0 -346
- package/build/esm/package.json +0 -1
- package/types/index.d.ts +0 -260
package/src/index.ts
CHANGED
@@ -1,5 +1,5 @@
-import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
-import
+import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
+import * as zt from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 import { WebSocket } from "isows";
 import { TypedEventTarget } from "typescript-event-target";
@@ -58,41 +58,95 @@ export interface FirecrawlDocumentMetadata {
  * Document interface for Firecrawl.
  * Represents a document retrieved or processed by Firecrawl.
  */
-export interface FirecrawlDocument {
+export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
   url?: string;
   markdown?: string;
   html?: string;
   rawHtml?: string;
   links?: string[];
-  extract?:
+  extract?: T;
+  json?: T;
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
+  actions: ActionsSchema;
+  // v1 search only
+  title?: string;
+  description?: string;
 }

 /**
  * Parameters for scraping operations.
  * Defines the options and configurations available for scraping web content.
  */
-export interface
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "
+export interface CrawlScrapeOptions {
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
   onlyMainContent?: boolean;
+  waitFor?: number;
+  timeout?: number;
+  location?: {
+    country?: string;
+    languages?: string[];
+  };
+  mobile?: boolean;
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
+  blockAds?: boolean;
+  proxy?: "basic" | "stealth";
+}
+
+export type Action = {
+  type: "wait",
+  milliseconds?: number,
+  selector?: string,
+} | {
+  type: "click",
+  selector: string,
+} | {
+  type: "screenshot",
+  fullPage?: boolean,
+} | {
+  type: "write",
+  text: string,
+} | {
+  type: "press",
+  key: string,
+} | {
+  type: "scroll",
+  direction?: "up" | "down",
+  selector?: string,
+} | {
+  type: "scrape",
+} | {
+  type: "executeJavascript",
+  script: string,
+};
+
+export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
   extract?: {
     prompt?: string;
-    schema?:
+    schema?: LLMSchema;
     systemPrompt?: string;
   };
-
-
+  jsonOptions?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  }
+  actions?: ActionsSchema;
+}
+
+export interface ActionsResult {
+  screenshots: string[];
 }

 /**
  * Response interface for scraping operations.
  * Defines the structure of the response received after a scraping operation.
  */
-export interface ScrapeResponse extends FirecrawlDocument {
+export interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
   success: true;
   warning?: string;
   error?: string;
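The generics above are the headline typing change in 1.18.0: a Zod schema passed through `extract` (or `jsonOptions`) now flows into the document's `extract`/`json` fields, and supplying an `Action[]` makes `actions` carry an `ActionsResult`. A minimal usage sketch — the URL and schema fields are illustrative, not part of the diff, and an ESM context with top-level await is assumed:

```ts
import FirecrawlApp from "@mendable/firecrawl";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-..." });

// Hypothetical schema: the generic parameters above let the compiler
// infer the shape of `result.extract` from it.
const pageSchema = z.object({
  title: z.string(),
  summary: z.string(),
});

const result = await app.scrapeUrl("https://example.com", {
  formats: ["markdown", "extract"],
  extract: { schema: pageSchema },
  actions: [{ type: "screenshot", fullPage: true }],
});

if (result.success) {
  // `extract` is typed as z.infer<typeof pageSchema>, and
  // `actions.screenshots` exists because an Action[] was supplied.
  console.log(result.extract?.title, result.actions.screenshots.length);
}
```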
@@ -110,8 +164,15 @@ export interface CrawlParams {
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
-  scrapeOptions?:
-  webhook?: string
+  scrapeOptions?: CrawlScrapeOptions;
+  webhook?: string | {
+    url: string;
+    headers?: Record<string, string>;
+    metadata?: Record<string, string>;
+    events?: ["completed", "failed", "page", "started"][number][];
+  };
+  deduplicateSimilarURLs?: boolean;
+  ignoreQueryParameters?: boolean;
 }

 /**
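A webhook can now be an object carrying delivery headers, caller metadata, and an event filter instead of a bare URL. A sketch under the assumption that the existing `crawlUrl` entry point (unchanged in this hunk) accepts these params; all values are placeholders:

```ts
const crawlResult = await app.crawlUrl("https://example.com", {
  limit: 50,
  scrapeOptions: { formats: ["markdown"] },
  webhook: {
    url: "https://my-server.example/firecrawl-events", // placeholder endpoint
    headers: { "x-webhook-secret": "..." },            // sent with each delivery
    metadata: { project: "docs-sync" },                // echoed back in payloads
    events: ["started", "page", "completed", "failed"],
  },
  deduplicateSimilarURLs: true,
  ignoreQueryParameters: true,
});
```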
@@ -125,21 +186,47 @@ export interface CrawlResponse {
   error?: string;
 }

+/**
+ * Response interface for batch scrape operations.
+ * Defines the structure of the response received after initiating a crawl.
+ */
+export interface BatchScrapeResponse {
+  id?: string;
+  url?: string;
+  success: true;
+  error?: string;
+  invalidURLs?: string[];
+}
+
 /**
  * Response interface for job status checks.
  * Provides detailed status of a crawl job including progress and results.
  */
 export interface CrawlStatusResponse {
   success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
+  completed: number;
   total: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};
+
+/**
+ * Response interface for batch scrape job status checks.
+ * Provides detailed status of a batch scrape job including progress and results.
+ */
+export interface BatchScrapeStatusResponse {
+  success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
   completed: number;
+  total: number;
   creditsUsed: number;
   expiresAt: Date;
-
-
-
-  error?: string;
-}
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};

 /**
  * Parameters for mapping operations.
@@ -149,7 +236,9 @@ export interface MapParams {
   search?: string;
   ignoreSitemap?: boolean;
   includeSubdomains?: boolean;
+  sitemapOnly?: boolean;
   limit?: number;
+  timeout?: number;
 }

 /**
@@ -162,6 +251,34 @@ export interface MapResponse {
   error?: string;
 }

+/**
+ * Parameters for extracting information from URLs.
+ * Defines options for extracting information from URLs.
+ */
+export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
+  prompt?: string;
+  schema?: LLMSchema | object;
+  systemPrompt?: string;
+  allowExternalLinks?: boolean;
+  enableWebSearch?: boolean;
+  includeSubdomains?: boolean;
+  origin?: string;
+  showSources?: boolean;
+  scrapeOptions?: CrawlScrapeOptions;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
+  success: boolean;
+  data: LLMSchema;
+  error?: string;
+  warning?: string;
+  sources?: string[];
+}
+
 /**
  * Error response interface.
  * Defines the structure of the response received when an error occurs.
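These types back the `extract()` method added further down in this diff. A hedged sketch, reusing the `app` client from the first example; the schema fields are invented for illustration:

```ts
import { z } from "zod";

// ExtractParams accepts either a Zod schema or a plain JSON-schema object.
const companySchema = z.object({
  mission: z.string(),
  isOpenSource: z.boolean(),
});

const extracted = await app.extract(["https://firecrawl.dev"], {
  prompt: "Extract the company mission and whether it is open source.",
  schema: companySchema,
  showSources: true,
});

if (extracted.success) {
  // `data` is typed as z.infer<typeof companySchema>.
  console.log(extracted.data.mission, extracted.sources);
}
```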
@@ -171,6 +288,131 @@ export interface ErrorResponse {
   error: string;
 }

+/**
+ * Custom error class for Firecrawl.
+ * Extends the built-in Error class to include a status code.
+ */
+export class FirecrawlError extends Error {
+  statusCode: number;
+  details?: any;
+  constructor(message: string, statusCode: number, details?: any) {
+    super(message);
+    this.statusCode = statusCode;
+    this.details = details;
+  }
+}
+
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
+/**
+ * Response interface for crawl/batch scrape error monitoring.
+ */
+export interface CrawlErrorsResponse {
+  /**
+   * Scrapes that errored out + error details
+   */
+  errors: {
+    id: string,
+    timestamp?: string,
+    url: string,
+    error: string,
+  }[];
+
+  /**
+   * URLs blocked by robots.txt
+   */
+  robotsBlocked: string[];
+};
+
+/**
+ * Parameters for deep research operations.
+ * Defines options for conducting deep research on a topic.
+ */
+export interface DeepResearchParams {
+  /**
+   * Maximum depth of research iterations (1-10)
+   * @default 7
+   */
+  maxDepth?: number;
+  /**
+   * Time limit in seconds (30-300)
+   * @default 270
+   */
+  timeLimit?: number;
+  /**
+   * Experimental flag for streaming steps
+   */
+  __experimental_streamSteps?: boolean;
+}
+
+/**
+ * Response interface for deep research operations.
+ */
+export interface DeepResearchResponse {
+  success: boolean;
+  id: string;
+}
+
+/**
+ * Status response interface for deep research operations.
+ */
+export interface DeepResearchStatusResponse {
+  success: boolean;
+  data: {
+    findings: Array<{
+      text: string;
+      source: string;
+    }>;
+    finalAnalysis: string;
+    analysis: string;
+    completedSteps: number;
+    totalSteps: number;
+  };
+  status: "processing" | "completed" | "failed";
+  error?: string;
+  expiresAt: string;
+  currentDepth: number;
+  maxDepth: number;
+  activities: Array<{
+    type: string;
+    status: string;
+    message: string;
+    timestamp: string;
+    depth: number;
+  }>;
+  sources: Array<{
+    url: string;
+    title: string;
+    description: string;
+  }>;
+  summaries: string[];
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
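Nearly every failure path below now throws `FirecrawlError` instead of a bare `Error`, so callers can branch on the HTTP status. A small sketch:

```ts
import FirecrawlApp, { FirecrawlError } from "@mendable/firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

try {
  await app.scrapeUrl("https://example.com");
} catch (err) {
  if (err instanceof FirecrawlError) {
    // statusCode and details are the fields this class adds to Error.
    console.error(`Firecrawl failed (${err.statusCode}): ${err.message}`, err.details);
  } else {
    throw err;
  }
}
```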
@@ -179,13 +421,23 @@ export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;

+  private isCloudService(url: string): boolean {
+    return url.includes('api.firecrawl.dev');
+  }
+
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-
-
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
+      throw new FirecrawlError("No API key provided", 401);
+    }
+
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
   }

   /**
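The practical effect of the new guard: an API key is only mandatory when the client targets the hosted endpoint. Sketch (the self-hosted URL is a placeholder):

```ts
// Cloud endpoint: a key is required, otherwise the constructor throws
// FirecrawlError("No API key provided", 401).
const cloud = new FirecrawlApp({ apiKey: "fc-..." });

// Self-hosted endpoint: the key may be omitted.
const selfHosted = new FirecrawlApp({ apiKey: null, apiUrl: "http://localhost:3002" });
```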
@@ -194,10 +446,10 @@ export default class FirecrawlApp {
    * @param params - Additional parameters for the scrape request.
    * @returns The response from the scrape operation.
    */
-  async scrapeUrl(
+  async scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(
     url: string,
-    params?: ScrapeParams
-  ): Promise<ScrapeResponse | ErrorResponse> {
+    params?: ScrapeParams<T, ActionsSchema>
+  ): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> {
     const headers: AxiosRequestHeaders = {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
@@ -220,6 +472,23 @@ export default class FirecrawlApp {
         },
       };
     }
+
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
     try {
       const response: AxiosResponse = await axios.post(
         this.apiUrl + `/v1/scrape`,
@@ -236,28 +505,92 @@ export default class FirecrawlApp {
             ...responseData.data
           };
         } else {
-          throw new
+          throw new FirecrawlError(`Failed to scrape URL. Error: ${responseData.error}`, response.status);
         }
       } else {
         this.handleError(response, "scrape URL");
       }
     } catch (error: any) {
-
+      this.handleError(error.response, "scrape URL");
     }
     return { success: false, error: "Internal server error." };
   }

   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
-  async search(
-
-
-
-
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument<any>[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }

   /**
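Sketch of the new `search()` surface; when params are omitted, the defaults visible above apply (limit 5, lang "en", country "us", 60-second timeout). The query is illustrative:

```ts
const results = await app.search("firecrawl web scraping", {
  limit: 3,
  scrapeOptions: { formats: ["markdown"] },
});

for (const doc of results.data) {
  // `title` and `description` are the search-only fields added to
  // FirecrawlDocument earlier in this diff.
  console.log(doc.title, doc.url, doc.description);
}
```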
@@ -290,9 +623,9 @@ export default class FirecrawlApp {
       }
     } catch (error: any) {
       if (error.response?.data?.error) {
-        throw new
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
       } else {
-        throw new
+        throw new FirecrawlError(error.message, 500);
       }
     }
     return { success: false, error: "Internal server error." };
@@ -318,9 +651,9 @@ export default class FirecrawlApp {
       }
     } catch (error: any) {
       if (error.response?.data?.error) {
-        throw new
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
       } else {
-        throw new
+        throw new FirecrawlError(error.message, 500);
       }
     }
     return { success: false, error: "Internal server error." };
@@ -329,40 +662,134 @@
   /**
    * Checks the status of a crawl job using the Firecrawl API.
    * @param id - The ID of the crawl operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
+   * @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
+   * @param limit - How many entries to return. Only used when `getAllData = false`.
    * @returns The response containing the job status.
    */
-  async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
+  async checkCrawlStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<CrawlStatusResponse | ErrorResponse> {
     if (!id) {
-      throw new
+      throw new FirecrawlError("No crawl ID provided", 400);
     }

     const headers: AxiosRequestHeaders = this.prepareHeaders();
+    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/crawl/${id}`);
+    if (skip !== undefined) {
+      targetURL.searchParams.set("skip", skip.toString());
+    }
+    if (limit !== undefined) {
+      targetURL.searchParams.set("limit", limit.toString());
+    }
+
     try {
       const response: AxiosResponse = await this.getRequest(
-
+        targetURL.href,
         headers
       );
       if (response.status === 200) {
-
-
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                break
+              }
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+
+        let resp: CrawlStatusResponse | ErrorResponse = {
+          success: response.data.success,
           status: response.data.status,
           total: response.data.total,
           completed: response.data.completed,
           creditsUsed: response.data.creditsUsed,
+          next: getAllData ? undefined : response.data.next,
           expiresAt: new Date(response.data.expiresAt),
-
-
-
-
+          data: allData
+        }
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as CrawlStatusResponse).next = response.data.next;
+        }
+
+        return resp;
       } else {
         this.handleError(response, "check crawl status");
       }
     } catch (error: any) {
-      throw new
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Returns information about crawl errors.
+   * @param id - The ID of the crawl operation.
+   * @returns Information about crawl errors.
+   */
+  async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check crawl errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
     }
     return { success: false, error: "Internal server error." };
   }

+  /**
+   * Cancels a crawl job using the Firecrawl API.
+   * @param id - The ID of the crawl operation.
+   * @returns The response from the cancel crawl operation.
+   */
+  async cancelCrawl(id: string): Promise<ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "cancel crawl job");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a crawl job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param url - The URL to crawl.
+   * @param params - Additional parameters for the crawl request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the crawl job.
+   */
   async crawlUrlAndWatch(
     url: string,
     params?: CrawlParams,
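The new parameters give `checkCrawlStatus` two pagination modes: `getAllData = true` follows every `next` link before returning, while `nextURL`/`skip`/`limit` support manual paging. A sketch with a placeholder crawl ID:

```ts
const crawlId = "00000000-0000-0000-0000-000000000000"; // placeholder

// Mode 1: let the SDK follow all `next` links before returning.
const full = await app.checkCrawlStatus(crawlId, true);

// Mode 2: manual paging, 50 documents at a time.
let page = await app.checkCrawlStatus(crawlId, false, undefined, 0, 50);
while (page.success && page.status === "completed" && page.next) {
  page = await app.checkCrawlStatus(crawlId, false, page.next);
}
```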
@@ -375,9 +802,15 @@
       return new CrawlWatcher(id, this);
     }

-    throw new
+    throw new FirecrawlError("Crawl job failed to start", 400);
   }

+  /**
+   * Maps a URL using the Firecrawl API.
+   * @param url - The URL to map.
+   * @param params - Additional parameters for the map request.
+   * @returns The response from the map operation.
+   */
   async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
     let jsonData: { url: string } & MapParams = { url, ...params };
@@ -394,50 +827,442 @@
         this.handleError(response, "map");
       }
     } catch (error: any) {
-      throw new
+      throw new FirecrawlError(error.message, 500);
     }
     return { success: false, error: "Internal server error." };
   }

   /**
-   *
-   * @param
-   * @
-
-
-
-
-      Authorization: `Bearer ${this.apiKey}`,
-      ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
-    } as AxiosRequestHeaders & { "x-idempotency-key"?: string };
-  }
-
-  /**
-   * Sends a POST request to the specified URL.
-   * @param url - The URL to send the request to.
-   * @param data - The data to send in the request.
-   * @param headers - The headers for the request.
-   * @returns The response from the POST request.
+   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+   * @param url - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds for job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @param webhook - Optional webhook for the batch scrape.
+   * @returns The response from the crawl operation.
    */
-
-
-
-
-
-
-
+  async batchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
+    if (jsonData?.extract?.schema) {
+      let schema = jsonData.extract.schema;

-
-
-
-
-
-
-
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        extract: {
+          ...jsonData.extract,
+          schema: schema,
+        },
+      };
+    }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  async asyncBatchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+  ): Promise<BatchScrapeResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URL to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the crawl job.
+   */
+  async batchScrapeUrlsAndWatch(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+  ) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
+
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+
+    throw new FirecrawlError("Batch scrape job failed to start", 400);
+  }
+
+  /**
+   * Checks the status of a batch scrape job using the Firecrawl API.
+   * @param id - The ID of the batch scrape operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
+   * @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
+   * @param limit - How many entries to return. Only used when `getAllData = false`.
+   * @returns The response containing the job status.
+   */
+  async checkBatchScrapeStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new FirecrawlError("No batch scrape ID provided", 400);
+    }
+
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/batch/scrape/${id}`);
+    if (skip !== undefined) {
+      targetURL.searchParams.set("skip", skip.toString());
+    }
+    if (limit !== undefined) {
+      targetURL.searchParams.set("limit", limit.toString());
+    }
+
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        targetURL.href,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                break
+              }
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+
+        let resp: BatchScrapeStatusResponse | ErrorResponse = {
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          next: getAllData ? undefined : response.data.next,
+          expiresAt: new Date(response.data.expiresAt),
+          data: allData
+        }
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as BatchScrapeStatusResponse).next = response.data.next;
+        }
+
+        return resp;
+      } else {
+        this.handleError(response, "check batch scrape status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Returns information about batch scrape errors.
+   * @param id - The ID of the batch scrape operation.
+   * @returns Information about batch scrape errors.
+   */
+  async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check batch scrape errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Extracts information from URLs using the Firecrawl API.
+   * Currently in Beta. Expect breaking changes on future minor versions.
+   * @param url - The URL to extract information from.
+   * @param params - Additional parameters for the extract request.
+   * @returns The response from the extract operation.
+   */
+  async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
+    const headers = this.prepareHeaders();
+
+    let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
+    let jsonSchema: any;
+    try {
+      if (!params?.schema) {
+        jsonSchema = undefined;
+      } else if (params.schema instanceof zt.ZodType) {
+        jsonSchema = zodToJsonSchema(params.schema);
+      } else {
+        jsonSchema = params.schema;
+      }
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
+    }
+
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
+        headers
+      );
+
+      if (response.status === 200) {
+        const jobId = response.data.id;
+        let extractStatus;
+        do {
+          const statusResponse: AxiosResponse = await this.getRequest(
+            `${this.apiUrl}/v1/extract/${jobId}`,
+            headers
+          );
+          extractStatus = statusResponse.data;
+          if (extractStatus.status === "completed") {
+            if (extractStatus.success) {
+              return {
+                success: true,
+                data: extractStatus.data,
+                warning: extractStatus.warning,
+                error: extractStatus.error,
+                sources: extractStatus?.sources || undefined,
+              };
+            } else {
+              throw new FirecrawlError(`Failed to extract data. Error: ${extractStatus.error}`, statusResponse.status);
+            }
+          } else if (extractStatus.status === "failed" || extractStatus.status === "cancelled") {
+            throw new FirecrawlError(`Extract job ${extractStatus.status}. Error: ${extractStatus.error}`, statusResponse.status);
+          }
+          await new Promise(resolve => setTimeout(resolve, 1000)); // Polling interval
+        } while (extractStatus.status !== "completed");
+      } else {
+        this.handleError(response, "extract");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+    return { success: false, error: "Internal server error."};
+  }
+
+  /**
+   * Initiates an asynchronous extract job for a URL using the Firecrawl API.
+   * @param url - The URL to extract data from.
+   * @param params - Additional parameters for the extract request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the extract operation.
+   */
+  async asyncExtract(
+    urls: string[],
+    params?: ExtractParams,
+    idempotencyKey?: string
+  ): Promise<ExtractResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...params };
+    let jsonSchema: any;
+
+    try {
+      if (params?.schema instanceof zt.ZodType) {
+        jsonSchema = zodToJsonSchema(params.schema);
+      } else {
+        jsonSchema = params?.schema;
+      }
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema },
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start extract job");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Retrieves the status of an extract job.
+   * @param jobId - The ID of the extract job.
+   * @returns The status of the extract job.
+   */
+  async getExtractStatus(jobId: string): Promise<any> {
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/extract/${jobId}`,
+        this.prepareHeaders()
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "get extract status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+  }
+
+  /**
+   * Prepares the headers for an API request.
+   * @param idempotencyKey - Optional key to ensure idempotency.
+   * @returns The prepared headers.
+   */
+  prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
+    return {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+      ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
+    } as AxiosRequestHeaders & { "x-idempotency-key"?: string };
+  }
+
+  /**
+   * Sends a POST request to the specified URL.
+   * @param url - The URL to send the request to.
+   * @param data - The data to send in the request.
+   * @param headers - The headers for the request.
+   * @returns The response from the POST request.
+   */
+  postRequest(
+    url: string,
+    data: any,
+    headers: AxiosRequestHeaders
+  ): Promise<AxiosResponse> {
+    return axios.post(url, data, { headers });
+  }
+
+  /**
+   * Sends a GET request to the specified URL.
+   * @param url - The URL to send the request to.
+   * @param headers - The headers for the request.
+   * @returns The response from the GET request.
+   */
+  async getRequest(
     url: string,
     headers: AxiosRequestHeaders
   ): Promise<AxiosResponse> {
-
+    try {
+      return await axios.get(url, { headers });
+    } catch (error) {
+      if (error instanceof AxiosError && error.response) {
+        return error.response as AxiosResponse;
+      } else {
+        throw error;
+      }
+    }
+  }
+
+  /**
+   * Sends a DELETE request to the specified URL.
+   * @param url - The URL to send the request to.
+   * @param headers - The headers for the request.
+   * @returns The response from the DELETE request.
+   */
+  async deleteRequest(
+    url: string,
+    headers: AxiosRequestHeaders
+  ): Promise<AxiosResponse> {
+    try {
+      return await axios.delete(url, { headers });
+    } catch (error) {
+      if (error instanceof AxiosError && error.response) {
+        return error.response as AxiosResponse;
+      } else {
+        throw error;
+      }
+    }
   }

 /**
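Sketch of the fire-and-forget batch flow added above, pairing `asyncBatchScrapeUrls` with `checkBatchScrapeStatus`; the URLs are placeholders:

```ts
const batch = await app.asyncBatchScrapeUrls(
  ["https://example.com", "https://firecrawl.dev"],
  { formats: ["markdown"] },
  undefined, // idempotencyKey
  undefined, // webhook
  true       // ignoreInvalidURLs
);

if (batch.success && batch.id) {
  // `true` asks the SDK to follow all `next` pages once the job completes.
  const status = await app.checkBatchScrapeStatus(batch.id, true);
  if (status.success) {
    console.log(`${status.completed}/${status.total} scraped`, status.data.length);
  }
}
```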
@@ -452,42 +1277,50 @@
     id: string,
     headers: AxiosRequestHeaders,
     checkInterval: number
-  ): Promise<CrawlStatusResponse> {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  ): Promise<CrawlStatusResponse | ErrorResponse> {
+    try {
+      while (true) {
+        let statusResponse: AxiosResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === 'object' && 'next' in statusData) {
+                if (data.length === 0) {
+                  break
+                }
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
+            }
+          } else if (
+            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
+          ) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise((resolve) =>
+              setTimeout(resolve, checkInterval * 1000)
+            );
           } else {
-            throw new
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
           }
-        } else if (
-          ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
-        ) {
-          checkInterval = Math.max(checkInterval, 2);
-          await new Promise((resolve) =>
-            setTimeout(resolve, checkInterval * 1000)
-          );
         } else {
-
-          `Crawl job failed or was stopped. Status: ${statusData.status}`
-          );
+          this.handleError(statusResponse, "check crawl status");
         }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
       }
+    } catch (error: any) {
+      throw new FirecrawlError(error, 500);
     }
   }

@@ -497,41 +1330,162 @@ export default class FirecrawlApp {
    * @param {string} action - The action being performed when the error occurred.
    */
   handleError(response: AxiosResponse, action: string): void {
-    if ([402, 408, 409, 500].includes(response.status)) {
+    if ([400, 402, 408, 409, 500].includes(response.status)) {
       const errorMessage: string =
         response.data.error || "Unknown error occurred";
-
-
+      const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : '';
+      throw new FirecrawlError(
+        `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}${details}`,
+        response.status,
+        response?.data?.details
       );
     } else {
-      throw new
-        `Unexpected error occurred while trying to ${action}. Status code: ${response.status}
+      throw new FirecrawlError(
+        `Unexpected error occurred while trying to ${action}. Status code: ${response.status}`,
+        response.status
+      );
+    }
+  }
+
+  /**
+   * Initiates a deep research operation on a given topic and polls until completion.
+   * @param params - Parameters for the deep research operation.
+   * @returns The final research results.
+   */
+  async __deepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchStatusResponse | ErrorResponse> {
+    try {
+      const response = await this.__asyncDeepResearch(topic, params);
+
+      if (!response.success || 'error' in response) {
+        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
+      }
+
+      if (!response.id) {
+        throw new FirecrawlError(`Failed to start research. No job ID returned.`, 500);
+      }
+
+      const jobId = response.id;
+      let researchStatus;
+
+      while (true) {
+        // console.log("Checking research status...");
+        researchStatus = await this.__checkDeepResearchStatus(jobId);
+        // console.log("Research status:", researchStatus);
+
+        if ('error' in researchStatus && !researchStatus.success) {
+          return researchStatus;
+        }
+
+        if (researchStatus.status === "completed") {
+          return researchStatus;
+        }
+
+        if (researchStatus.status === "failed") {
+          throw new FirecrawlError(
+            `Research job ${researchStatus.status}. Error: ${researchStatus.error}`,
+            500
+          );
+        }
+
+        if (researchStatus.status !== "processing") {
+          break;
+        }
+
+        await new Promise(resolve => setTimeout(resolve, 2000));
+      }
+      // console.log("Research status finished:", researchStatus);
+
+      return { success: false, error: "Research job terminated unexpectedly" };
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+  }
+
+  /**
+   * Initiates a deep research operation on a given topic without polling.
+   * @param params - Parameters for the deep research operation.
+   * @returns The response containing the research job ID.
+   */
+  async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        `${this.apiUrl}/v1/deep-research`,
+        { topic, ...params },
+        headers
+      );
+
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start deep research");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Checks the status of a deep research operation.
+   * @param id - The ID of the deep research operation.
+   * @returns The current status and results of the research operation.
+   */
+  async __checkDeepResearchStatus(id: string): Promise<DeepResearchStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/deep-research/${id}`,
+        headers
       );
+
+      if (response.status === 200) {
+        return response.data;
+      } else if (response.status === 404) {
+        throw new FirecrawlError("Deep research job not found", 404);
+      } else {
+        this.handleError(response, "check deep research status");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
     }
+    return { success: false, error: "Internal server error." };
   }
 }

 interface CrawlWatcherEvents {
-  document: CustomEvent<FirecrawlDocument
+  document: CustomEvent<FirecrawlDocument<undefined>>,
   done: CustomEvent<{
     status: CrawlStatusResponse["status"];
-    data: FirecrawlDocument[];
+    data: FirecrawlDocument<undefined>[];
   }>,
   error: CustomEvent<{
     status: CrawlStatusResponse["status"],
-    data: FirecrawlDocument[],
+    data: FirecrawlDocument<undefined>[],
     error: string,
   }>,
 }

 export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
   private ws: WebSocket;
-  public data: FirecrawlDocument[];
+  public data: FirecrawlDocument<undefined>[];
   public status: CrawlStatusResponse["status"];
+  public id: string;

   constructor(id: string, app: FirecrawlApp) {
     super();
-    this.
+    this.id = id;
+    // replace `http` with `ws` (`http://` -> `ws://` and `https://` -> `wss://`)
+    const wsUrl = app.apiUrl.replace(/^http/, "ws");
+    this.ws = new WebSocket(`${wsUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];

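The double-underscore prefix marks the deep-research methods as experimental. A hedged sketch of the polling wrapper; the topic and limits are illustrative:

```ts
const research = await app.__deepResearch("history of web crawlers", {
  maxDepth: 3,   // research iterations (1-10, default 7)
  timeLimit: 120 // seconds (30-300, default 270)
});

if (research.success && "data" in research) {
  console.log(research.data.finalAnalysis);
}
```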
@@ -547,7 +1501,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {

   type DocumentMessage = {
     type: "document",
-    data: FirecrawlDocument
+    data: FirecrawlDocument<undefined>,
   }

   type DoneMessage = { type: "done" }
@@ -561,6 +1515,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         detail: {
           status: this.status,
           data: this.data,
+          id: this.id,
         },
       }));
     } else if (msg.type === "error") {
@@ -570,6 +1525,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           status: this.status,
           data: this.data,
           error: msg.error,
+          id: this.id,
         },
       }));
     } else if (msg.type === "catchup") {
@@ -577,12 +1533,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
       this.data.push(...(msg.data.data ?? []));
       for (const doc of this.data) {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail:
+          detail: {
+            ...doc,
+            id: this.id,
+          },
         }));
       }
     } else if (msg.type === "document") {
       this.dispatchTypedEvent("document", new CustomEvent("document", {
-        detail:
+        detail: {
+          ...msg.data,
+          id: this.id,
+        },
       }));
     }
   }
@@ -592,14 +1554,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         this.ws.close();
         return;
       }
-
-
-
+      try {
+        const msg = JSON.parse(ev.data) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on message", error);
+      }
     }).bind(this);

     this.ws.onclose = ((ev: CloseEvent) => {
-
-
+      try {
+        const msg = JSON.parse(ev.reason) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on close", error);
+      }
     }).bind(this);

     this.ws.onerror = ((_: Event) => {
@@ -609,6 +1578,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           status: this.status,
           data: this.data,
           error: "WebSocket error",
+          id: this.id,
         },
       }));
     }).bind(this);
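With `id` now stored on the watcher and echoed into every event detail, several concurrent watchers can be told apart. A closing sketch:

```ts
const watcher = await app.crawlUrlAndWatch("https://example.com", { limit: 10 });

watcher.addEventListener("document", (ev) => {
  console.log(`[${watcher.id}] scraped`, ev.detail.url);
});

watcher.addEventListener("done", (ev) => {
  console.log(`[${watcher.id}] finished with ${ev.detail.data.length} documents`);
});
```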