firecrawl 1.29.3 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/LICENSE +0 -0
- package/README.md +85 -78
- package/audit-ci.jsonc +4 -0
- package/dist/chunk-JFWW4BWA.js +85 -0
- package/dist/index.cjs +964 -39
- package/dist/index.d.cts +529 -11
- package/dist/index.d.ts +529 -11
- package/dist/index.js +952 -27
- package/dist/package-KYZ3HXR5.js +4 -0
- package/dump.rdb +0 -0
- package/jest.config.js +0 -0
- package/package.json +6 -6
- package/src/__tests__/e2e/v2/batch.test.ts +74 -0
- package/src/__tests__/e2e/v2/crawl.test.ts +182 -0
- package/src/__tests__/e2e/v2/extract.test.ts +70 -0
- package/src/__tests__/e2e/v2/map.test.ts +55 -0
- package/src/__tests__/e2e/v2/scrape.test.ts +130 -0
- package/src/__tests__/e2e/v2/search.test.ts +247 -0
- package/src/__tests__/e2e/v2/usage.test.ts +36 -0
- package/src/__tests__/e2e/v2/utils/idmux.ts +58 -0
- package/src/__tests__/e2e/v2/watcher.test.ts +96 -0
- package/src/__tests__/unit/v2/errorHandler.test.ts +19 -0
- package/src/__tests__/unit/v2/scrape.unit.test.ts +11 -0
- package/src/__tests__/unit/v2/validation.test.ts +59 -0
- package/src/index.backup.ts +2146 -0
- package/src/index.ts +27 -2134
- package/src/v1/index.ts +2158 -0
- package/src/v2/client.ts +281 -0
- package/src/v2/methods/batch.ts +131 -0
- package/src/v2/methods/crawl.ts +160 -0
- package/src/v2/methods/extract.ts +86 -0
- package/src/v2/methods/map.ts +37 -0
- package/src/v2/methods/scrape.ts +26 -0
- package/src/v2/methods/search.ts +69 -0
- package/src/v2/methods/usage.ts +39 -0
- package/src/v2/types.ts +308 -0
- package/src/v2/utils/errorHandler.ts +18 -0
- package/src/v2/utils/getVersion.ts +14 -0
- package/src/v2/utils/httpClient.ts +99 -0
- package/src/v2/utils/validation.ts +50 -0
- package/src/v2/watcher.ts +159 -0
- package/tsconfig.json +2 -1
- package/tsup.config.ts +0 -0
- package/dist/package-Z6F7JDXI.js +0 -111
- /package/src/__tests__/{v1/e2e_withAuth → e2e/v1}/index.test.ts +0 -0
- /package/src/__tests__/{v1/unit → unit/v1}/monitor-job-status-retry.test.ts +0 -0
package/src/index.backup.ts

@@ -0,0 +1,2146 @@
+import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
+import * as zt from "zod";
+import { zodToJsonSchema } from "zod-to-json-schema";
+import { TypedEventTarget } from "typescript-event-target";
+
+/**
+ * Configuration interface for FirecrawlApp.
+ * @param apiKey - Optional API key for authentication.
+ * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
+ */
+export interface FirecrawlAppConfig {
+  apiKey?: string | null;
+  apiUrl?: string | null;
+}
+
+/**
+ * Metadata for a Firecrawl document.
+ * Includes various optional properties for document metadata.
+ */
+export interface FirecrawlDocumentMetadata {
+  title?: string;
+  description?: string;
+  language?: string;
+  keywords?: string;
+  robots?: string;
+  ogTitle?: string;
+  ogDescription?: string;
+  ogUrl?: string;
+  ogImage?: string;
+  ogAudio?: string;
+  ogDeterminer?: string;
+  ogLocale?: string;
+  ogLocaleAlternate?: string[];
+  ogSiteName?: string;
+  ogVideo?: string;
+  dctermsCreated?: string;
+  dcDateCreated?: string;
+  dcDate?: string;
+  dctermsType?: string;
+  dcType?: string;
+  dctermsAudience?: string;
+  dctermsSubject?: string;
+  dcSubject?: string;
+  dcDescription?: string;
+  dctermsKeywords?: string;
+  modifiedTime?: string;
+  publishedTime?: string;
+  articleTag?: string;
+  articleSection?: string;
+  sourceURL?: string;
+  statusCode?: number;
+  error?: string;
+  proxyUsed?: "basic" | "stealth";
+  cacheState?: "miss" | "hit";
+  cachedAt?: string;
+  [key: string]: any; // Allows for additional metadata properties not explicitly defined.
+}
+
+/**
+ * Document interface for Firecrawl.
+ * Represents a document retrieved or processed by Firecrawl.
+ */
+export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
+  url?: string;
+  markdown?: string;
+  html?: string;
+  rawHtml?: string;
+  links?: string[];
+  extract?: T;
+  json?: T;
+  screenshot?: string;
+  metadata?: FirecrawlDocumentMetadata;
+  actions: ActionsSchema;
+  changeTracking?: {
+    previousScrapeAt: string | null;
+    changeStatus: "new" | "same" | "changed" | "removed";
+    visibility: "visible" | "hidden";
+    diff?: {
+      text: string;
+      json: {
+        files: Array<{
+          from: string | null;
+          to: string | null;
+          chunks: Array<{
+            content: string;
+            changes: Array<{
+              type: string;
+              normal?: boolean;
+              ln?: number;
+              ln1?: number;
+              ln2?: number;
+              content: string;
+            }>;
+          }>;
+        }>;
+      };
+    };
+    json?: any;
+  };
+  // v1 search only
+  title?: string;
+  description?: string;
+}
+
+/**
+ * Parameters for scraping operations.
+ * Defines the options and configurations available for scraping web content.
+ */
+export interface CrawlScrapeOptions {
+  formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "changeTracking")[];
+  headers?: Record<string, string>;
+  includeTags?: string[];
+  excludeTags?: string[];
+  onlyMainContent?: boolean;
+  waitFor?: number;
+  timeout?: number;
+  location?: {
+    country?: string;
+    languages?: string[];
+  };
+  mobile?: boolean;
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
+  blockAds?: boolean;
+  proxy?: "basic" | "stealth" | "auto";
+  storeInCache?: boolean;
+  maxAge?: number;
+  parsePDF?: boolean;
+}
+
+export type Action = {
+  type: "wait",
+  milliseconds?: number,
+  selector?: string,
+} | {
+  type: "click",
+  selector: string,
+  all?: boolean,
+} | {
+  type: "screenshot",
+  fullPage?: boolean,
+  quality?: number,
+} | {
+  type: "write",
+  text: string,
+} | {
+  type: "press",
+  key: string,
+} | {
+  type: "scroll",
+  direction?: "up" | "down",
+  selector?: string,
+} | {
+  type: "scrape",
+} | {
+  type: "executeJavascript",
+  script: string,
+};
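The `Action` union above scripts browser steps that run before content is captured. A minimal sketch of a plausible action sequence, with illustrative selector and script values, assuming the type is importable from the package root:

```ts
import type { Action } from "firecrawl";

// Hypothetical sequence: wait for content, click a button, scroll,
// take a full-page screenshot, then evaluate a script.
const actions: Action[] = [
  { type: "wait", selector: "#content" },
  { type: "click", selector: "button.load-more", all: false },
  { type: "scroll", direction: "down" },
  { type: "screenshot", fullPage: true },
  { type: "executeJavascript", script: "document.title" },
];
```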
+
+export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
+  extract?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  jsonOptions?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  changeTrackingOptions?: {
+    prompt?: string;
+    schema?: any;
+    modes?: ("json" | "git-diff")[];
+    tag?: string | null;
+  };
+  actions?: ActionsSchema;
+  agent?: AgentOptions;
+  zeroDataRetention?: boolean;
+}
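`ScrapeParams` layers LLM extraction on top of `CrawlScrapeOptions`: a Zod schema can be passed directly in `extract.schema` (or `jsonOptions.schema`), and `scrapeUrl` below converts it with `zodToJsonSchema` before sending. A sketch with a hypothetical schema, assuming the types are re-exported from the package root:

```ts
import { z } from "zod";
import type { ScrapeParams } from "firecrawl";

const productSchema = z.object({
  name: z.string(),
  price: z.number(),
});

const params: ScrapeParams<typeof productSchema> = {
  formats: ["markdown", "extract"],
  onlyMainContent: true,
  extract: {
    schema: productSchema,
    prompt: "Extract the product name and price.",
  },
};
```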
+
+export interface ActionsResult {
+  screenshots: string[];
+  scrapes: ({
+    url: string;
+    html: string;
+  })[];
+  javascriptReturns: {
+    type: string;
+    value: unknown;
+  }[];
+}
+
+/**
+ * Response interface for scraping operations.
+ * Defines the structure of the response received after a scraping operation.
+ */
+export interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
+  success: true;
+  warning?: string;
+  error?: string;
+}
+
+/**
+ * Parameters for crawling operations.
+ * Includes options for both scraping and mapping during a crawl.
+ */
+export interface CrawlParams {
+  includePaths?: string[];
+  excludePaths?: string[];
+  maxDepth?: number;
+  maxDiscoveryDepth?: number;
+  limit?: number;
+  allowBackwardLinks?: boolean;
+  crawlEntireDomain?: boolean;
+  allowExternalLinks?: boolean;
+  ignoreSitemap?: boolean;
+  scrapeOptions?: CrawlScrapeOptions;
+  webhook?: string | {
+    url: string;
+    headers?: Record<string, string>;
+    metadata?: Record<string, string>;
+    events?: ["completed", "failed", "page", "started"][number][];
+  };
+  deduplicateSimilarURLs?: boolean;
+  ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
+  /**
+   * Delay in seconds between scrapes. This helps respect website rate limits.
+   * If not provided, the crawler may use the robots.txt crawl delay if available.
+   */
+  delay?: number;
+  allowSubdomains?: boolean;
+  maxConcurrency?: number;
+  zeroDataRetention?: boolean;
+}
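`CrawlParams.webhook` accepts either a bare URL or an object that also carries custom headers, metadata, and an event filter. A sketch with illustrative values only:

```ts
import type { CrawlParams } from "firecrawl";

const crawlParams: CrawlParams = {
  includePaths: ["^/blog/.*"],
  maxDepth: 3,
  limit: 100,
  delay: 1, // seconds between scrapes, to respect rate limits
  webhook: {
    url: "https://example.com/firecrawl-webhook",
    headers: { "x-webhook-secret": "<secret>" },
    events: ["completed", "failed"],
  },
};
```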
+
+/**
+ * Response interface for crawling operations.
+ * Defines the structure of the response received after initiating a crawl.
+ */
+export interface CrawlResponse {
+  id?: string;
+  url?: string;
+  success: true;
+  error?: string;
+}
+
+/**
+ * Response interface for batch scrape operations.
+ * Defines the structure of the response received after initiating a batch scrape.
+ */
+export interface BatchScrapeResponse {
+  id?: string;
+  url?: string;
+  success: true;
+  error?: string;
+  invalidURLs?: string[];
+}
+
+/**
+ * Response interface for job status checks.
+ * Provides detailed status of a crawl job including progress and results.
+ */
+export interface CrawlStatusResponse {
+  success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
+  completed: number;
+  total: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};
+
+/**
+ * Response interface for batch scrape job status checks.
+ * Provides detailed status of a batch scrape job including progress and results.
+ */
+export interface BatchScrapeStatusResponse {
+  success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
+  completed: number;
+  total: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};
+
+/**
+ * Parameters for mapping operations.
+ * Defines options for mapping URLs during a crawl.
+ */
+export interface MapParams {
+  search?: string;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  sitemapOnly?: boolean;
+  limit?: number;
+  timeout?: number;
+  useIndex?: boolean;
+}
+
+/**
+ * Response interface for mapping operations.
+ * Defines the structure of the response received after a mapping operation.
+ */
+export interface MapResponse {
+  success: true;
+  links?: string[];
+  error?: string;
+}
+
+/**
+ * Options for the agent used during scrape operations.
+ */
+export interface AgentOptions {
+  model?: string;
+  prompt?: string;
+  sessionId?: string;
+}
+
+/**
+ * Options for the agent used during extract operations.
+ */
+export interface AgentOptionsExtract {
+  model?: string;
+  sessionId?: string;
+}
+
+export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
+  prompt?: string;
+  schema?: LLMSchema | object;
+  systemPrompt?: string;
+  allowExternalLinks?: boolean;
+  enableWebSearch?: boolean;
+  includeSubdomains?: boolean;
+  origin?: string;
+  showSources?: boolean;
+  scrapeOptions?: CrawlScrapeOptions;
+  agent?: AgentOptionsExtract;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
+  success: boolean;
+  data: LLMSchema;
+  error?: string;
+  warning?: string;
+  sources?: string[];
+}
+
+/**
+ * Error response interface.
+ * Defines the structure of the response received when an error occurs.
+ */
+export interface ErrorResponse {
+  success: false;
+  error: string;
+}
+
+/**
+ * Custom error class for Firecrawl.
+ * Extends the built-in Error class to include a status code.
+ */
+export class FirecrawlError extends Error {
+  statusCode: number;
+  details?: any;
+  constructor(message: string, statusCode: number, details?: any) {
+    super(message);
+    this.statusCode = statusCode;
+    this.details = details;
+  }
+}
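Since `FirecrawlError` exposes `statusCode` and optional `details`, callers can distinguish API failures from other exceptions. A sketch, assuming the class and the default export are both importable from the package root:

```ts
import FirecrawlApp, { FirecrawlError } from "firecrawl";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

try {
  await app.scrapeUrl("https://example.com");
} catch (err) {
  if (err instanceof FirecrawlError) {
    // Branch on the HTTP-style status code carried by the error.
    console.error(`Firecrawl failed (${err.statusCode}):`, err.message, err.details);
  } else {
    throw err;
  }
}
```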
+
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
+/**
+ * Response interface for crawl/batch scrape error monitoring.
+ */
+export interface CrawlErrorsResponse {
+  /**
+   * Scrapes that errored out + error details
+   */
+  errors: {
+    id: string,
+    timestamp?: string,
+    url: string,
+    error: string,
+  }[];
+
+  /**
+   * URLs blocked by robots.txt
+   */
+  robotsBlocked: string[];
+};
+
+/**
+ * Parameters for deep research operations.
+ * Defines options for conducting deep research on a query.
+ */
+export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
+  /**
+   * Maximum depth of research iterations (1-10)
+   * @default 7
+   */
+  maxDepth?: number;
+  /**
+   * Time limit in seconds (30-300)
+   * @default 270
+   */
+  timeLimit?: number;
+  /**
+   * Maximum number of URLs to analyze (1-1000)
+   * @default 20
+   */
+  maxUrls?: number;
+  /**
+   * The prompt to use for the final analysis
+   */
+  analysisPrompt?: string;
+  /**
+   * The system prompt to use for the research agent
+   */
+  systemPrompt?: string;
+  /**
+   * The formats to use for the final analysis
+   */
+  formats?: ("markdown" | "json")[];
+  /**
+   * The JSON options to use for the final analysis
+   */
+  jsonOptions?: {
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  /**
+   * Experimental flag for streaming steps
+   */
+  // __experimental_streamSteps?: boolean;
+}
+
+/**
+ * Response interface for deep research operations.
+ */
+export interface DeepResearchResponse {
+  success: boolean;
+  id: string;
+}
+
+/**
+ * Status response interface for deep research operations.
+ */
+export interface DeepResearchStatusResponse {
+  success: boolean;
+  data: {
+    finalAnalysis: string;
+    activities: Array<{
+      type: string;
+      status: string;
+      message: string;
+      timestamp: string;
+      depth: number;
+    }>;
+    sources: Array<{
+      url: string;
+      title: string;
+      description: string;
+    }>;
+  };
+  status: "processing" | "completed" | "failed";
+  error?: string;
+  expiresAt: string;
+  currentDepth: number;
+  maxDepth: number;
+  activities: Array<{
+    type: string;
+    status: string;
+    message: string;
+    timestamp: string;
+    depth: number;
+  }>;
+  sources: Array<{
+    url: string;
+    title: string;
+    description: string;
+  }>;
+  summaries: string[];
+}
+
+/**
+ * Parameters for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextParams {
+  /**
+   * Maximum number of URLs to process (1-100)
+   * @default 10
+   */
+  maxUrls?: number;
+  /**
+   * Whether to show the full LLMs-full.txt in the response
+   * @default false
+   */
+  showFullText?: boolean;
+  /**
+   * Whether to use cached content if available
+   * @default true
+   */
+  cache?: boolean;
+  /**
+   * Experimental flag for streaming
+   */
+  __experimental_stream?: boolean;
+}
+
+/**
+ * Response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextResponse {
+  success: boolean;
+  id: string;
+}
+
+/**
+ * Status response interface for LLMs.txt generation operations.
+ */
+export interface GenerateLLMsTextStatusResponse {
+  success: boolean;
+  data: {
+    llmstxt: string;
+    llmsfulltxt?: string;
+  };
+  status: "processing" | "completed" | "failed";
+  error?: string;
+  expiresAt: string;
+}
+
+/**
+ * Main class for interacting with the Firecrawl API.
+ * Provides methods for scraping, searching, crawling, and mapping web content.
+ */
+export default class FirecrawlApp {
+  public apiKey: string;
+  public apiUrl: string;
+  public version: string = "1.25.1";
+
+  private isCloudService(url: string): boolean {
+    return url.includes('api.firecrawl.dev');
+  }
+
+  private async getVersion(): Promise<string> {
+    try {
+      const packageJson = await import('../package.json', { assert: { type: 'json' } });
+      return packageJson.default.version;
+    } catch (error) {
+      console.error("Error getting version:", error);
+      return "1.25.1";
+    }
+  }
+
+  private async init() {
+    this.version = await this.getVersion();
+  }
+
+  /**
+   * Initializes a new instance of the FirecrawlApp class.
+   * @param config - Configuration options for the FirecrawlApp instance.
+   */
+  constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
+      throw new FirecrawlError("No API key provided", 401);
+    }
+
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
+    this.init();
+  }
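The constructor only demands an API key when the base URL points at the hosted service (see `isCloudService` above); self-hosted instances may omit it. Two illustrative instantiations (the key and the self-hosted port are placeholders):

```ts
import FirecrawlApp from "firecrawl";

// Cloud: the default apiUrl is api.firecrawl.dev, so a key is mandatory.
const cloud = new FirecrawlApp({ apiKey: "fc-..." });

// Self-hosted: with a non-cloud apiUrl the key may be null and
// this.apiKey falls back to an empty string.
const selfHosted = new FirecrawlApp({ apiKey: null, apiUrl: "http://localhost:3002" });
```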
+
+  /**
+   * Scrapes a URL using the Firecrawl API.
+   * @param url - The URL to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @returns The response from the scrape operation.
+   */
+  async scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(
+    url: string,
+    params?: ScrapeParams<T, ActionsSchema>
+  ): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
+    if (jsonData?.extract?.schema) {
+      let schema = jsonData.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+        // Not a Zod schema; pass it through unchanged.
+      }
+      jsonData = {
+        ...jsonData,
+        extract: {
+          ...jsonData.extract,
+          schema: schema,
+        },
+      };
+    }
+
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+        // Not a Zod schema; pass it through unchanged.
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
+    try {
+      const response: AxiosResponse = await axios.post(
+        this.apiUrl + `/v1/scrape`,
+        jsonData,
+        { headers, timeout: params?.timeout !== undefined ? (params.timeout + 5000) : undefined },
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            warning: responseData.warning,
+            error: responseData.error,
+            ...responseData.data
+          };
+        } else {
+          throw new FirecrawlError(`Failed to scrape URL. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "scrape URL");
+      }
+    } catch (error: any) {
+      this.handleError(error.response, "scrape URL");
+    }
+    return { success: false, error: "Internal server error." };
+  }
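A usage sketch for `scrapeUrl` (hypothetical URL and schema): the Zod schema types `result.extract` through `zt.infer<T>`, and checking `success` narrows the union between `ScrapeResponse` and `ErrorResponse`:

```ts
import { z } from "zod";
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });
const schema = z.object({ headline: z.string() });

const result = await app.scrapeUrl("https://example.com", {
  formats: ["markdown", "extract"],
  extract: { schema },
  timeout: 30000, // the SDK adds 5s of slack to the HTTP timeout
});

if (result.success) {
  console.log(result.markdown);
  console.log(result.extract?.headline); // typed as string | undefined
}
```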
+
+  /**
+   * Searches using the Firecrawl API and optionally scrapes the results.
+   * @param query - The search query string.
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
+   */
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: `js-sdk@${this.version}`,
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+        // Not a Zod schema; pass it through unchanged.
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument<any>[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
+  }
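`search` fills in defaults (`limit: 5`, `lang: "en"`, `country: "us"`, a 60 s timeout, and empty `scrapeOptions.formats`); passing formats additionally scrapes each hit. Illustrative usage:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

const results = await app.search("firecrawl js sdk", {
  limit: 3,
  scrapeOptions: { formats: ["markdown"] },
});

if (results.success) {
  for (const doc of results.data) {
    // v1 search results carry title/description alongside scraped content.
    console.log(doc.url, doc.title);
  }
}
```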
+
+  /**
+   * Initiates a crawl job for a URL using the Firecrawl API.
+   * @param url - The URL to crawl.
+   * @param params - Additional parameters for the crawl request.
+   * @param pollInterval - Time in seconds for job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the crawl operation.
+   */
+  async crawlUrl(
+    url: string,
+    params?: CrawlParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string
+  ): Promise<CrawlStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/crawl`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start crawl job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  async asyncCrawlUrl(
+    url: string,
+    params?: CrawlParams,
+    idempotencyKey?: string
+  ): Promise<CrawlResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/crawl`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start crawl job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
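`crawlUrl` blocks until the job finishes (it delegates to `monitorJobStatus`), while `asyncCrawlUrl` returns the job ID immediately for manual polling. A sketch with placeholder values:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

// Blocking: resolves when the crawl completes, polling every 2 seconds.
const done = await app.crawlUrl("https://example.com", { limit: 50 }, 2);

// Non-blocking: returns { id } right away.
const started = await app.asyncCrawlUrl("https://example.com", { limit: 50 });
if (started.success && started.id) {
  console.log("crawl id:", started.id);
}
```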
+
+  /**
+   * Checks the status of a crawl job using the Firecrawl API.
+   * @param id - The ID of the crawl operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
+   * @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
+   * @param limit - How many entries to return. Only used when `getAllData = false`.
+   * @returns The response containing the job status.
+   */
+  async checkCrawlStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<CrawlStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new FirecrawlError("No crawl ID provided", 400);
+    }
+
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/crawl/${id}`);
+    if (skip !== undefined) {
+      targetURL.searchParams.set("skip", skip.toString());
+    }
+    if (limit !== undefined) {
+      targetURL.searchParams.set("limit", limit.toString());
+    }
+
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        targetURL.href,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data;
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                break;
+              }
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+
+        let resp: CrawlStatusResponse | ErrorResponse = {
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          next: getAllData ? undefined : response.data.next,
+          expiresAt: new Date(response.data.expiresAt),
+          data: allData
+        };
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as CrawlStatusResponse).next = response.data.next;
+        }
+
+        return resp;
+      } else {
+        this.handleError(response, "check crawl status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
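`checkCrawlStatus` offers two pagination modes: `getAllData = true` follows every `next` link internally, while manual paging uses `skip`/`limit` and the returned `next` URL. A manual-paging sketch (placeholder crawl ID):

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

let page = await app.checkCrawlStatus("<crawl-id>", false, undefined, 0, 10);
while (page.success && page.next) {
  console.log(`fetched ${page.data.length} of ${page.total} documents`);
  page = await app.checkCrawlStatus("<crawl-id>", false, page.next);
}
```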
+
+  /**
+   * Returns information about crawl errors.
+   * @param id - The ID of the crawl operation.
+   * @returns Information about crawl errors.
+   */
+  async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check crawl errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Cancels a crawl job using the Firecrawl API.
+   * @param id - The ID of the crawl operation.
+   * @returns The response from the cancel crawl operation.
+   */
+  async cancelCrawl(id: string): Promise<ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "cancel crawl job");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a crawl job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param url - The URL to crawl.
+   * @param params - Additional parameters for the crawl request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the crawl job.
+   */
+  async crawlUrlAndWatch(
+    url: string,
+    params?: CrawlParams,
+    idempotencyKey?: string,
+  ) {
+    const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
+
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+
+    throw new FirecrawlError("Crawl job failed to start", 400);
+  }
+
+  /**
+   * Maps a URL using the Firecrawl API.
+   * @param url - The URL to map.
+   * @param params - Additional parameters for the map request.
+   * @returns The response from the map operation.
+   */
+  async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/map`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data as MapResponse;
+      } else {
+        this.handleError(response, "map");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds for job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @param webhook - Optional webhook for the batch scrape.
+   * @param ignoreInvalidURLs - Optional flag to ignore invalid URLs.
+   * @returns The response from the batch scrape operation.
+   */
+  async batchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+    maxConcurrency?: number,
+  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, maxConcurrency, ...params, origin: `js-sdk@${this.version}` };
+    if (jsonData?.extract?.schema) {
+      let schema = jsonData.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+        // Not a Zod schema; pass it through unchanged.
+      }
+      jsonData = {
+        ...jsonData,
+        extract: {
+          ...jsonData.extract,
+          schema: schema,
+        },
+      };
+    }
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+        // Not a Zod schema; pass it through unchanged.
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
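`batchScrapeUrls` mirrors `crawlUrl` for a fixed URL list; when `ignoreInvalidURLs` is set, bad entries are reported (via `invalidURLs` on the async variant) instead of failing the whole job. Sketch with placeholder values:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

const batch = await app.batchScrapeUrls(
  ["https://example.com", "https://example.org"],
  { formats: ["markdown"] },
  2,         // poll every 2 seconds
  undefined, // no idempotency key
  undefined, // no webhook
  true,      // ignoreInvalidURLs
);

if (batch.success) {
  console.log(`scraped ${batch.completed}/${batch.total} pages`);
}
```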
+
+  async asyncBatchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+  ): Promise<BatchScrapeResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the batch scrape job.
+   */
+  async batchScrapeUrlsAndWatch(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
+  ) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
+
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+
+    throw new FirecrawlError("Batch scrape job failed to start", 400);
+  }
+
+  /**
+   * Checks the status of a batch scrape job using the Firecrawl API.
+   * @param id - The ID of the batch scrape operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
+   * @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
+   * @param limit - How many entries to return. Only used when `getAllData = false`.
+   * @returns The response containing the job status.
+   */
+  async checkBatchScrapeStatus(id?: string, getAllData = false, nextURL?: string, skip?: number, limit?: number): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new FirecrawlError("No batch scrape ID provided", 400);
+    }
+
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/batch/scrape/${id}`);
+    if (skip !== undefined) {
+      targetURL.searchParams.set("skip", skip.toString());
+    }
+    if (limit !== undefined) {
+      targetURL.searchParams.set("limit", limit.toString());
+    }
+
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        targetURL.href,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data;
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while (typeof statusData === 'object' && 'next' in statusData) {
+              if (data.length === 0) {
+                break;
+              }
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+
+        let resp: BatchScrapeStatusResponse | ErrorResponse = {
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          next: getAllData ? undefined : response.data.next,
+          expiresAt: new Date(response.data.expiresAt),
+          data: allData
+        };
+
+        if (!response.data.success && response.data.error) {
+          resp = {
+            ...resp,
+            success: false,
+            error: response.data.error
+          } as ErrorResponse;
+        }
+
+        if (response.data.next) {
+          (resp as BatchScrapeStatusResponse).next = response.data.next;
+        }
+
+        return resp;
+      } else {
+        this.handleError(response, "check batch scrape status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Returns information about batch scrape errors.
+   * @param id - The ID of the batch scrape operation.
+   * @returns Information about batch scrape errors.
+   */
+  async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check batch scrape errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Extracts information from URLs using the Firecrawl API.
+   * Currently in Beta. Expect breaking changes on future minor versions.
+   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
+   * @param params - Additional parameters for the extract request.
+   * @returns The response from the extract operation.
+   */
+  async extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
+    const headers = this.prepareHeaders();
+
+    let jsonData: { urls?: string[] } & ExtractParams<T> = { urls: urls, ...params };
+    let jsonSchema: any;
+    try {
+      if (!params?.schema) {
+        jsonSchema = undefined;
+      } else {
+        try {
+          jsonSchema = zodToJsonSchema(params.schema as zt.ZodType);
+        } catch (_) {
+          // Not a Zod schema; assume it is already a JSON schema object.
+          jsonSchema = params.schema;
+        }
+      }
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
+        headers
+      );
+
+      if (response.status === 200) {
+        const jobId = response.data.id;
+        let extractStatus;
+        do {
+          const statusResponse: AxiosResponse = await this.getRequest(
+            `${this.apiUrl}/v1/extract/${jobId}`,
+            headers
+          );
+          extractStatus = statusResponse.data;
+          if (extractStatus.status === "completed") {
+            if (extractStatus.success) {
+              return {
+                success: true,
+                data: extractStatus.data,
+                warning: extractStatus.warning,
+                error: extractStatus.error,
+                sources: extractStatus?.sources || undefined,
+              };
+            } else {
+              throw new FirecrawlError(`Failed to extract data. Error: ${extractStatus.error}`, statusResponse.status);
+            }
+          } else if (extractStatus.status === "failed" || extractStatus.status === "cancelled") {
+            throw new FirecrawlError(`Extract job ${extractStatus.status}. Error: ${extractStatus.error}`, statusResponse.status);
+          }
+          await new Promise(resolve => setTimeout(resolve, 1000)); // Polling interval
+        } while (extractStatus.status !== "completed");
+      } else {
+        this.handleError(response, "extract");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
+    }
+    return { success: false, error: "Internal server error." };
+  }
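From the caller's perspective `extract` is synchronous: it submits the job, then polls `/v1/extract/{jobId}` once per second until it completes (see the `do`/`while` above). A sketch with a hypothetical schema:

```ts
import { z } from "zod";
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." });

const companySchema = z.object({
  mission: z.string(),
  isOpenSource: z.boolean(),
});

const extracted = await app.extract(["https://firecrawl.dev"], {
  schema: companySchema,
  prompt: "Extract the company mission and whether it is open source.",
});

if (extracted.success) {
  console.log(extracted.data.mission, extracted.data.isOpenSource);
}
```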
1304
|
+
|
|
1305
|
+
/**
|
|
1306
|
+
* Initiates an asynchronous extract job for a URL using the Firecrawl API.
|
|
1307
|
+
* @param url - The URL to extract data from.
|
|
1308
|
+
* @param params - Additional parameters for the extract request.
|
|
1309
|
+
* @param idempotencyKey - Optional idempotency key for the request.
|
|
1310
|
+
* @returns The response from the extract operation.
|
|
1311
|
+
*/
|
|
1312
|
+
async asyncExtract(
|
|
1313
|
+
urls: string[],
|
|
1314
|
+
params?: ExtractParams,
|
|
1315
|
+
idempotencyKey?: string
|
|
1316
|
+
): Promise<ExtractResponse | ErrorResponse> {
|
|
1317
|
+
const headers = this.prepareHeaders(idempotencyKey);
|
|
1318
|
+
let jsonData: any = { urls, ...params };
|
|
1319
|
+
let jsonSchema: any;
|
|
1320
|
+
|
|
1321
|
+
try {
|
|
1322
|
+
if (!params?.schema) {
|
|
1323
|
+
jsonSchema = undefined;
|
|
1324
|
+
} else {
|
|
1325
|
+
try {
|
|
1326
|
+
jsonSchema = zodToJsonSchema(params.schema as zt.ZodType);
|
|
1327
|
+
} catch (_) {
|
|
1328
|
+
jsonSchema = params.schema;
|
|
1329
|
+
}
|
|
1330
|
+
}
|
|
1331
|
+
} catch (error: any) {
|
|
1332
|
+
throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
|
|
1333
|
+
}
|
|
1334
|
+
|
|
1335
|
+
try {
|
|
1336
|
+
const response: AxiosResponse = await this.postRequest(
|
|
1337
|
+
this.apiUrl + `/v1/extract`,
|
|
1338
|
+
{ ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
|
|
1339
|
+
headers
|
|
1340
|
+
);
|
|
1341
|
+
|
|
1342
|
+
if (response.status === 200) {
|
|
1343
|
+
return response.data;
|
|
1344
|
+
} else {
|
|
1345
|
+
this.handleError(response, "start extract job");
|
|
1346
|
+
}
|
|
1347
|
+
} catch (error: any) {
|
|
1348
|
+
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
|
|
1349
|
+
}
|
|
1350
|
+
return { success: false, error: "Internal server error." };
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
/**
|
|
1354
|
+
* Retrieves the status of an extract job.
|
|
1355
|
+
* @param jobId - The ID of the extract job.
|
|
1356
|
+
* @returns The status of the extract job.
|
|
1357
|
+
*/
|
|
1358
|
+
async getExtractStatus(jobId: string): Promise<any> {
|
|
1359
|
+
try {
|
|
1360
|
+
const response: AxiosResponse = await this.getRequest(
|
|
1361
|
+
`${this.apiUrl}/v1/extract/${jobId}`,
|
|
1362
|
+
this.prepareHeaders()
|
|
1363
|
+
);
|
|
1364
|
+
|
|
1365
|
+
if (response.status === 200) {
|
|
1366
|
+
return response.data;
|
|
1367
|
+
} else {
|
|
1368
|
+
this.handleError(response, "get extract status");
|
|
1369
|
+
}
|
|
1370
|
+
} catch (error: any) {
|
|
1371
|
+
throw new FirecrawlError(error.message, 500);
|
|
1372
|
+
}
|
|
1373
|
+
}
|
|
1374
|
+
|
|
1375
|
+
/**
|
|
1376
|
+
* Prepares the headers for an API request.
|
|
1377
|
+
* @param idempotencyKey - Optional key to ensure idempotency.
|
|
1378
|
+
* @returns The prepared headers.
|
|
1379
|
+
*/
|
|
1380
|
+
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
|
1381
|
+
return {
|
|
1382
|
+
"Content-Type": "application/json",
|
|
1383
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
1384
|
+
...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
|
|
1385
|
+
} as AxiosRequestHeaders & { "x-idempotency-key"?: string };
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
/**
|
|
1389
|
+
* Sends a POST request to the specified URL.
|
|
1390
|
+
* @param url - The URL to send the request to.
|
|
1391
|
+
* @param data - The data to send in the request.
|
|
1392
|
+
* @param headers - The headers for the request.
|
|
1393
|
+
* @returns The response from the POST request.
|
|
1394
|
+
*/
|
|
1395
|
+
postRequest(
|
|
1396
|
+
url: string,
|
|
1397
|
+
data: any,
|
|
1398
|
+
headers: AxiosRequestHeaders
|
|
1399
|
+
): Promise<AxiosResponse> {
|
|
1400
|
+
return axios.post(url, data, { headers, timeout: (data?.timeout ? (data.timeout + 5000) : undefined) });
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1403
|
+
/**
|
|
1404
|
+
* Sends a GET request to the specified URL.
|
|
1405
|
+
* @param url - The URL to send the request to.
|
|
1406
|
+
* @param headers - The headers for the request.
|
|
1407
|
+
* @returns The response from the GET request.
|
|
1408
|
+
*/
|
|
1409
|
+
async getRequest(
|
|
1410
|
+
url: string,
|
|
1411
|
+
headers: AxiosRequestHeaders
|
|
1412
|
+
): Promise<AxiosResponse> {
|
|
1413
|
+
try {
|
|
1414
|
+
return await axios.get(url, { headers });
|
|
1415
|
+
} catch (error) {
|
|
1416
|
+
if (error instanceof AxiosError && error.response) {
|
|
1417
|
+
return error.response as AxiosResponse;
|
|
1418
|
+
} else {
|
|
1419
|
+
throw error;
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
}

  /**
   * Sends a DELETE request to the specified URL.
   * @param url - The URL to send the request to.
   * @param headers - The headers for the request.
   * @returns The response from the DELETE request.
   */
  async deleteRequest(
    url: string,
    headers: AxiosRequestHeaders
  ): Promise<AxiosResponse> {
    try {
      return await axios.delete(url, { headers });
    } catch (error) {
      if (error instanceof AxiosError && error.response) {
        return error.response as AxiosResponse;
      } else {
        throw error;
      }
    }
  }
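
  // Design note: unlike raw axios, getRequest and deleteRequest resolve with
  // non-2xx responses instead of throwing, so callers can branch on
  // response.status and route failures through handleError. Only errors that
  // carry no response at all (e.g. DNS or socket failures) propagate as
  // exceptions.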

  /**
   * Monitors the status of a crawl job until completion or failure.
   * @param id - The ID of the crawl operation.
   * @param headers - The headers for the request.
   * @param checkInterval - Interval in seconds for job status checks.
   * @returns The final job status or data.
   */
  async monitorJobStatus(
    id: string,
    headers: AxiosRequestHeaders,
    checkInterval: number
  ): Promise<CrawlStatusResponse | ErrorResponse> {
    let failedTries = 0;
    let networkRetries = 0;
    const maxNetworkRetries = 3;

    while (true) {
      try {
        let statusResponse: AxiosResponse = await this.getRequest(
          `${this.apiUrl}/v1/crawl/${id}`,
          headers
        );

        if (statusResponse.status === 200) {
          failedTries = 0;
          networkRetries = 0;
          let statusData = statusResponse.data;

          if (statusData.status === "completed") {
            if ("data" in statusData) {
              let data = statusData.data;
              while (typeof statusData === 'object' && 'next' in statusData) {
                if (data.length === 0) {
                  break;
                }
                statusResponse = await this.getRequest(statusData.next, headers);
                statusData = statusResponse.data;
                data = data.concat(statusData.data);
              }
              statusData.data = data;
              return statusData;
            } else {
              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
            }
          } else if (
            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
          ) {
            checkInterval = Math.max(checkInterval, 2);
            await new Promise((resolve) =>
              setTimeout(resolve, checkInterval * 1000)
            );
          } else {
            throw new FirecrawlError(
              `Crawl job failed or was stopped. Status: ${statusData.status}`,
              500
            );
          }
        } else {
          failedTries++;
          if (failedTries >= 3) {
            this.handleError(statusResponse, "check crawl status");
          }
        }
      } catch (error: any) {
        if (this.isRetryableError(error) && networkRetries < maxNetworkRetries) {
          networkRetries++;
          const backoffDelay = Math.min(1000 * Math.pow(2, networkRetries - 1), 10000);

          await new Promise((resolve) => setTimeout(resolve, backoffDelay));
          continue;
        }

        throw new FirecrawlError(error, 500);
      }
    }
  }
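
  // How the pagination above behaves: a completed crawl may be returned in
  // pages, each pointing at the following page via `next`. The inner loop
  // walks those links and concatenates every page's `data` before returning,
  // so callers always receive the full document list. Roughly (the shape of
  // the `next` URL is illustrative):
  //
  //   page 1: { status: "completed", data: [d1, d2], next: "<apiUrl>/v1/crawl/<id>?skip=2" }
  //   page 2: { status: "completed", data: [d3] }   // no `next`, loop ends
  //   result: { status: "completed", data: [d1, d2, d3] }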

  /**
   * Determines if an error is retryable (transient network error)
   * @param error - The error to check
   * @returns True if the error should be retried
   */
  private isRetryableError(error: any): boolean {
    if (error instanceof AxiosError) {
      if (!error.response) {
        const code = error.code;
        const message = error.message?.toLowerCase() || '';

        return (
          code === 'ECONNRESET' ||
          code === 'ETIMEDOUT' ||
          code === 'ENOTFOUND' ||
          code === 'ECONNREFUSED' ||
          message.includes('socket hang up') ||
          message.includes('network error') ||
          message.includes('timeout')
        );
      }

      if (error.response?.status === 408 || error.response?.status === 504) {
        return true;
      }
    }

    if (error && typeof error === 'object') {
      const code = error.code;
      const message = error.message?.toLowerCase() || '';

      if (code === 'ECONNRESET' ||
          code === 'ETIMEDOUT' ||
          code === 'ENOTFOUND' ||
          code === 'ECONNREFUSED' ||
          message.includes('socket hang up') ||
          message.includes('network error') ||
          message.includes('timeout')) {
        return true;
      }

      if (error.response?.status === 408 || error.response?.status === 504) {
        return true;
      }
    }

    return false;
  }
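
  // Retry budget used by monitorJobStatus: at most 3 retries for retryable
  // errors, with exponential backoff of min(1000 * 2^(n - 1), 10000) ms,
  // i.e. 1000 ms, 2000 ms, and 4000 ms for n = 1..3. The 10 s cap would only
  // take effect from a 5th retry onward, beyond the current maximum.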

  /**
   * Handles errors from API responses.
   * @param {AxiosResponse} response - The response from the API.
   * @param {string} action - The action being performed when the error occurred.
   */
  handleError(response: AxiosResponse, action: string): void {
    if (!response) {
      throw new FirecrawlError(
        `No response received while trying to ${action}. This may be a network error or the server is unreachable.`,
        0
      );
    }

    if ([400, 402, 403, 408, 409, 500].includes(response.status)) {
      const errorMessage: string =
        response.data.error || "Unknown error occurred";
      const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : '';
      throw new FirecrawlError(
        `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}${details}`,
        response.status,
        response?.data?.details
      );
    } else {
      throw new FirecrawlError(
        `Unexpected error occurred while trying to ${action}. Status code: ${response.status}`,
        response.status
      );
    }
  }
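
  // Shape of the message thrown for a recognized status code (the action,
  // status, and error text below are illustrative values only):
  //
  //   Failed to check crawl status. Status code: 402. Error: Payment Required - {"reason":"..."}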

  /**
   * Initiates a deep research operation on a given query and polls until completion.
   * @param query - The query to research.
   * @param params - Parameters for the deep research operation.
   * @param onActivity - Optional callback to receive activity updates in real-time.
   * @param onSource - Optional callback to receive source updates in real-time.
   * @returns The final research results.
   */
  async deepResearch(
    query: string,
    params: DeepResearchParams<zt.ZodSchema>,
    onActivity?: (activity: {
      type: string;
      status: string;
      message: string;
      timestamp: string;
      depth: number;
    }) => void,
    onSource?: (source: {
      url: string;
      title?: string;
      description?: string;
      icon?: string;
    }) => void
  ): Promise<DeepResearchStatusResponse | ErrorResponse> {
    try {
      const response = await this.asyncDeepResearch(query, params);

      if (!response.success || 'error' in response) {
        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
      }

      if (!response.id) {
        throw new FirecrawlError(`Failed to start research. No job ID returned.`, 500);
      }

      const jobId = response.id;
      let researchStatus;
      let lastActivityCount = 0;
      let lastSourceCount = 0;

      while (true) {
        researchStatus = await this.checkDeepResearchStatus(jobId);

        if ('error' in researchStatus && !researchStatus.success) {
          return researchStatus;
        }

        // Stream new activities through the callback if provided
        if (onActivity && researchStatus.activities) {
          const newActivities = researchStatus.activities.slice(lastActivityCount);
          for (const activity of newActivities) {
            onActivity(activity);
          }
          lastActivityCount = researchStatus.activities.length;
        }

        // Stream new sources through the callback if provided
        if (onSource && researchStatus.sources) {
          const newSources = researchStatus.sources.slice(lastSourceCount);
          for (const source of newSources) {
            onSource(source);
          }
          lastSourceCount = researchStatus.sources.length;
        }

        if (researchStatus.status === "completed") {
          return researchStatus;
        }

        if (researchStatus.status === "failed") {
          throw new FirecrawlError(
            `Research job ${researchStatus.status}. Error: ${researchStatus.error}`,
            500
          );
        }

        if (researchStatus.status !== "processing") {
          break;
        }

        await new Promise(resolve => setTimeout(resolve, 2000));
      }

      return { success: false, error: "Research job terminated unexpectedly" };
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
  }

  /**
   * Initiates a deep research operation on a given query without polling.
   * @param query - The query to research.
   * @param params - Parameters for the deep research operation.
   * @returns The response containing the research job ID.
   */
  async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` };

    if (jsonData?.jsonOptions?.schema) {
      let schema = jsonData.jsonOptions.schema;
      // Try parsing the schema as a Zod schema
      try {
        schema = zodToJsonSchema(schema);
      } catch (error) {
        // Ignore error if schema can't be parsed as Zod
      }
      jsonData = {
        ...jsonData,
        jsonOptions: {
          ...jsonData.jsonOptions,
          schema: schema,
        },
      };
    }

    try {
      const response: AxiosResponse = await this.postRequest(
        `${this.apiUrl}/v1/deep-research`,
        jsonData,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start deep research");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Checks the status of a deep research operation.
   * @param id - The ID of the deep research operation.
   * @returns The current status and results of the research operation.
   */
  async checkDeepResearchStatus(id: string): Promise<DeepResearchStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/deep-research/${id}`,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else if (response.status === 404) {
        throw new FirecrawlError("Deep research job not found", 404);
      } else {
        this.handleError(response, "check deep research status");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }
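
  // Minimal usage sketch for the polling variant (the query, the `maxDepth`
  // parameter, and the console handling are illustrative; `maxDepth` is
  // assumed to be a valid DeepResearchParams field):
  //
  //   const research = await app.deepResearch(
  //     "What are the main approaches to LLM evaluation?",
  //     { maxDepth: 3 },
  //     (activity) => console.log(`[depth ${activity.depth}] ${activity.message}`),
  //     (source) => console.log(`source: ${source.url}`)
  //   );
  //   if (research.success) console.log(research);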

  /**
   * @deprecated Use deepResearch() instead
   * Initiates a deep research operation on a given topic and polls until completion.
   * @param topic - The topic to research.
   * @param params - Parameters for the deep research operation.
   * @param onActivity - Optional callback to receive activity updates in real-time.
   * @returns The final research results.
   */
  async __deepResearch(
    topic: string,
    params: DeepResearchParams,
    onActivity?: (activity: {
      type: string;
      status: string;
      message: string;
      timestamp: string;
      depth: number;
    }) => void
  ): Promise<DeepResearchStatusResponse | ErrorResponse> {
    try {
      const response = await this.__asyncDeepResearch(topic, params);

      if (!response.success || 'error' in response) {
        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
      }

      if (!response.id) {
        throw new FirecrawlError(`Failed to start research. No job ID returned.`, 500);
      }

      const jobId = response.id;
      let researchStatus;
      let lastActivityCount = 0;

      while (true) {
        researchStatus = await this.__checkDeepResearchStatus(jobId);

        if ('error' in researchStatus && !researchStatus.success) {
          return researchStatus;
        }

        // Stream new activities through the callback if provided
        if (onActivity && researchStatus.activities) {
          const newActivities = researchStatus.activities.slice(lastActivityCount);
          for (const activity of newActivities) {
            onActivity(activity);
          }
          lastActivityCount = researchStatus.activities.length;
        }

        if (researchStatus.status === "completed") {
          return researchStatus;
        }

        if (researchStatus.status === "failed") {
          throw new FirecrawlError(
            `Research job ${researchStatus.status}. Error: ${researchStatus.error}`,
            500
          );
        }

        if (researchStatus.status !== "processing") {
          break;
        }

        await new Promise(resolve => setTimeout(resolve, 2000));
      }

      return { success: false, error: "Research job terminated unexpectedly" };
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
  }

  /**
   * @deprecated Use asyncDeepResearch() instead
   * Initiates a deep research operation on a given topic without polling.
   * @param topic - The topic to research.
   * @param params - Parameters for the deep research operation.
   * @returns The response containing the research job ID.
   */
  async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` };
      const response: AxiosResponse = await this.postRequest(
        `${this.apiUrl}/v1/deep-research`,
        jsonData,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start deep research");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * @deprecated Use checkDeepResearchStatus() instead
   * Checks the status of a deep research operation.
   * @param id - The ID of the deep research operation.
   * @returns The current status and results of the research operation.
   */
  async __checkDeepResearchStatus(id: string): Promise<DeepResearchStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/deep-research/${id}`,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else if (response.status === 404) {
        throw new FirecrawlError("Deep research job not found", 404);
      } else {
        this.handleError(response, "check deep research status");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Generates LLMs.txt for a given URL and polls until completion.
   * @param url - The URL to generate LLMs.txt from.
   * @param params - Parameters for the LLMs.txt generation operation.
   * @returns The final generation results.
   */
  async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
    try {
      const response = await this.asyncGenerateLLMsText(url, params);

      if (!response.success || 'error' in response) {
        return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
      }

      if (!response.id) {
        throw new FirecrawlError(`Failed to start LLMs.txt generation. No job ID returned.`, 500);
      }

      const jobId = response.id;
      let generationStatus;

      while (true) {
        generationStatus = await this.checkGenerateLLMsTextStatus(jobId);

        if ('error' in generationStatus && !generationStatus.success) {
          return generationStatus;
        }

        if (generationStatus.status === "completed") {
          return generationStatus;
        }

        if (generationStatus.status === "failed") {
          throw new FirecrawlError(
            `LLMs.txt generation job ${generationStatus.status}. Error: ${generationStatus.error}`,
            500
          );
        }

        if (generationStatus.status !== "processing") {
          break;
        }

        await new Promise(resolve => setTimeout(resolve, 2000));
      }

      return { success: false, error: "LLMs.txt generation job terminated unexpectedly" };
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
    }
  }
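
  // Minimal usage sketch (the URL and the `maxUrls` parameter are
  // illustrative; `maxUrls` is assumed to be a valid GenerateLLMsTextParams
  // field):
  //
  //   const generated = await app.generateLLMsText("https://example.com", { maxUrls: 10 });
  //   if (generated.success) console.log(generated); // generated llms.txt content is on the response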

  /**
   * Initiates an LLMs.txt generation operation without polling.
   * @param url - The URL to generate LLMs.txt from.
   * @param params - Parameters for the LLMs.txt generation operation.
   * @returns The response containing the generation job ID.
   */
  async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
    try {
      const response: AxiosResponse = await this.postRequest(
        `${this.apiUrl}/v1/llmstxt`,
        jsonData,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start LLMs.txt generation");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Checks the status of an LLMs.txt generation operation.
   * @param id - The ID of the LLMs.txt generation operation.
   * @returns The current status and results of the generation operation.
   */
  async checkGenerateLLMsTextStatus(id: string): Promise<GenerateLLMsTextStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/llmstxt/${id}`,
        headers
      );

      if (response.status === 200) {
        return response.data;
      } else if (response.status === 404) {
        throw new FirecrawlError("LLMs.txt generation job not found", 404);
      } else {
        this.handleError(response, "check LLMs.txt generation status");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }
}

interface CrawlWatcherEvents {
  document: CustomEvent<FirecrawlDocument<undefined>>,
  done: CustomEvent<{
    status: CrawlStatusResponse["status"];
    data: FirecrawlDocument<undefined>[];
  }>,
  error: CustomEvent<{
    status: CrawlStatusResponse["status"],
    data: FirecrawlDocument<undefined>[],
    error: string,
  }>,
}

export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
  private ws: WebSocket;
  public data: FirecrawlDocument<undefined>[];
  public status: CrawlStatusResponse["status"];
  public id: string;

  constructor(id: string, app: FirecrawlApp) {
    super();
    this.id = id;
    // replace `http` with `ws` (`http://` -> `ws://` and `https://` -> `wss://`)
    const wsUrl = app.apiUrl.replace(/^http/, "ws");
    this.ws = new WebSocket(`${wsUrl}/v1/crawl/${id}`, app.apiKey);
    this.status = "scraping";
    this.data = [];

    type ErrorMessage = {
      type: "error",
      error: string,
    }

    type CatchupMessage = {
      type: "catchup",
      data: CrawlStatusResponse,
    }

    type DocumentMessage = {
      type: "document",
      data: FirecrawlDocument<undefined>,
    }

    type DoneMessage = { type: "done" }

    type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;

    const messageHandler = (msg: Message) => {
      if (msg.type === "done") {
        this.status = "completed";
        this.dispatchTypedEvent("done", new CustomEvent("done", {
          detail: {
            status: this.status,
            data: this.data,
            id: this.id,
          },
        }));
      } else if (msg.type === "error") {
        this.status = "failed";
        this.dispatchTypedEvent("error", new CustomEvent("error", {
          detail: {
            status: this.status,
            data: this.data,
            error: msg.error,
            id: this.id,
          },
        }));
      } else if (msg.type === "catchup") {
        this.status = msg.data.status;
        this.data.push(...(msg.data.data ?? []));
        for (const doc of this.data) {
          this.dispatchTypedEvent("document", new CustomEvent("document", {
            detail: {
              ...doc,
              id: this.id,
            },
          }));
        }
      } else if (msg.type === "document") {
        this.dispatchTypedEvent("document", new CustomEvent("document", {
          detail: {
            ...msg.data,
            id: this.id,
          },
        }));
      }
    }

    this.ws.onmessage = ((ev: MessageEvent) => {
      if (typeof ev.data !== "string") {
        this.ws.close();
        return;
      }
      try {
        const msg = JSON.parse(ev.data) as Message;
        messageHandler(msg);
      } catch (error) {
        console.error("Error on message", error);
      }
    }).bind(this);

    this.ws.onclose = ((ev: CloseEvent) => {
      try {
        const msg = JSON.parse(ev.reason) as Message;
        messageHandler(msg);
      } catch (error) {
        console.error("Error on close", error);
      }
    }).bind(this);

    this.ws.onerror = ((_: Event) => {
      this.status = "failed";
      this.dispatchTypedEvent("error", new CustomEvent("error", {
        detail: {
          status: this.status,
          data: this.data,
          error: "WebSocket error",
          id: this.id,
        },
      }));
    }).bind(this);
  }

  close() {
    this.ws.close();
  }
}
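
// Editor's sketch of CrawlWatcher usage (the crawl-start call is assumed to
// return an object with an `id`; the event payload handling is illustrative):
//
//   const crawl: any = await app.asyncCrawlUrl("https://example.com", { limit: 10 }); // assumed starter method
//   const watcher = new CrawlWatcher(crawl.id, app);
//   watcher.addEventListener("document", (ev) => console.log("doc", ev.detail.metadata?.sourceURL));
//   watcher.addEventListener("done", (ev) => console.log("done with", ev.detail.data.length, "documents"));
//   watcher.addEventListener("error", (ev) => console.error(ev.detail.error));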