firecrawl 1.10.1 → 1.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +105 -25
- package/dist/index.d.cts +40 -9
- package/dist/index.d.ts +40 -9
- package/dist/index.js +105 -25
- package/package.json +1 -1
- package/src/__tests__/index.test.ts +18 -9
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +73 -20
- package/src/index.ts +148 -27
package/dist/index.cjs
CHANGED
@@ -49,16 +49,20 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
+  isCloudService(url) {
+    return url.includes("api.firecrawl.dev");
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || "";
+    this.apiUrl = baseUrl;
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -113,13 +117,73 @@ var FirecrawlApp = class {
     return { success: false, error: "Internal server error." };
   }
   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
   async search(query, params) {
-
+    const headers = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`
+    };
+    let jsonData = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 6e4,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] }
+    };
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+      try {
+        schema = (0, import_zod_to_json_schema.zodToJsonSchema)(schema);
+      } catch (error) {
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema
+          }
+        }
+      };
+    }
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
@@ -295,9 +359,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -333,9 +397,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {} };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -363,8 +427,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -428,9 +492,6 @@ var FirecrawlApp = class {
    */
   async extract(urls, params) {
     const headers = this.prepareHeaders();
-    if (!params?.prompt) {
-      throw new FirecrawlError("Prompt is required", 400);
-    }
     let jsonData = { urls, ...params };
     let jsonSchema;
     try {
@@ -593,8 +654,10 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
   ws;
   data;
   status;
+  id;
   constructor(id, app) {
     super();
+    this.id = id;
     this.ws = new import_isows.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -604,7 +667,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
+            data: this.data,
+            id: this.id
           }
         }));
       } else if (msg.type === "error") {
@@ -613,7 +677,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
+            error: msg.error,
+            id: this.id
           }
         }));
       } else if (msg.type === "catchup") {
@@ -621,12 +686,18 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc
+            detail: {
+              ...doc,
+              id: this.id
+            }
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data
+          detail: {
+            ...msg.data,
+            id: this.id
+          }
         }));
       }
     };
@@ -635,12 +706,20 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.ws.close();
         return;
       }
-      const msg = JSON.parse(ev.data);
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.data);
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on message", error);
+      }
     }).bind(this);
     this.ws.onclose = ((ev) => {
-      const msg = JSON.parse(ev.reason);
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.reason);
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on close", error);
+      }
     }).bind(this);
     this.ws.onerror = ((_) => {
       this.status = "failed";
@@ -648,7 +727,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         detail: {
           status: this.status,
           data: this.data,
-          error: "WebSocket error"
+          error: "WebSocket error",
+          id: this.id
         }
       }));
     }).bind(this);
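
The constructor rework above only enforces the API key when the resolved base URL points at the hosted service. A minimal sketch of the resulting initialization behavior; the self-hosted URL and key value are illustrative placeholders:

```ts
import FirecrawlApp from "firecrawl";

// Cloud service: an API key is still mandatory, as before.
const cloud = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });

// Self-hosted deployment (placeholder URL): no key required, because
// isCloudService("http://localhost:3002") returns false.
const selfHosted = new FirecrawlApp({ apiUrl: "http://localhost:3002" });
```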
package/dist/index.d.cts
CHANGED
@@ -64,6 +64,8 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
     screenshot?: string;
     metadata?: FirecrawlDocumentMetadata;
     actions: ActionsSchema;
+    title?: string;
+    description?: string;
 }
 /**
  * Parameters for scraping operations.
@@ -171,6 +173,7 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
+    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -225,10 +228,11 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt: string;
+    prompt?: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
+    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -256,6 +260,31 @@ declare class FirecrawlError extends Error {
     statusCode: number;
     constructor(message: string, statusCode: number);
 }
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+interface SearchParams {
+    limit?: number;
+    tbs?: string;
+    filter?: string;
+    lang?: string;
+    country?: string;
+    location?: string;
+    origin?: string;
+    timeout?: number;
+    scrapeOptions?: ScrapeParams;
+}
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+interface SearchResponse {
+    success: boolean;
+    data: FirecrawlDocument<undefined>[];
+    warning?: string;
+    error?: string;
+}
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -263,6 +292,7 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
+    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -276,12 +306,12 @@ declare class FirecrawlApp {
     */
    scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(url: string, params?: ScrapeParams<T, ActionsSchema>): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse>;
     /**
-     *
+     * Searches using the Firecrawl API and optionally scrapes the results.
      * @param query - The search query string.
-     * @param params -
-     * @returns
+     * @param params - Optional parameters for the search request.
+     * @returns The response from the search operation.
      */
-    search(query: string, params?: any): Promise<any>;
+    search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse>;
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param url - The URL to crawl.
@@ -329,8 +359,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -338,7 +368,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -414,8 +444,9 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
+    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
 
-export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
+export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
package/dist/index.d.ts
CHANGED
@@ -64,6 +64,8 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
     screenshot?: string;
     metadata?: FirecrawlDocumentMetadata;
     actions: ActionsSchema;
+    title?: string;
+    description?: string;
 }
 /**
  * Parameters for scraping operations.
@@ -171,6 +173,7 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
+    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -225,10 +228,11 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt: string;
+    prompt?: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
+    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -256,6 +260,31 @@ declare class FirecrawlError extends Error {
     statusCode: number;
     constructor(message: string, statusCode: number);
 }
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+interface SearchParams {
+    limit?: number;
+    tbs?: string;
+    filter?: string;
+    lang?: string;
+    country?: string;
+    location?: string;
+    origin?: string;
+    timeout?: number;
+    scrapeOptions?: ScrapeParams;
+}
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+interface SearchResponse {
+    success: boolean;
+    data: FirecrawlDocument<undefined>[];
+    warning?: string;
+    error?: string;
+}
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -263,6 +292,7 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
+    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -276,12 +306,12 @@ declare class FirecrawlApp {
     */
    scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(url: string, params?: ScrapeParams<T, ActionsSchema>): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse>;
     /**
-     *
+     * Searches using the Firecrawl API and optionally scrapes the results.
      * @param query - The search query string.
-     * @param params -
-     * @returns
+     * @param params - Optional parameters for the search request.
+     * @returns The response from the search operation.
      */
-    search(query: string, params?: any): Promise<any>;
+    search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse>;
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param url - The URL to crawl.
@@ -329,8 +359,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -338,7 +368,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -414,8 +444,9 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
+    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
 
-export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
+export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
package/dist/index.js
CHANGED
@@ -13,16 +13,20 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
+  isCloudService(url) {
+    return url.includes("api.firecrawl.dev");
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || "";
+    this.apiUrl = baseUrl;
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -77,13 +81,73 @@ var FirecrawlApp = class {
     return { success: false, error: "Internal server error." };
   }
   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
   async search(query, params) {
-
+    const headers = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`
+    };
+    let jsonData = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 6e4,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] }
+    };
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema
+          }
+        }
+      };
+    }
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
@@ -259,9 +323,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -297,9 +361,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {} };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -327,8 +391,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -392,9 +456,6 @@ var FirecrawlApp = class {
    */
   async extract(urls, params) {
     const headers = this.prepareHeaders();
-    if (!params?.prompt) {
-      throw new FirecrawlError("Prompt is required", 400);
-    }
     let jsonData = { urls, ...params };
     let jsonSchema;
     try {
@@ -557,8 +618,10 @@ var CrawlWatcher = class extends TypedEventTarget {
   ws;
   data;
   status;
+  id;
   constructor(id, app) {
     super();
+    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -568,7 +631,8 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
+            data: this.data,
+            id: this.id
           }
         }));
       } else if (msg.type === "error") {
@@ -577,7 +641,8 @@ var CrawlWatcher = class extends TypedEventTarget {
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
+            error: msg.error,
+            id: this.id
          }
        }));
      } else if (msg.type === "catchup") {
@@ -585,12 +650,18 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc
+            detail: {
+              ...doc,
+              id: this.id
+            }
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data
+          detail: {
+            ...msg.data,
+            id: this.id
+          }
         }));
       }
     };
@@ -599,12 +670,20 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.ws.close();
         return;
       }
-      const msg = JSON.parse(ev.data);
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.data);
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on message", error);
+      }
     }).bind(this);
     this.ws.onclose = ((ev) => {
-      const msg = JSON.parse(ev.reason);
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.reason);
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on close", error);
+      }
     }).bind(this);
     this.ws.onerror = ((_) => {
       this.status = "failed";
@@ -612,7 +691,8 @@ var CrawlWatcher = class extends TypedEventTarget {
         detail: {
           status: this.status,
           data: this.data,
-          error: "WebSocket error"
+          error: "WebSocket error",
+          id: this.id
         }
       }));
     }).bind(this);
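
Every `CrawlWatcher` event detail now carries the job `id`, so one handler can serve several concurrent watchers. A sketch against the new event shape; `crawlUrlAndWatch` is the SDK's existing watcher entry point, and the URL and limit are illustrative:

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });
const watcher = await app.crawlUrlAndWatch("https://firecrawl.dev", { limit: 5 });

watcher.addEventListener("document", (ev) => {
  // detail spreads the scraped document and adds the watcher's job id
  console.log(`[${ev.detail.id}] document received`);
});

watcher.addEventListener("done", (ev) => {
  console.log(`[${ev.detail.id}] finished with ${ev.detail.data.length} documents`);
});

watcher.addEventListener("error", (ev) => {
  console.error(`[${ev.detail.id}] failed:`, ev.detail.error);
});
```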
package/package.json
CHANGED
-  "version": "1.10.1",
+  "version": "1.11.1",

package/src/__tests__/index.test.ts
CHANGED

@@ -1,9 +1,9 @@
-import { describe,
-import axios from 'axios';
-import FirecrawlApp from '../index';
+import { describe, expect, jest, test } from '@jest/globals';
 
-import
+import FirecrawlApp from '../index';
+import axios from 'axios';
 import { join } from 'path';
+import { readFile } from 'fs/promises';
 
 // Mock jest and set the type
 jest.mock('axios');
@@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
   return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
 }
 
+const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
+
 describe('the firecrawl JS SDK', () => {
 
-  test('Should require an API key
-
-
-
-
+  test('Should require an API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).toThrow('No API key provided');
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });
 
   test('Should return scraped data from a /scrape API call', async () => {
package/src/__tests__/v1/e2e_withAuth/index.test.ts
CHANGED

@@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
 const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
 
 describe('FirecrawlApp E2E Tests', () => {
-  test.concurrent('should throw error for no API key', async () => {
-
-
-
+  test.concurrent('should throw error for no API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).toThrow("No API key provided");
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for invalid API key on scrape', async () => {
-
-
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on crawl', async () => {
-
-
-
-
-
-
-
-    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should throw error for invalid API key on map', async () => {
-
-
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for blocklisted URL on map', async () => {
@@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid map', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();
 
     expect(response.links?.length).toBeGreaterThan(0);
@@ -365,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(filteredLinks?.length).toBeGreaterThan(0);
   }, 30000); // 30 seconds timeout
 
-
+
+
+  test('should search with string query', async () => {
     const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-    await
+    const response = await app.search("firecrawl");
+    expect(response.success).toBe(true);
+    console.log(response.data);
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]?.markdown).toBeDefined();
+    expect(response.data?.[0]?.metadata).toBeDefined();
+    expect(response.data?.[0]?.metadata?.title).toBeDefined();
+    expect(response.data?.[0]?.metadata?.description).toBeDefined();
+  });
+
+  test('should search with params object', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
+    const response = await app.search("firecrawl", {
+      limit: 3,
+      lang: 'en',
+      country: 'us',
+      scrapeOptions: {
+        formats: ['markdown', 'html', 'links'],
+        onlyMainContent: true
+      }
+    });
+    expect(response.success).toBe(true);
+    expect(response.data.length).toBeLessThanOrEqual(3);
+    for (const doc of response.data) {
+      expect(doc.markdown).toBeDefined();
+      expect(doc.html).toBeDefined();
+      expect(doc.links).toBeDefined();
+      expect(doc.metadata).toBeDefined();
+      expect(doc.metadata?.title).toBeDefined();
+      expect(doc.metadata?.description).toBeDefined();
+    }
+  });
+
+  test('should handle invalid API key for search', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
+    await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
   });
 });
package/src/index.ts
CHANGED
@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
   actions: ActionsSchema;
+  // v1 search only
+  title?: string;
+  description?: string;
 }
 
 /**
@@ -183,6 +186,7 @@ export interface BatchScrapeResponse {
   url?: string;
   success: true;
   error?: string;
+  invalidURLs?: string[];
 }
 
 /**
@@ -242,10 +246,11 @@ export interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-  prompt: string;
+  prompt?: string;
   schema?: LLMSchema;
   systemPrompt?: string;
   allowExternalLinks?: boolean;
+  includeSubdomains?: boolean;
 }
 
 /**
@@ -280,6 +285,33 @@ export class FirecrawlError extends Error {
   }
 }
 
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -288,17 +320,23 @@ export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;
 
+  private isCloudService(url: string): boolean {
+    return url.includes('api.firecrawl.dev');
+  }
+
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
 
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
   }
 
   /**
@@ -361,16 +399,80 @@ export default class FirecrawlApp {
   }
 
   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
-  async search(
-
-
-
-
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument<any>[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
 
   /**
@@ -576,9 +678,10 @@ export default class FirecrawlApp {
     pollInterval: number = 2,
     idempotencyKey?: string,
     webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...params };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
 
@@ -621,10 +724,12 @@ export default class FirecrawlApp {
   async asyncBatchScrapeUrls(
     urls: string[],
     params?: ScrapeParams,
-    idempotencyKey?: string
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...(params ?? {}) };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -657,8 +762,10 @@ export default class FirecrawlApp {
     urls: string[],
     params?: ScrapeParams,
     idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
 
     if (crawl.success && crawl.id) {
       const id = crawl.id;
@@ -728,10 +835,6 @@ export default class FirecrawlApp {
   async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
     const headers = this.prepareHeaders();
 
-    if (!params?.prompt) {
-      throw new FirecrawlError("Prompt is required", 400);
-    }
-
     let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
     let jsonSchema: any;
     try {
@@ -932,9 +1035,11 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
   private ws: WebSocket;
   public data: FirecrawlDocument<undefined>[];
   public status: CrawlStatusResponse["status"];
+  public id: string;
 
   constructor(id: string, app: FirecrawlApp) {
     super();
+    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -965,6 +1070,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           detail: {
             status: this.status,
             data: this.data,
+            id: this.id,
           },
         }));
       } else if (msg.type === "error") {
@@ -974,6 +1080,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
            status: this.status,
            data: this.data,
            error: msg.error,
+           id: this.id,
          },
        }));
      } else if (msg.type === "catchup") {
@@ -981,12 +1088,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         this.data.push(...(msg.data.data ?? []));
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc,
+            detail: {
+              ...doc,
+              id: this.id,
+            },
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data,
+          detail: {
+            ...msg.data,
+            id: this.id,
+          },
         }));
       }
     }
@@ -996,14 +1109,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         this.ws.close();
         return;
       }
-      const msg = JSON.parse(ev.data) as Message;
-      messageHandler(msg);
-
+      try {
+        const msg = JSON.parse(ev.data) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on message", error);
+      }
     }).bind(this);
 
     this.ws.onclose = ((ev: CloseEvent) => {
-      const msg = JSON.parse(ev.reason) as Message;
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.reason) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on close", error);
+      }
     }).bind(this);
 
     this.ws.onerror = ((_: Event) => {
@@ -1013,6 +1133,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         status: this.status,
         data: this.data,
         error: "WebSocket error",
+        id: this.id,
       },
     }));
   }).bind(this);
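
Dropping the client-side "Prompt is required" guard, together with making `prompt` optional on `ExtractParams`, permits schema-only extraction, with any remaining validation left to the API. A sketch using a hypothetical Zod schema:

```ts
import FirecrawlApp from "firecrawl";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });

// No prompt: the schema alone declares the fields to extract.
const result = await app.extract(["https://firecrawl.dev"], {
  schema: z.object({
    title: z.string(),
    summary: z.string(),
  }),
});

if (result.success) {
  console.log(result.data); // { title: ..., summary: ... }
}
```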