firecrawl 1.10.1 → 1.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +105 -22
- package/dist/index.d.cts +40 -9
- package/dist/index.d.ts +40 -9
- package/dist/index.js +105 -22
- package/package.json +1 -1
- package/src/__tests__/index.test.ts +18 -9
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +73 -20
- package/src/index.ts +148 -23
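
In plain terms, 1.11.0 turns the v1 `search` stub into a working method, adds `webhook` and `ignoreInvalidURLs` to the batch-scrape family, and tags every `CrawlWatcher` event with its job id. A minimal sketch of the new call surface, based on the diffs below (the query and option values are illustrative; run inside an async context):

import FirecrawlApp, { SearchResponse } from "firecrawl";

// An API key is only mandatory when targeting the cloud service (api.firecrawl.dev).
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? null });

// search() now issues a real /v1/search request instead of throwing;
// limit defaults to 5, lang to "en", country to "us".
const res: SearchResponse = await app.search("firecrawl", { limit: 3 });
if (res.success) {
  for (const doc of res.data) {
    console.log(doc.title, doc.description); // new optional fields on FirecrawlDocument
  }
}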
package/dist/index.cjs
CHANGED
@@ -49,16 +49,20 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
+  isCloudService(url) {
+    return url.includes("api.firecrawl.dev");
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || "";
+    this.apiUrl = baseUrl;
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -113,13 +117,73 @@ var FirecrawlApp = class {
     return { success: false, error: "Internal server error." };
   }
   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
   async search(query, params) {
-    throw new FirecrawlError("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.", 400);
+    const headers = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`
+    };
+    let jsonData = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 6e4,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] }
+    };
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+      try {
+        schema = (0, import_zod_to_json_schema.zodToJsonSchema)(schema);
+      } catch (error) {
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema
+          }
+        }
+      };
+    }
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
@@ -295,9 +359,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -333,9 +397,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {} };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -363,8 +427,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -593,8 +657,10 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
   ws;
   data;
   status;
+  id;
   constructor(id, app) {
     super();
+    this.id = id;
     this.ws = new import_isows.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -604,7 +670,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
+            data: this.data,
+            id: this.id
           }
         }));
       } else if (msg.type === "error") {
@@ -613,7 +680,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
+            error: msg.error,
+            id: this.id
           }
         }));
       } else if (msg.type === "catchup") {
@@ -621,12 +689,18 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc
+            detail: {
+              ...doc,
+              id: this.id
+            }
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data
+          detail: {
+            ...msg.data,
+            id: this.id
+          }
         }));
       }
     };
@@ -635,12 +709,20 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
       this.ws.close();
       return;
     }
-    const msg = JSON.parse(ev.data);
-    messageHandler(msg);
+    try {
+      const msg = JSON.parse(ev.data);
+      messageHandler(msg);
+    } catch (error) {
+      console.error("Error on message", error);
+    }
   }).bind(this);
   this.ws.onclose = ((ev) => {
-    const msg = JSON.parse(ev.reason);
-    messageHandler(msg);
+    try {
+      const msg = JSON.parse(ev.reason);
+      messageHandler(msg);
+    } catch (error) {
+      console.error("Error on close", error);
+    }
   }).bind(this);
   this.ws.onerror = ((_) => {
     this.status = "failed";
@@ -648,7 +730,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
       detail: {
         status: this.status,
         data: this.data,
-        error: "WebSocket error"
+        error: "WebSocket error",
+        id: this.id
       }
     }));
   }).bind(this);
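
One behavioral consequence of the constructor hunk above: the API-key check now fires only when the resolved base URL is the cloud service. A sketch, with the self-hosted URL as a placeholder:

import FirecrawlApp from "firecrawl";

// Cloud: still throws FirecrawlError("No API key provided", 401) when apiKey is missing.
// new FirecrawlApp({ apiKey: null });

// Self-hosted: no key required; apiKey falls back to "".
const selfHosted = new FirecrawlApp({
  apiKey: null,
  apiUrl: "http://localhost:3002", // placeholder for your own deployment
});
console.log(selfHosted.apiUrl); // "http://localhost:3002"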
package/dist/index.d.cts
CHANGED
@@ -64,6 +64,8 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
     screenshot?: string;
     metadata?: FirecrawlDocumentMetadata;
     actions: ActionsSchema;
+    title?: string;
+    description?: string;
 }
 /**
  * Parameters for scraping operations.
@@ -171,6 +173,7 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
+    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -225,10 +228,11 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt
+    prompt?: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
+    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -256,6 +260,31 @@ declare class FirecrawlError extends Error {
     statusCode: number;
     constructor(message: string, statusCode: number);
 }
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+interface SearchParams {
+    limit?: number;
+    tbs?: string;
+    filter?: string;
+    lang?: string;
+    country?: string;
+    location?: string;
+    origin?: string;
+    timeout?: number;
+    scrapeOptions?: ScrapeParams;
+}
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+interface SearchResponse {
+    success: boolean;
+    data: FirecrawlDocument<undefined>[];
+    warning?: string;
+    error?: string;
+}
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -263,6 +292,7 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
+    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -276,12 +306,12 @@ declare class FirecrawlApp {
      */
     scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(url: string, params?: ScrapeParams<T, ActionsSchema>): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse>;
     /**
-     *
+     * Searches using the Firecrawl API and optionally scrapes the results.
      * @param query - The search query string.
-     * @param params -
-     * @returns
+     * @param params - Optional parameters for the search request.
+     * @returns The response from the search operation.
      */
-    search(query: string, params?: any): Promise<any>;
+    search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse>;
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param url - The URL to crawl.
@@ -329,8 +359,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -338,7 +368,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -414,8 +444,9 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
+    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }

-export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
+export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
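
Since `SearchParams` and `SearchResponse` are now exported from the type declarations, a call can be typed end to end; the nested `scrapeOptions` reuses `ScrapeParams`, so search results can be scraped in the same request. A sketch with illustrative values:

import FirecrawlApp, { SearchParams, SearchResponse } from "firecrawl";

const params: SearchParams = {
  limit: 3,
  lang: "en",
  country: "us",
  scrapeOptions: { formats: ["markdown", "links"] },
};

const app = new FirecrawlApp({ apiKey: "fc-..." }); // placeholder key
const result: SearchResponse = await app.search("web scraping APIs", params);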
package/dist/index.d.ts
CHANGED
@@ -64,6 +64,8 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
     screenshot?: string;
     metadata?: FirecrawlDocumentMetadata;
     actions: ActionsSchema;
+    title?: string;
+    description?: string;
 }
 /**
  * Parameters for scraping operations.
@@ -171,6 +173,7 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
+    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -225,10 +228,11 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt
+    prompt?: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
+    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -256,6 +260,31 @@ declare class FirecrawlError extends Error {
     statusCode: number;
     constructor(message: string, statusCode: number);
 }
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+interface SearchParams {
+    limit?: number;
+    tbs?: string;
+    filter?: string;
+    lang?: string;
+    country?: string;
+    location?: string;
+    origin?: string;
+    timeout?: number;
+    scrapeOptions?: ScrapeParams;
+}
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+interface SearchResponse {
+    success: boolean;
+    data: FirecrawlDocument<undefined>[];
+    warning?: string;
+    error?: string;
+}
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -263,6 +292,7 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
+    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -276,12 +306,12 @@ declare class FirecrawlApp {
      */
     scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(url: string, params?: ScrapeParams<T, ActionsSchema>): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse>;
     /**
-     *
+     * Searches using the Firecrawl API and optionally scrapes the results.
      * @param query - The search query string.
-     * @param params -
-     * @returns
+     * @param params - Optional parameters for the search request.
+     * @returns The response from the search operation.
      */
-    search(query: string, params?: any): Promise<any>;
+    search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse>;
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param url - The URL to crawl.
@@ -329,8 +359,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -338,7 +368,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -414,8 +444,9 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
+    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }

-export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, FirecrawlApp as default };
+export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
package/dist/index.js
CHANGED
@@ -13,16 +13,20 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
+  isCloudService(url) {
+    return url.includes("api.firecrawl.dev");
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || "";
+    this.apiUrl = baseUrl;
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -77,13 +81,73 @@ var FirecrawlApp = class {
     return { success: false, error: "Internal server error." };
   }
   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
   async search(query, params) {
-    throw new FirecrawlError("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.", 400);
+    const headers = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`
+    };
+    let jsonData = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 6e4,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] }
+    };
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema
+          }
+        }
+      };
+    }
+    try {
+      const response = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ""}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
@@ -259,9 +323,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -297,9 +361,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {} };
+    let jsonData = { urls, webhook, ignoreInvalidURLs, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -327,8 +391,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -557,8 +621,10 @@ var CrawlWatcher = class extends TypedEventTarget {
   ws;
   data;
   status;
+  id;
   constructor(id, app) {
     super();
+    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -568,7 +634,8 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
+            data: this.data,
+            id: this.id
           }
         }));
       } else if (msg.type === "error") {
@@ -577,7 +644,8 @@ var CrawlWatcher = class extends TypedEventTarget {
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
+            error: msg.error,
+            id: this.id
           }
         }));
       } else if (msg.type === "catchup") {
@@ -585,12 +653,18 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc
+            detail: {
+              ...doc,
+              id: this.id
+            }
          }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data
+          detail: {
+            ...msg.data,
+            id: this.id
+          }
         }));
       }
     };
@@ -599,12 +673,20 @@ var CrawlWatcher = class extends TypedEventTarget {
      this.ws.close();
      return;
    }
-    const msg = JSON.parse(ev.data);
-    messageHandler(msg);
+    try {
+      const msg = JSON.parse(ev.data);
+      messageHandler(msg);
+    } catch (error) {
+      console.error("Error on message", error);
+    }
   }).bind(this);
   this.ws.onclose = ((ev) => {
-    const msg = JSON.parse(ev.reason);
-    messageHandler(msg);
+    try {
+      const msg = JSON.parse(ev.reason);
+      messageHandler(msg);
+    } catch (error) {
+      console.error("Error on close", error);
+    }
   }).bind(this);
   this.ws.onerror = ((_) => {
     this.status = "failed";
@@ -612,7 +694,8 @@ var CrawlWatcher = class extends TypedEventTarget {
       detail: {
         status: this.status,
         data: this.data,
-        error: "WebSocket error"
+        error: "WebSocket error",
+        id: this.id
       }
     }));
   }).bind(this);
package/package.json
CHANGED

-  "version": "1.10.1",
+  "version": "1.11.0",

package/src/__tests__/index.test.ts
CHANGED

@@ -1,9 +1,9 @@
-import { describe,
-import axios from 'axios';
-import FirecrawlApp from '../index';
+import { describe, expect, jest, test } from '@jest/globals';

-import
+import FirecrawlApp from '../index';
+import axios from 'axios';
 import { join } from 'path';
+import { readFile } from 'fs/promises';

 // Mock jest and set the type
 jest.mock('axios');
@@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
   return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
 }

+const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
+
 describe('the firecrawl JS SDK', () => {

-  test('Should require an API key', async () => {
-    expect(() => {
-      new FirecrawlApp({ apiKey: undefined });
-    }).toThrow('No API key provided');
-
+  test('Should require an API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).toThrow('No API key provided');
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });

   test('Should return scraped data from a /scrape API call', async () => {
package/src/__tests__/v1/e2e_withAuth/index.test.ts
CHANGED

@@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
 const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";

 describe('FirecrawlApp E2E Tests', () => {
-  test.concurrent('should throw error for no API key', async () => {
-    expect(() => {
-      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
-    }).toThrow("No API key provided");
+  test.concurrent('should throw error for no API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).toThrow("No API key provided");
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });

   test.concurrent('should throw error for invalid API key on scrape', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });

   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should throw error for invalid API key on crawl', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to crawl URL. Status code: 401");
-  });
-
-  test.concurrent('should throw error for blocklisted URL on crawl', async () => {
-    const blocklistedUrl = "https://twitter.com/fake-test";
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });

   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 60000); // 60 seconds timeout

   test.concurrent('should throw error for invalid API key on map', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to map URL. Status code: 401");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });

   test.concurrent('should throw error for blocklisted URL on map', async () => {
@@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response for valid map', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();

     expect(response.links?.length).toBeGreaterThan(0);
@@ -365,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(filteredLinks?.length).toBeGreaterThan(0);
   }, 30000); // 30 seconds timeout

-  test.concurrent('should throw NotImplementedError for search on v1', async () => {
+
+
+  test('should search with string query', async () => {
     const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-    await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
+    const response = await app.search("firecrawl");
+    expect(response.success).toBe(true);
+    console.log(response.data);
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]?.markdown).toBeDefined();
+    expect(response.data?.[0]?.metadata).toBeDefined();
+    expect(response.data?.[0]?.metadata?.title).toBeDefined();
+    expect(response.data?.[0]?.metadata?.description).toBeDefined();
+  });
+
+  test('should search with params object', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
+    const response = await app.search("firecrawl", {
+      limit: 3,
+      lang: 'en',
+      country: 'us',
+      scrapeOptions: {
+        formats: ['markdown', 'html', 'links'],
+        onlyMainContent: true
+      }
+    });
+    expect(response.success).toBe(true);
+    expect(response.data.length).toBeLessThanOrEqual(3);
+    for (const doc of response.data) {
+      expect(doc.markdown).toBeDefined();
+      expect(doc.html).toBeDefined();
+      expect(doc.links).toBeDefined();
+      expect(doc.metadata).toBeDefined();
+      expect(doc.metadata?.title).toBeDefined();
+      expect(doc.metadata?.description).toBeDefined();
+    }
+  });
+
+  test('should handle invalid API key for search', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
+    await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
   });
 });
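
The batch-scrape changes exercised above are easiest to read as a call sketch: `webhook` and `ignoreInvalidURLs` are plain positional arguments after `idempotencyKey`, and the async response can now carry `invalidURLs`. URLs and options here are illustrative:

import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." }); // placeholder key

const job = await app.asyncBatchScrapeUrls(
  ["https://example.com", "not-a-url"],
  { formats: ["markdown"] },
  undefined, // idempotencyKey
  undefined, // webhook
  true       // ignoreInvalidURLs: skip malformed entries instead of failing the job
);

if (job.success) {
  console.log(job.id);          // poll or watch this job
  console.log(job.invalidURLs); // e.g. ["not-a-url"]
}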
package/src/index.ts
CHANGED
@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
   actions: ActionsSchema;
+  // v1 search only
+  title?: string;
+  description?: string;
 }

 /**
@@ -183,6 +186,7 @@ export interface BatchScrapeResponse {
   url?: string;
   success: true;
   error?: string;
+  invalidURLs?: string[];
 }

 /**
@@ -242,10 +246,11 @@ export interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-  prompt
+  prompt?: string;
   schema?: LLMSchema;
   systemPrompt?: string;
   allowExternalLinks?: boolean;
+  includeSubdomains?: boolean;
 }

 /**
@@ -280,6 +285,33 @@ export class FirecrawlError extends Error {
   }
 }

+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -288,17 +320,23 @@ export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;

+  private isCloudService(url: string): boolean {
+    return url.includes('api.firecrawl.dev');
+  }
+
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }

-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
   }

   /**
@@ -361,16 +399,80 @@ export default class FirecrawlApp {
   }

   /**
-   *
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params -
-   * @returns
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
-  async search(
-    query: string,
-    params?: any
-  ): Promise<any> {
-    throw new FirecrawlError("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.", 400);
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument<any>[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }

   /**
@@ -576,9 +678,10 @@ export default class FirecrawlApp {
     pollInterval: number = 2,
     idempotencyKey?: string,
     webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...params };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;

@@ -621,10 +724,12 @@ export default class FirecrawlApp {
   async asyncBatchScrapeUrls(
     urls: string[],
     params?: ScrapeParams,
-    idempotencyKey?: string
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...(params ?? {}) };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -657,8 +762,10 @@ export default class FirecrawlApp {
     urls: string[],
     params?: ScrapeParams,
     idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
+    ignoreInvalidURLs?: boolean,
   ) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);

     if (crawl.success && crawl.id) {
       const id = crawl.id;
@@ -932,9 +1039,11 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
   private ws: WebSocket;
   public data: FirecrawlDocument<undefined>[];
   public status: CrawlStatusResponse["status"];
+  public id: string;

   constructor(id: string, app: FirecrawlApp) {
     super();
+    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -965,6 +1074,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           detail: {
             status: this.status,
             data: this.data,
+            id: this.id,
           },
         }));
       } else if (msg.type === "error") {
@@ -974,6 +1084,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
             status: this.status,
             data: this.data,
             error: msg.error,
+            id: this.id,
           },
         }));
       } else if (msg.type === "catchup") {
@@ -981,12 +1092,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         this.data.push(...(msg.data.data ?? []));
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: doc,
+            detail: {
+              ...doc,
+              id: this.id,
+            },
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: msg.data,
+          detail: {
+            ...msg.data,
+            id: this.id,
+          },
         }));
       }
     }
@@ -996,14 +1113,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         this.ws.close();
         return;
       }
-
-      const msg = JSON.parse(ev.data) as Message;
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.data) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on message", error);
+      }
     }).bind(this);

     this.ws.onclose = ((ev: CloseEvent) => {
-      const msg = JSON.parse(ev.reason) as Message;
-      messageHandler(msg);
+      try {
+        const msg = JSON.parse(ev.reason) as Message;
+        messageHandler(msg);
+      } catch (error) {
+        console.error("Error on close", error);
+      }
     }).bind(this);

     this.ws.onerror = ((_: Event) => {
@@ -1013,6 +1137,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
             status: this.status,
             data: this.data,
             error: "WebSocket error",
+            id: this.id,
           },
         }));
       }).bind(this);
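
Because `CrawlWatcher` now keeps the job `id` and attaches it to every `done`, `error`, and `document` event detail, a single set of listeners can multiplex several watched jobs. A sketch (URLs illustrative):

import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-..." }); // placeholder key
const watcher = await app.batchScrapeUrlsAndWatch(
  ["https://example.com"],
  { formats: ["markdown"] }
);

watcher.addEventListener("document", (ev) => {
  // ev.detail now carries the job id alongside the document fields.
  console.log(`[${ev.detail.id}]`, ev.detail.metadata?.sourceURL);
});

watcher.addEventListener("done", (ev) => {
  console.log(`job ${ev.detail.id} finished with ${ev.detail.data.length} documents`);
});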