firecrawl 1.17.0 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +22 -191
- package/dist/index.d.cts +5 -69
- package/dist/index.d.ts +5 -69
- package/dist/index.js +22 -191
- package/package.json +1 -1
- package/src/index.ts +179 -0
- package/dump.rdb +0 -0
package/dist/index.cjs
CHANGED
|
@@ -42,11 +42,9 @@ var import_isows = require("isows");
|
|
|
42
42
|
var import_typescript_event_target = require("typescript-event-target");
|
|
43
43
|
var FirecrawlError = class extends Error {
|
|
44
44
|
statusCode;
|
|
45
|
-
|
|
46
|
-
constructor(message, statusCode, details) {
|
|
45
|
+
constructor(message, statusCode) {
|
|
47
46
|
super(message);
|
|
48
47
|
this.statusCode = statusCode;
|
|
49
|
-
this.details = details;
|
|
50
48
|
}
|
|
51
49
|
};
|
|
52
50
|
var FirecrawlApp = class {
|
|
@@ -93,20 +91,6 @@ var FirecrawlApp = class {
|
|
|
93
91
|
}
|
|
94
92
|
};
|
|
95
93
|
}
|
|
96
|
-
if (jsonData?.jsonOptions?.schema) {
|
|
97
|
-
let schema = jsonData.jsonOptions.schema;
|
|
98
|
-
try {
|
|
99
|
-
schema = (0, import_zod_to_json_schema.zodToJsonSchema)(schema);
|
|
100
|
-
} catch (error) {
|
|
101
|
-
}
|
|
102
|
-
jsonData = {
|
|
103
|
-
...jsonData,
|
|
104
|
-
jsonOptions: {
|
|
105
|
-
...jsonData.jsonOptions,
|
|
106
|
-
schema
|
|
107
|
-
}
|
|
108
|
-
};
|
|
109
|
-
}
|
|
110
94
|
try {
|
|
111
95
|
const response = await import_axios.default.post(
|
|
112
96
|
this.apiUrl + `/v1/scrape`,
|
|
@@ -261,26 +245,16 @@ var FirecrawlApp = class {
|
|
|
261
245
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
262
246
|
* @param id - The ID of the crawl operation.
|
|
263
247
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
264
|
-
* @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
265
|
-
* @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
|
|
266
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
267
248
|
* @returns The response containing the job status.
|
|
268
249
|
*/
|
|
269
|
-
async checkCrawlStatus(id, getAllData = false
|
|
250
|
+
async checkCrawlStatus(id, getAllData = false) {
|
|
270
251
|
if (!id) {
|
|
271
252
|
throw new FirecrawlError("No crawl ID provided", 400);
|
|
272
253
|
}
|
|
273
254
|
const headers = this.prepareHeaders();
|
|
274
|
-
const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/crawl/${id}`);
|
|
275
|
-
if (skip !== void 0) {
|
|
276
|
-
targetURL.searchParams.set("skip", skip.toString());
|
|
277
|
-
}
|
|
278
|
-
if (limit !== void 0) {
|
|
279
|
-
targetURL.searchParams.set("limit", limit.toString());
|
|
280
|
-
}
|
|
281
255
|
try {
|
|
282
256
|
const response = await this.getRequest(
|
|
283
|
-
|
|
257
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
284
258
|
headers
|
|
285
259
|
);
|
|
286
260
|
if (response.status === 200) {
|
|
@@ -305,7 +279,6 @@ var FirecrawlApp = class {
|
|
|
305
279
|
total: response.data.total,
|
|
306
280
|
completed: response.data.completed,
|
|
307
281
|
creditsUsed: response.data.creditsUsed,
|
|
308
|
-
next: getAllData ? void 0 : response.data.next,
|
|
309
282
|
expiresAt: new Date(response.data.expiresAt),
|
|
310
283
|
data: allData
|
|
311
284
|
};
|
|
@@ -328,28 +301,6 @@ var FirecrawlApp = class {
|
|
|
328
301
|
}
|
|
329
302
|
return { success: false, error: "Internal server error." };
|
|
330
303
|
}
|
|
331
|
-
/**
|
|
332
|
-
* Returns information about crawl errors.
|
|
333
|
-
* @param id - The ID of the crawl operation.
|
|
334
|
-
* @returns Information about crawl errors.
|
|
335
|
-
*/
|
|
336
|
-
async checkCrawlErrors(id) {
|
|
337
|
-
const headers = this.prepareHeaders();
|
|
338
|
-
try {
|
|
339
|
-
const response = await this.deleteRequest(
|
|
340
|
-
`${this.apiUrl}/v1/crawl/${id}/errors`,
|
|
341
|
-
headers
|
|
342
|
-
);
|
|
343
|
-
if (response.status === 200) {
|
|
344
|
-
return response.data;
|
|
345
|
-
} else {
|
|
346
|
-
this.handleError(response, "check crawl errors");
|
|
347
|
-
}
|
|
348
|
-
} catch (error) {
|
|
349
|
-
throw new FirecrawlError(error.message, 500);
|
|
350
|
-
}
|
|
351
|
-
return { success: false, error: "Internal server error." };
|
|
352
|
-
}
|
|
353
304
|
/**
|
|
354
305
|
* Cancels a crawl job using the Firecrawl API.
|
|
355
306
|
* @param id - The ID of the crawl operation.
|
|
@@ -438,20 +389,6 @@ var FirecrawlApp = class {
|
|
|
438
389
|
}
|
|
439
390
|
};
|
|
440
391
|
}
|
|
441
|
-
if (jsonData?.jsonOptions?.schema) {
|
|
442
|
-
let schema = jsonData.jsonOptions.schema;
|
|
443
|
-
try {
|
|
444
|
-
schema = (0, import_zod_to_json_schema.zodToJsonSchema)(schema);
|
|
445
|
-
} catch (error) {
|
|
446
|
-
}
|
|
447
|
-
jsonData = {
|
|
448
|
-
...jsonData,
|
|
449
|
-
jsonOptions: {
|
|
450
|
-
...jsonData.jsonOptions,
|
|
451
|
-
schema
|
|
452
|
-
}
|
|
453
|
-
};
|
|
454
|
-
}
|
|
455
392
|
try {
|
|
456
393
|
const response = await this.postRequest(
|
|
457
394
|
this.apiUrl + `/v1/batch/scrape`,
|
|
@@ -515,26 +452,16 @@ var FirecrawlApp = class {
|
|
|
515
452
|
* Checks the status of a batch scrape job using the Firecrawl API.
|
|
516
453
|
* @param id - The ID of the batch scrape operation.
|
|
517
454
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
518
|
-
* @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
519
|
-
* @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
|
|
520
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
521
455
|
* @returns The response containing the job status.
|
|
522
456
|
*/
|
|
523
|
-
async checkBatchScrapeStatus(id, getAllData = false
|
|
457
|
+
async checkBatchScrapeStatus(id, getAllData = false) {
|
|
524
458
|
if (!id) {
|
|
525
459
|
throw new FirecrawlError("No batch scrape ID provided", 400);
|
|
526
460
|
}
|
|
527
461
|
const headers = this.prepareHeaders();
|
|
528
|
-
const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/batch/scrape/${id}`);
|
|
529
|
-
if (skip !== void 0) {
|
|
530
|
-
targetURL.searchParams.set("skip", skip.toString());
|
|
531
|
-
}
|
|
532
|
-
if (limit !== void 0) {
|
|
533
|
-
targetURL.searchParams.set("limit", limit.toString());
|
|
534
|
-
}
|
|
535
462
|
try {
|
|
536
463
|
const response = await this.getRequest(
|
|
537
|
-
|
|
464
|
+
`${this.apiUrl}/v1/batch/scrape/${id}`,
|
|
538
465
|
headers
|
|
539
466
|
);
|
|
540
467
|
if (response.status === 200) {
|
|
@@ -559,7 +486,6 @@ var FirecrawlApp = class {
|
|
|
559
486
|
total: response.data.total,
|
|
560
487
|
completed: response.data.completed,
|
|
561
488
|
creditsUsed: response.data.creditsUsed,
|
|
562
|
-
next: getAllData ? void 0 : response.data.next,
|
|
563
489
|
expiresAt: new Date(response.data.expiresAt),
|
|
564
490
|
data: allData
|
|
565
491
|
};
|
|
@@ -582,28 +508,6 @@ var FirecrawlApp = class {
|
|
|
582
508
|
}
|
|
583
509
|
return { success: false, error: "Internal server error." };
|
|
584
510
|
}
|
|
585
|
-
/**
|
|
586
|
-
* Returns information about batch scrape errors.
|
|
587
|
-
* @param id - The ID of the batch scrape operation.
|
|
588
|
-
* @returns Information about batch scrape errors.
|
|
589
|
-
*/
|
|
590
|
-
async checkBatchScrapeErrors(id) {
|
|
591
|
-
const headers = this.prepareHeaders();
|
|
592
|
-
try {
|
|
593
|
-
const response = await this.deleteRequest(
|
|
594
|
-
`${this.apiUrl}/v1/batch/scrape/${id}/errors`,
|
|
595
|
-
headers
|
|
596
|
-
);
|
|
597
|
-
if (response.status === 200) {
|
|
598
|
-
return response.data;
|
|
599
|
-
} else {
|
|
600
|
-
this.handleError(response, "check batch scrape errors");
|
|
601
|
-
}
|
|
602
|
-
} catch (error) {
|
|
603
|
-
throw new FirecrawlError(error.message, 500);
|
|
604
|
-
}
|
|
605
|
-
return { success: false, error: "Internal server error." };
|
|
606
|
-
}
|
|
607
511
|
/**
|
|
608
512
|
* Extracts information from URLs using the Firecrawl API.
|
|
609
513
|
* Currently in Beta. Expect breaking changes on future minor versions.
|
|
@@ -626,66 +530,6 @@ var FirecrawlApp = class {
|
|
|
626
530
|
} catch (error) {
|
|
627
531
|
throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
|
|
628
532
|
}
|
|
629
|
-
try {
|
|
630
|
-
const response = await this.postRequest(
|
|
631
|
-
this.apiUrl + `/v1/extract`,
|
|
632
|
-
{ ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
|
|
633
|
-
headers
|
|
634
|
-
);
|
|
635
|
-
if (response.status === 200) {
|
|
636
|
-
const jobId = response.data.id;
|
|
637
|
-
let extractStatus;
|
|
638
|
-
do {
|
|
639
|
-
const statusResponse = await this.getRequest(
|
|
640
|
-
`${this.apiUrl}/v1/extract/${jobId}`,
|
|
641
|
-
headers
|
|
642
|
-
);
|
|
643
|
-
extractStatus = statusResponse.data;
|
|
644
|
-
if (extractStatus.status === "completed") {
|
|
645
|
-
if (extractStatus.success) {
|
|
646
|
-
return {
|
|
647
|
-
success: true,
|
|
648
|
-
data: extractStatus.data,
|
|
649
|
-
warning: extractStatus.warning,
|
|
650
|
-
error: extractStatus.error,
|
|
651
|
-
sources: extractStatus?.sources || void 0
|
|
652
|
-
};
|
|
653
|
-
} else {
|
|
654
|
-
throw new FirecrawlError(`Failed to extract data. Error: ${extractStatus.error}`, statusResponse.status);
|
|
655
|
-
}
|
|
656
|
-
} else if (extractStatus.status === "failed" || extractStatus.status === "cancelled") {
|
|
657
|
-
throw new FirecrawlError(`Extract job ${extractStatus.status}. Error: ${extractStatus.error}`, statusResponse.status);
|
|
658
|
-
}
|
|
659
|
-
await new Promise((resolve) => setTimeout(resolve, 1e3));
|
|
660
|
-
} while (extractStatus.status !== "completed");
|
|
661
|
-
} else {
|
|
662
|
-
this.handleError(response, "extract");
|
|
663
|
-
}
|
|
664
|
-
} catch (error) {
|
|
665
|
-
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
|
|
666
|
-
}
|
|
667
|
-
return { success: false, error: "Internal server error." };
|
|
668
|
-
}
|
|
669
|
-
/**
|
|
670
|
-
* Initiates an asynchronous extract job for a URL using the Firecrawl API.
|
|
671
|
-
* @param url - The URL to extract data from.
|
|
672
|
-
* @param params - Additional parameters for the extract request.
|
|
673
|
-
* @param idempotencyKey - Optional idempotency key for the request.
|
|
674
|
-
* @returns The response from the extract operation.
|
|
675
|
-
*/
|
|
676
|
-
async asyncExtract(urls, params, idempotencyKey) {
|
|
677
|
-
const headers = this.prepareHeaders(idempotencyKey);
|
|
678
|
-
let jsonData = { urls, ...params };
|
|
679
|
-
let jsonSchema;
|
|
680
|
-
try {
|
|
681
|
-
if (params?.schema instanceof zt.ZodType) {
|
|
682
|
-
jsonSchema = (0, import_zod_to_json_schema.zodToJsonSchema)(params.schema);
|
|
683
|
-
} else {
|
|
684
|
-
jsonSchema = params?.schema;
|
|
685
|
-
}
|
|
686
|
-
} catch (error) {
|
|
687
|
-
throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
|
|
688
|
-
}
|
|
689
533
|
try {
|
|
690
534
|
const response = await this.postRequest(
|
|
691
535
|
this.apiUrl + `/v1/extract`,
|
|
@@ -693,34 +537,24 @@ var FirecrawlApp = class {
|
|
|
693
537
|
headers
|
|
694
538
|
);
|
|
695
539
|
if (response.status === 200) {
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
* @param jobId - The ID of the extract job.
|
|
708
|
-
* @returns The status of the extract job.
|
|
709
|
-
*/
|
|
710
|
-
async getExtractStatus(jobId) {
|
|
711
|
-
try {
|
|
712
|
-
const response = await this.getRequest(
|
|
713
|
-
`${this.apiUrl}/v1/extract/${jobId}`,
|
|
714
|
-
this.prepareHeaders()
|
|
715
|
-
);
|
|
716
|
-
if (response.status === 200) {
|
|
717
|
-
return response.data;
|
|
540
|
+
const responseData = response.data;
|
|
541
|
+
if (responseData.success) {
|
|
542
|
+
return {
|
|
543
|
+
success: true,
|
|
544
|
+
data: responseData.data,
|
|
545
|
+
warning: responseData.warning,
|
|
546
|
+
error: responseData.error
|
|
547
|
+
};
|
|
548
|
+
} else {
|
|
549
|
+
throw new FirecrawlError(`Failed to scrape URL. Error: ${responseData.error}`, response.status);
|
|
550
|
+
}
|
|
718
551
|
} else {
|
|
719
|
-
this.handleError(response, "
|
|
552
|
+
this.handleError(response, "extract");
|
|
720
553
|
}
|
|
721
554
|
} catch (error) {
|
|
722
555
|
throw new FirecrawlError(error.message, 500);
|
|
723
556
|
}
|
|
557
|
+
return { success: false, error: "Internal server error." };
|
|
724
558
|
}
|
|
725
559
|
/**
|
|
726
560
|
* Prepares the headers for an API request.
|
|
@@ -836,13 +670,11 @@ var FirecrawlApp = class {
|
|
|
836
670
|
* @param {string} action - The action being performed when the error occurred.
|
|
837
671
|
*/
|
|
838
672
|
handleError(response, action) {
|
|
839
|
-
if ([
|
|
673
|
+
if ([402, 408, 409, 500].includes(response.status)) {
|
|
840
674
|
const errorMessage = response.data.error || "Unknown error occurred";
|
|
841
|
-
const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : "";
|
|
842
675
|
throw new FirecrawlError(
|
|
843
|
-
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}
|
|
844
|
-
response.status
|
|
845
|
-
response?.data?.details
|
|
676
|
+
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`,
|
|
677
|
+
response.status
|
|
846
678
|
);
|
|
847
679
|
} else {
|
|
848
680
|
throw new FirecrawlError(
|
|
@@ -860,8 +692,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
|
|
|
860
692
|
constructor(id, app) {
|
|
861
693
|
super();
|
|
862
694
|
this.id = id;
|
|
863
|
-
|
|
864
|
-
this.ws = new import_isows.WebSocket(`${wsUrl}/v1/crawl/${id}`, app.apiKey);
|
|
695
|
+
this.ws = new import_isows.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
|
865
696
|
this.status = "scraping";
|
|
866
697
|
this.data = [];
|
|
867
698
|
const messageHandler = (msg) => {
|
package/dist/index.d.cts
CHANGED
|
@@ -61,7 +61,6 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
|
|
|
61
61
|
rawHtml?: string;
|
|
62
62
|
links?: string[];
|
|
63
63
|
extract?: T;
|
|
64
|
-
json?: T;
|
|
65
64
|
screenshot?: string;
|
|
66
65
|
metadata?: FirecrawlDocumentMetadata;
|
|
67
66
|
actions: ActionsSchema;
|
|
@@ -73,7 +72,7 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
|
|
|
73
72
|
* Defines the options and configurations available for scraping web content.
|
|
74
73
|
*/
|
|
75
74
|
interface CrawlScrapeOptions {
|
|
76
|
-
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract"
|
|
75
|
+
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
|
|
77
76
|
headers?: Record<string, string>;
|
|
78
77
|
includeTags?: string[];
|
|
79
78
|
excludeTags?: string[];
|
|
@@ -87,7 +86,6 @@ interface CrawlScrapeOptions {
|
|
|
87
86
|
mobile?: boolean;
|
|
88
87
|
skipTlsVerification?: boolean;
|
|
89
88
|
removeBase64Images?: boolean;
|
|
90
|
-
blockAds?: boolean;
|
|
91
89
|
}
|
|
92
90
|
type Action = {
|
|
93
91
|
type: "wait";
|
|
@@ -121,11 +119,6 @@ interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema exten
|
|
|
121
119
|
schema?: LLMSchema;
|
|
122
120
|
systemPrompt?: string;
|
|
123
121
|
};
|
|
124
|
-
jsonOptions?: {
|
|
125
|
-
prompt?: string;
|
|
126
|
-
schema?: LLMSchema;
|
|
127
|
-
systemPrompt?: string;
|
|
128
|
-
};
|
|
129
122
|
actions?: ActionsSchema;
|
|
130
123
|
}
|
|
131
124
|
interface ActionsResult {
|
|
@@ -157,7 +150,6 @@ interface CrawlParams {
|
|
|
157
150
|
url: string;
|
|
158
151
|
headers?: Record<string, string>;
|
|
159
152
|
metadata?: Record<string, string>;
|
|
160
|
-
events?: ["completed", "failed", "page", "started"][number][];
|
|
161
153
|
};
|
|
162
154
|
deduplicateSimilarURLs?: boolean;
|
|
163
155
|
ignoreQueryParameters?: boolean;
|
|
@@ -221,7 +213,6 @@ interface MapParams {
|
|
|
221
213
|
includeSubdomains?: boolean;
|
|
222
214
|
sitemapOnly?: boolean;
|
|
223
215
|
limit?: number;
|
|
224
|
-
timeout?: number;
|
|
225
216
|
}
|
|
226
217
|
/**
|
|
227
218
|
* Response interface for mapping operations.
|
|
@@ -241,10 +232,7 @@ interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
|
|
|
241
232
|
schema?: LLMSchema | object;
|
|
242
233
|
systemPrompt?: string;
|
|
243
234
|
allowExternalLinks?: boolean;
|
|
244
|
-
enableWebSearch?: boolean;
|
|
245
235
|
includeSubdomains?: boolean;
|
|
246
|
-
origin?: string;
|
|
247
|
-
showSources?: boolean;
|
|
248
236
|
}
|
|
249
237
|
/**
|
|
250
238
|
* Response interface for extracting information from URLs.
|
|
@@ -255,7 +243,6 @@ interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
|
|
|
255
243
|
data: LLMSchema;
|
|
256
244
|
error?: string;
|
|
257
245
|
warning?: string;
|
|
258
|
-
sources?: string[];
|
|
259
246
|
}
|
|
260
247
|
/**
|
|
261
248
|
* Error response interface.
|
|
@@ -271,8 +258,7 @@ interface ErrorResponse {
|
|
|
271
258
|
*/
|
|
272
259
|
declare class FirecrawlError extends Error {
|
|
273
260
|
statusCode: number;
|
|
274
|
-
|
|
275
|
-
constructor(message: string, statusCode: number, details?: any);
|
|
261
|
+
constructor(message: string, statusCode: number);
|
|
276
262
|
}
|
|
277
263
|
/**
|
|
278
264
|
* Parameters for search operations.
|
|
@@ -299,24 +285,6 @@ interface SearchResponse {
|
|
|
299
285
|
warning?: string;
|
|
300
286
|
error?: string;
|
|
301
287
|
}
|
|
302
|
-
/**
|
|
303
|
-
* Response interface for crawl/batch scrape error monitoring.
|
|
304
|
-
*/
|
|
305
|
-
interface CrawlErrorsResponse {
|
|
306
|
-
/**
|
|
307
|
-
* Scrapes that errored out + error details
|
|
308
|
-
*/
|
|
309
|
-
errors: {
|
|
310
|
-
id: string;
|
|
311
|
-
timestamp?: string;
|
|
312
|
-
url: string;
|
|
313
|
-
error: string;
|
|
314
|
-
}[];
|
|
315
|
-
/**
|
|
316
|
-
* URLs blocked by robots.txt
|
|
317
|
-
*/
|
|
318
|
-
robotsBlocked: string[];
|
|
319
|
-
}
|
|
320
288
|
/**
|
|
321
289
|
* Main class for interacting with the Firecrawl API.
|
|
322
290
|
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
@@ -358,18 +326,9 @@ declare class FirecrawlApp {
|
|
|
358
326
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
359
327
|
* @param id - The ID of the crawl operation.
|
|
360
328
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
361
|
-
* @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
362
|
-
* @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
|
|
363
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
364
329
|
* @returns The response containing the job status.
|
|
365
330
|
*/
|
|
366
|
-
checkCrawlStatus(id?: string, getAllData?: boolean
|
|
367
|
-
/**
|
|
368
|
-
* Returns information about crawl errors.
|
|
369
|
-
* @param id - The ID of the crawl operation.
|
|
370
|
-
* @returns Information about crawl errors.
|
|
371
|
-
*/
|
|
372
|
-
checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
|
|
331
|
+
checkCrawlStatus(id?: string, getAllData?: boolean): Promise<CrawlStatusResponse | ErrorResponse>;
|
|
373
332
|
/**
|
|
374
333
|
* Cancels a crawl job using the Firecrawl API.
|
|
375
334
|
* @param id - The ID of the crawl operation.
|
|
@@ -414,18 +373,9 @@ declare class FirecrawlApp {
|
|
|
414
373
|
* Checks the status of a batch scrape job using the Firecrawl API.
|
|
415
374
|
* @param id - The ID of the batch scrape operation.
|
|
416
375
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
417
|
-
* @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
418
|
-
* @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
|
|
419
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
420
376
|
* @returns The response containing the job status.
|
|
421
377
|
*/
|
|
422
|
-
checkBatchScrapeStatus(id?: string, getAllData?: boolean
|
|
423
|
-
/**
|
|
424
|
-
* Returns information about batch scrape errors.
|
|
425
|
-
* @param id - The ID of the batch scrape operation.
|
|
426
|
-
* @returns Information about batch scrape errors.
|
|
427
|
-
*/
|
|
428
|
-
checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
|
|
378
|
+
checkBatchScrapeStatus(id?: string, getAllData?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
|
|
429
379
|
/**
|
|
430
380
|
* Extracts information from URLs using the Firecrawl API.
|
|
431
381
|
* Currently in Beta. Expect breaking changes on future minor versions.
|
|
@@ -434,20 +384,6 @@ declare class FirecrawlApp {
|
|
|
434
384
|
* @returns The response from the extract operation.
|
|
435
385
|
*/
|
|
436
386
|
extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse>;
|
|
437
|
-
/**
|
|
438
|
-
* Initiates an asynchronous extract job for a URL using the Firecrawl API.
|
|
439
|
-
* @param url - The URL to extract data from.
|
|
440
|
-
* @param params - Additional parameters for the extract request.
|
|
441
|
-
* @param idempotencyKey - Optional idempotency key for the request.
|
|
442
|
-
* @returns The response from the extract operation.
|
|
443
|
-
*/
|
|
444
|
-
asyncExtract(urls: string[], params?: ExtractParams, idempotencyKey?: string): Promise<ExtractResponse | ErrorResponse>;
|
|
445
|
-
/**
|
|
446
|
-
* Retrieves the status of an extract job.
|
|
447
|
-
* @param jobId - The ID of the extract job.
|
|
448
|
-
* @returns The status of the extract job.
|
|
449
|
-
*/
|
|
450
|
-
getExtractStatus(jobId: string): Promise<any>;
|
|
451
387
|
/**
|
|
452
388
|
* Prepares the headers for an API request.
|
|
453
389
|
* @param idempotencyKey - Optional key to ensure idempotency.
|
|
@@ -513,4 +449,4 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
|
513
449
|
close(): void;
|
|
514
450
|
}
|
|
515
451
|
|
|
516
|
-
export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type
|
|
452
|
+
export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
|
package/dist/index.d.ts
CHANGED
|
@@ -61,7 +61,6 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
|
|
|
61
61
|
rawHtml?: string;
|
|
62
62
|
links?: string[];
|
|
63
63
|
extract?: T;
|
|
64
|
-
json?: T;
|
|
65
64
|
screenshot?: string;
|
|
66
65
|
metadata?: FirecrawlDocumentMetadata;
|
|
67
66
|
actions: ActionsSchema;
|
|
@@ -73,7 +72,7 @@ interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | neve
|
|
|
73
72
|
* Defines the options and configurations available for scraping web content.
|
|
74
73
|
*/
|
|
75
74
|
interface CrawlScrapeOptions {
|
|
76
|
-
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract"
|
|
75
|
+
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
|
|
77
76
|
headers?: Record<string, string>;
|
|
78
77
|
includeTags?: string[];
|
|
79
78
|
excludeTags?: string[];
|
|
@@ -87,7 +86,6 @@ interface CrawlScrapeOptions {
|
|
|
87
86
|
mobile?: boolean;
|
|
88
87
|
skipTlsVerification?: boolean;
|
|
89
88
|
removeBase64Images?: boolean;
|
|
90
|
-
blockAds?: boolean;
|
|
91
89
|
}
|
|
92
90
|
type Action = {
|
|
93
91
|
type: "wait";
|
|
@@ -121,11 +119,6 @@ interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema exten
|
|
|
121
119
|
schema?: LLMSchema;
|
|
122
120
|
systemPrompt?: string;
|
|
123
121
|
};
|
|
124
|
-
jsonOptions?: {
|
|
125
|
-
prompt?: string;
|
|
126
|
-
schema?: LLMSchema;
|
|
127
|
-
systemPrompt?: string;
|
|
128
|
-
};
|
|
129
122
|
actions?: ActionsSchema;
|
|
130
123
|
}
|
|
131
124
|
interface ActionsResult {
|
|
@@ -157,7 +150,6 @@ interface CrawlParams {
|
|
|
157
150
|
url: string;
|
|
158
151
|
headers?: Record<string, string>;
|
|
159
152
|
metadata?: Record<string, string>;
|
|
160
|
-
events?: ["completed", "failed", "page", "started"][number][];
|
|
161
153
|
};
|
|
162
154
|
deduplicateSimilarURLs?: boolean;
|
|
163
155
|
ignoreQueryParameters?: boolean;
|
|
@@ -221,7 +213,6 @@ interface MapParams {
|
|
|
221
213
|
includeSubdomains?: boolean;
|
|
222
214
|
sitemapOnly?: boolean;
|
|
223
215
|
limit?: number;
|
|
224
|
-
timeout?: number;
|
|
225
216
|
}
|
|
226
217
|
/**
|
|
227
218
|
* Response interface for mapping operations.
|
|
@@ -241,10 +232,7 @@ interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
|
|
|
241
232
|
schema?: LLMSchema | object;
|
|
242
233
|
systemPrompt?: string;
|
|
243
234
|
allowExternalLinks?: boolean;
|
|
244
|
-
enableWebSearch?: boolean;
|
|
245
235
|
includeSubdomains?: boolean;
|
|
246
|
-
origin?: string;
|
|
247
|
-
showSources?: boolean;
|
|
248
236
|
}
|
|
249
237
|
/**
|
|
250
238
|
* Response interface for extracting information from URLs.
|
|
@@ -255,7 +243,6 @@ interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
|
|
|
255
243
|
data: LLMSchema;
|
|
256
244
|
error?: string;
|
|
257
245
|
warning?: string;
|
|
258
|
-
sources?: string[];
|
|
259
246
|
}
|
|
260
247
|
/**
|
|
261
248
|
* Error response interface.
|
|
@@ -271,8 +258,7 @@ interface ErrorResponse {
|
|
|
271
258
|
*/
|
|
272
259
|
declare class FirecrawlError extends Error {
|
|
273
260
|
statusCode: number;
|
|
274
|
-
|
|
275
|
-
constructor(message: string, statusCode: number, details?: any);
|
|
261
|
+
constructor(message: string, statusCode: number);
|
|
276
262
|
}
|
|
277
263
|
/**
|
|
278
264
|
* Parameters for search operations.
|
|
@@ -299,24 +285,6 @@ interface SearchResponse {
|
|
|
299
285
|
warning?: string;
|
|
300
286
|
error?: string;
|
|
301
287
|
}
|
|
302
|
-
/**
|
|
303
|
-
* Response interface for crawl/batch scrape error monitoring.
|
|
304
|
-
*/
|
|
305
|
-
interface CrawlErrorsResponse {
|
|
306
|
-
/**
|
|
307
|
-
* Scrapes that errored out + error details
|
|
308
|
-
*/
|
|
309
|
-
errors: {
|
|
310
|
-
id: string;
|
|
311
|
-
timestamp?: string;
|
|
312
|
-
url: string;
|
|
313
|
-
error: string;
|
|
314
|
-
}[];
|
|
315
|
-
/**
|
|
316
|
-
* URLs blocked by robots.txt
|
|
317
|
-
*/
|
|
318
|
-
robotsBlocked: string[];
|
|
319
|
-
}
|
|
320
288
|
/**
|
|
321
289
|
* Main class for interacting with the Firecrawl API.
|
|
322
290
|
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
@@ -358,18 +326,9 @@ declare class FirecrawlApp {
|
|
|
358
326
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
359
327
|
* @param id - The ID of the crawl operation.
|
|
360
328
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
361
|
-
* @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
362
|
-
* @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
|
|
363
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
364
329
|
* @returns The response containing the job status.
|
|
365
330
|
*/
|
|
366
|
-
checkCrawlStatus(id?: string, getAllData?: boolean
|
|
367
|
-
/**
|
|
368
|
-
* Returns information about crawl errors.
|
|
369
|
-
* @param id - The ID of the crawl operation.
|
|
370
|
-
* @returns Information about crawl errors.
|
|
371
|
-
*/
|
|
372
|
-
checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
|
|
331
|
+
checkCrawlStatus(id?: string, getAllData?: boolean): Promise<CrawlStatusResponse | ErrorResponse>;
|
|
373
332
|
/**
|
|
374
333
|
* Cancels a crawl job using the Firecrawl API.
|
|
375
334
|
* @param id - The ID of the crawl operation.
|
|
@@ -414,18 +373,9 @@ declare class FirecrawlApp {
|
|
|
414
373
|
* Checks the status of a batch scrape job using the Firecrawl API.
|
|
415
374
|
* @param id - The ID of the batch scrape operation.
|
|
416
375
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
417
|
-
* @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
418
|
-
* @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
|
|
419
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
420
376
|
* @returns The response containing the job status.
|
|
421
377
|
*/
|
|
422
|
-
checkBatchScrapeStatus(id?: string, getAllData?: boolean
|
|
423
|
-
/**
|
|
424
|
-
* Returns information about batch scrape errors.
|
|
425
|
-
* @param id - The ID of the batch scrape operation.
|
|
426
|
-
* @returns Information about batch scrape errors.
|
|
427
|
-
*/
|
|
428
|
-
checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse>;
|
|
378
|
+
checkBatchScrapeStatus(id?: string, getAllData?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
|
|
429
379
|
/**
|
|
430
380
|
* Extracts information from URLs using the Firecrawl API.
|
|
431
381
|
* Currently in Beta. Expect breaking changes on future minor versions.
|
|
@@ -434,20 +384,6 @@ declare class FirecrawlApp {
|
|
|
434
384
|
* @returns The response from the extract operation.
|
|
435
385
|
*/
|
|
436
386
|
extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse>;
|
|
437
|
-
/**
|
|
438
|
-
* Initiates an asynchronous extract job for a URL using the Firecrawl API.
|
|
439
|
-
* @param url - The URL to extract data from.
|
|
440
|
-
* @param params - Additional parameters for the extract request.
|
|
441
|
-
* @param idempotencyKey - Optional idempotency key for the request.
|
|
442
|
-
* @returns The response from the extract operation.
|
|
443
|
-
*/
|
|
444
|
-
asyncExtract(urls: string[], params?: ExtractParams, idempotencyKey?: string): Promise<ExtractResponse | ErrorResponse>;
|
|
445
|
-
/**
|
|
446
|
-
* Retrieves the status of an extract job.
|
|
447
|
-
* @param jobId - The ID of the extract job.
|
|
448
|
-
* @returns The status of the extract job.
|
|
449
|
-
*/
|
|
450
|
-
getExtractStatus(jobId: string): Promise<any>;
|
|
451
387
|
/**
|
|
452
388
|
* Prepares the headers for an API request.
|
|
453
389
|
* @param idempotencyKey - Optional key to ensure idempotency.
|
|
@@ -513,4 +449,4 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|
|
513
449
|
close(): void;
|
|
514
450
|
}
|
|
515
451
|
|
|
516
|
-
export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type
|
|
452
|
+
export { type Action, type ActionsResult, type BatchScrapeResponse, type BatchScrapeStatusResponse, type CrawlParams, type CrawlResponse, type CrawlScrapeOptions, type CrawlStatusResponse, CrawlWatcher, type ErrorResponse, type ExtractParams, type ExtractResponse, type FirecrawlAppConfig, type FirecrawlDocument, type FirecrawlDocumentMetadata, FirecrawlError, type MapParams, type MapResponse, type ScrapeParams, type ScrapeResponse, type SearchParams, type SearchResponse, FirecrawlApp as default };
|
package/dist/index.js
CHANGED
|
@@ -6,11 +6,9 @@ import { WebSocket } from "isows";
|
|
|
6
6
|
import { TypedEventTarget } from "typescript-event-target";
|
|
7
7
|
var FirecrawlError = class extends Error {
|
|
8
8
|
statusCode;
|
|
9
|
-
|
|
10
|
-
constructor(message, statusCode, details) {
|
|
9
|
+
constructor(message, statusCode) {
|
|
11
10
|
super(message);
|
|
12
11
|
this.statusCode = statusCode;
|
|
13
|
-
this.details = details;
|
|
14
12
|
}
|
|
15
13
|
};
|
|
16
14
|
var FirecrawlApp = class {
|
|
@@ -57,20 +55,6 @@ var FirecrawlApp = class {
|
|
|
57
55
|
}
|
|
58
56
|
};
|
|
59
57
|
}
|
|
60
|
-
if (jsonData?.jsonOptions?.schema) {
|
|
61
|
-
let schema = jsonData.jsonOptions.schema;
|
|
62
|
-
try {
|
|
63
|
-
schema = zodToJsonSchema(schema);
|
|
64
|
-
} catch (error) {
|
|
65
|
-
}
|
|
66
|
-
jsonData = {
|
|
67
|
-
...jsonData,
|
|
68
|
-
jsonOptions: {
|
|
69
|
-
...jsonData.jsonOptions,
|
|
70
|
-
schema
|
|
71
|
-
}
|
|
72
|
-
};
|
|
73
|
-
}
|
|
74
58
|
try {
|
|
75
59
|
const response = await axios.post(
|
|
76
60
|
this.apiUrl + `/v1/scrape`,
|
|
@@ -225,26 +209,16 @@ var FirecrawlApp = class {
|
|
|
225
209
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
226
210
|
* @param id - The ID of the crawl operation.
|
|
227
211
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
228
|
-
* @param nextURL - The `next` URL from the previous crawl status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
229
|
-
* @param skip - How many entries to skip to paginate. Only required if you're not providing `nextURL`. Only used when `getAllData = false`.
|
|
230
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
231
212
|
* @returns The response containing the job status.
|
|
232
213
|
*/
|
|
233
|
-
async checkCrawlStatus(id, getAllData = false
|
|
214
|
+
async checkCrawlStatus(id, getAllData = false) {
|
|
234
215
|
if (!id) {
|
|
235
216
|
throw new FirecrawlError("No crawl ID provided", 400);
|
|
236
217
|
}
|
|
237
218
|
const headers = this.prepareHeaders();
|
|
238
|
-
const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/crawl/${id}`);
|
|
239
|
-
if (skip !== void 0) {
|
|
240
|
-
targetURL.searchParams.set("skip", skip.toString());
|
|
241
|
-
}
|
|
242
|
-
if (limit !== void 0) {
|
|
243
|
-
targetURL.searchParams.set("limit", limit.toString());
|
|
244
|
-
}
|
|
245
219
|
try {
|
|
246
220
|
const response = await this.getRequest(
|
|
247
|
-
|
|
221
|
+
`${this.apiUrl}/v1/crawl/${id}`,
|
|
248
222
|
headers
|
|
249
223
|
);
|
|
250
224
|
if (response.status === 200) {
|
|
@@ -269,7 +243,6 @@ var FirecrawlApp = class {
|
|
|
269
243
|
total: response.data.total,
|
|
270
244
|
completed: response.data.completed,
|
|
271
245
|
creditsUsed: response.data.creditsUsed,
|
|
272
|
-
next: getAllData ? void 0 : response.data.next,
|
|
273
246
|
expiresAt: new Date(response.data.expiresAt),
|
|
274
247
|
data: allData
|
|
275
248
|
};
|
|
@@ -292,28 +265,6 @@ var FirecrawlApp = class {
|
|
|
292
265
|
}
|
|
293
266
|
return { success: false, error: "Internal server error." };
|
|
294
267
|
}
|
|
295
|
-
/**
|
|
296
|
-
* Returns information about crawl errors.
|
|
297
|
-
* @param id - The ID of the crawl operation.
|
|
298
|
-
* @returns Information about crawl errors.
|
|
299
|
-
*/
|
|
300
|
-
async checkCrawlErrors(id) {
|
|
301
|
-
const headers = this.prepareHeaders();
|
|
302
|
-
try {
|
|
303
|
-
const response = await this.deleteRequest(
|
|
304
|
-
`${this.apiUrl}/v1/crawl/${id}/errors`,
|
|
305
|
-
headers
|
|
306
|
-
);
|
|
307
|
-
if (response.status === 200) {
|
|
308
|
-
return response.data;
|
|
309
|
-
} else {
|
|
310
|
-
this.handleError(response, "check crawl errors");
|
|
311
|
-
}
|
|
312
|
-
} catch (error) {
|
|
313
|
-
throw new FirecrawlError(error.message, 500);
|
|
314
|
-
}
|
|
315
|
-
return { success: false, error: "Internal server error." };
|
|
316
|
-
}
|
|
317
268
|
/**
|
|
318
269
|
* Cancels a crawl job using the Firecrawl API.
|
|
319
270
|
* @param id - The ID of the crawl operation.
|
|
@@ -402,20 +353,6 @@ var FirecrawlApp = class {
|
|
|
402
353
|
}
|
|
403
354
|
};
|
|
404
355
|
}
|
|
405
|
-
if (jsonData?.jsonOptions?.schema) {
|
|
406
|
-
let schema = jsonData.jsonOptions.schema;
|
|
407
|
-
try {
|
|
408
|
-
schema = zodToJsonSchema(schema);
|
|
409
|
-
} catch (error) {
|
|
410
|
-
}
|
|
411
|
-
jsonData = {
|
|
412
|
-
...jsonData,
|
|
413
|
-
jsonOptions: {
|
|
414
|
-
...jsonData.jsonOptions,
|
|
415
|
-
schema
|
|
416
|
-
}
|
|
417
|
-
};
|
|
418
|
-
}
|
|
419
356
|
try {
|
|
420
357
|
const response = await this.postRequest(
|
|
421
358
|
this.apiUrl + `/v1/batch/scrape`,
|
|
@@ -479,26 +416,16 @@ var FirecrawlApp = class {
|
|
|
479
416
|
* Checks the status of a batch scrape job using the Firecrawl API.
|
|
480
417
|
* @param id - The ID of the batch scrape operation.
|
|
481
418
|
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
|
482
|
-
* @param nextURL - The `next` URL from the previous batch scrape status. Only required if you're not manually increasing `skip`. Only used when `getAllData = false`.
|
|
483
|
-
* @param skip - How many entries to skip to paginate. Only used when `getAllData = false`.
|
|
484
|
-
* @param limit - How many entries to return. Only used when `getAllData = false`.
|
|
485
419
|
* @returns The response containing the job status.
|
|
486
420
|
*/
|
|
487
|
-
async checkBatchScrapeStatus(id, getAllData = false
|
|
421
|
+
async checkBatchScrapeStatus(id, getAllData = false) {
|
|
488
422
|
if (!id) {
|
|
489
423
|
throw new FirecrawlError("No batch scrape ID provided", 400);
|
|
490
424
|
}
|
|
491
425
|
const headers = this.prepareHeaders();
|
|
492
|
-
const targetURL = new URL(nextURL ?? `${this.apiUrl}/v1/batch/scrape/${id}`);
|
|
493
|
-
if (skip !== void 0) {
|
|
494
|
-
targetURL.searchParams.set("skip", skip.toString());
|
|
495
|
-
}
|
|
496
|
-
if (limit !== void 0) {
|
|
497
|
-
targetURL.searchParams.set("limit", limit.toString());
|
|
498
|
-
}
|
|
499
426
|
try {
|
|
500
427
|
const response = await this.getRequest(
|
|
501
|
-
|
|
428
|
+
`${this.apiUrl}/v1/batch/scrape/${id}`,
|
|
502
429
|
headers
|
|
503
430
|
);
|
|
504
431
|
if (response.status === 200) {
|
|
@@ -523,7 +450,6 @@ var FirecrawlApp = class {
|
|
|
523
450
|
total: response.data.total,
|
|
524
451
|
completed: response.data.completed,
|
|
525
452
|
creditsUsed: response.data.creditsUsed,
|
|
526
|
-
next: getAllData ? void 0 : response.data.next,
|
|
527
453
|
expiresAt: new Date(response.data.expiresAt),
|
|
528
454
|
data: allData
|
|
529
455
|
};
|
|
@@ -546,28 +472,6 @@ var FirecrawlApp = class {
|
|
|
546
472
|
}
|
|
547
473
|
return { success: false, error: "Internal server error." };
|
|
548
474
|
}
|
|
549
|
-
/**
|
|
550
|
-
* Returns information about batch scrape errors.
|
|
551
|
-
* @param id - The ID of the batch scrape operation.
|
|
552
|
-
* @returns Information about batch scrape errors.
|
|
553
|
-
*/
|
|
554
|
-
async checkBatchScrapeErrors(id) {
|
|
555
|
-
const headers = this.prepareHeaders();
|
|
556
|
-
try {
|
|
557
|
-
const response = await this.deleteRequest(
|
|
558
|
-
`${this.apiUrl}/v1/batch/scrape/${id}/errors`,
|
|
559
|
-
headers
|
|
560
|
-
);
|
|
561
|
-
if (response.status === 200) {
|
|
562
|
-
return response.data;
|
|
563
|
-
} else {
|
|
564
|
-
this.handleError(response, "check batch scrape errors");
|
|
565
|
-
}
|
|
566
|
-
} catch (error) {
|
|
567
|
-
throw new FirecrawlError(error.message, 500);
|
|
568
|
-
}
|
|
569
|
-
return { success: false, error: "Internal server error." };
|
|
570
|
-
}
|
|
571
475
|
/**
|
|
572
476
|
* Extracts information from URLs using the Firecrawl API.
|
|
573
477
|
* Currently in Beta. Expect breaking changes on future minor versions.
|
|
@@ -590,66 +494,6 @@ var FirecrawlApp = class {
|
|
|
590
494
|
} catch (error) {
|
|
591
495
|
throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
|
|
592
496
|
}
|
|
593
|
-
try {
|
|
594
|
-
const response = await this.postRequest(
|
|
595
|
-
this.apiUrl + `/v1/extract`,
|
|
596
|
-
{ ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
|
|
597
|
-
headers
|
|
598
|
-
);
|
|
599
|
-
if (response.status === 200) {
|
|
600
|
-
const jobId = response.data.id;
|
|
601
|
-
let extractStatus;
|
|
602
|
-
do {
|
|
603
|
-
const statusResponse = await this.getRequest(
|
|
604
|
-
`${this.apiUrl}/v1/extract/${jobId}`,
|
|
605
|
-
headers
|
|
606
|
-
);
|
|
607
|
-
extractStatus = statusResponse.data;
|
|
608
|
-
if (extractStatus.status === "completed") {
|
|
609
|
-
if (extractStatus.success) {
|
|
610
|
-
return {
|
|
611
|
-
success: true,
|
|
612
|
-
data: extractStatus.data,
|
|
613
|
-
warning: extractStatus.warning,
|
|
614
|
-
error: extractStatus.error,
|
|
615
|
-
sources: extractStatus?.sources || void 0
|
|
616
|
-
};
|
|
617
|
-
} else {
|
|
618
|
-
throw new FirecrawlError(`Failed to extract data. Error: ${extractStatus.error}`, statusResponse.status);
|
|
619
|
-
}
|
|
620
|
-
} else if (extractStatus.status === "failed" || extractStatus.status === "cancelled") {
|
|
621
|
-
throw new FirecrawlError(`Extract job ${extractStatus.status}. Error: ${extractStatus.error}`, statusResponse.status);
|
|
622
|
-
}
|
|
623
|
-
await new Promise((resolve) => setTimeout(resolve, 1e3));
|
|
624
|
-
} while (extractStatus.status !== "completed");
|
|
625
|
-
} else {
|
|
626
|
-
this.handleError(response, "extract");
|
|
627
|
-
}
|
|
628
|
-
} catch (error) {
|
|
629
|
-
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
|
|
630
|
-
}
|
|
631
|
-
return { success: false, error: "Internal server error." };
|
|
632
|
-
}
|
|
633
|
-
/**
|
|
634
|
-
* Initiates an asynchronous extract job for a URL using the Firecrawl API.
|
|
635
|
-
* @param url - The URL to extract data from.
|
|
636
|
-
* @param params - Additional parameters for the extract request.
|
|
637
|
-
* @param idempotencyKey - Optional idempotency key for the request.
|
|
638
|
-
* @returns The response from the extract operation.
|
|
639
|
-
*/
|
|
640
|
-
async asyncExtract(urls, params, idempotencyKey) {
|
|
641
|
-
const headers = this.prepareHeaders(idempotencyKey);
|
|
642
|
-
let jsonData = { urls, ...params };
|
|
643
|
-
let jsonSchema;
|
|
644
|
-
try {
|
|
645
|
-
if (params?.schema instanceof zt.ZodType) {
|
|
646
|
-
jsonSchema = zodToJsonSchema(params.schema);
|
|
647
|
-
} else {
|
|
648
|
-
jsonSchema = params?.schema;
|
|
649
|
-
}
|
|
650
|
-
} catch (error) {
|
|
651
|
-
throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
|
|
652
|
-
}
|
|
653
497
|
try {
|
|
654
498
|
const response = await this.postRequest(
|
|
655
499
|
this.apiUrl + `/v1/extract`,
|
|
@@ -657,34 +501,24 @@ var FirecrawlApp = class {
|
|
|
657
501
|
headers
|
|
658
502
|
);
|
|
659
503
|
if (response.status === 200) {
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
* @param jobId - The ID of the extract job.
|
|
672
|
-
* @returns The status of the extract job.
|
|
673
|
-
*/
|
|
674
|
-
async getExtractStatus(jobId) {
|
|
675
|
-
try {
|
|
676
|
-
const response = await this.getRequest(
|
|
677
|
-
`${this.apiUrl}/v1/extract/${jobId}`,
|
|
678
|
-
this.prepareHeaders()
|
|
679
|
-
);
|
|
680
|
-
if (response.status === 200) {
|
|
681
|
-
return response.data;
|
|
504
|
+
const responseData = response.data;
|
|
505
|
+
if (responseData.success) {
|
|
506
|
+
return {
|
|
507
|
+
success: true,
|
|
508
|
+
data: responseData.data,
|
|
509
|
+
warning: responseData.warning,
|
|
510
|
+
error: responseData.error
|
|
511
|
+
};
|
|
512
|
+
} else {
|
|
513
|
+
throw new FirecrawlError(`Failed to scrape URL. Error: ${responseData.error}`, response.status);
|
|
514
|
+
}
|
|
682
515
|
} else {
|
|
683
|
-
this.handleError(response, "
|
|
516
|
+
this.handleError(response, "extract");
|
|
684
517
|
}
|
|
685
518
|
} catch (error) {
|
|
686
519
|
throw new FirecrawlError(error.message, 500);
|
|
687
520
|
}
|
|
521
|
+
return { success: false, error: "Internal server error." };
|
|
688
522
|
}
|
|
689
523
|
/**
|
|
690
524
|
* Prepares the headers for an API request.
|
|
@@ -800,13 +634,11 @@ var FirecrawlApp = class {
|
|
|
800
634
|
* @param {string} action - The action being performed when the error occurred.
|
|
801
635
|
*/
|
|
802
636
|
handleError(response, action) {
|
|
803
|
-
if ([
|
|
637
|
+
if ([402, 408, 409, 500].includes(response.status)) {
|
|
804
638
|
const errorMessage = response.data.error || "Unknown error occurred";
|
|
805
|
-
const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : "";
|
|
806
639
|
throw new FirecrawlError(
|
|
807
|
-
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}
|
|
808
|
-
response.status
|
|
809
|
-
response?.data?.details
|
|
640
|
+
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`,
|
|
641
|
+
response.status
|
|
810
642
|
);
|
|
811
643
|
} else {
|
|
812
644
|
throw new FirecrawlError(
|
|
@@ -824,8 +656,7 @@ var CrawlWatcher = class extends TypedEventTarget {
|
|
|
824
656
|
constructor(id, app) {
|
|
825
657
|
super();
|
|
826
658
|
this.id = id;
|
|
827
|
-
|
|
828
|
-
this.ws = new WebSocket(`${wsUrl}/v1/crawl/${id}`, app.apiKey);
|
|
659
|
+
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
|
829
660
|
this.status = "scraping";
|
|
830
661
|
this.data = [];
|
|
831
662
|
const messageHandler = (msg) => {
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -94,6 +94,7 @@ export interface CrawlScrapeOptions {
|
|
|
94
94
|
skipTlsVerification?: boolean;
|
|
95
95
|
removeBase64Images?: boolean;
|
|
96
96
|
blockAds?: boolean;
|
|
97
|
+
proxy?: "basic" | "stealth";
|
|
97
98
|
}
|
|
98
99
|
|
|
99
100
|
export type Action = {
|
|
@@ -263,6 +264,7 @@ export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
|
|
|
263
264
|
includeSubdomains?: boolean;
|
|
264
265
|
origin?: string;
|
|
265
266
|
showSources?: boolean;
|
|
267
|
+
scrapeOptions?: CrawlScrapeOptions;
|
|
266
268
|
}
|
|
267
269
|
|
|
268
270
|
/**
|
|
@@ -347,6 +349,70 @@ export interface CrawlErrorsResponse {
|
|
|
347
349
|
robotsBlocked: string[];
|
|
348
350
|
};
|
|
349
351
|
|
|
352
|
+
/**
|
|
353
|
+
* Parameters for deep research operations.
|
|
354
|
+
* Defines options for conducting deep research on a topic.
|
|
355
|
+
*/
|
|
356
|
+
export interface DeepResearchParams {
|
|
357
|
+
/**
|
|
358
|
+
* Maximum depth of research iterations (1-10)
|
|
359
|
+
* @default 7
|
|
360
|
+
*/
|
|
361
|
+
maxDepth?: number;
|
|
362
|
+
/**
|
|
363
|
+
* Time limit in seconds (30-300)
|
|
364
|
+
* @default 270
|
|
365
|
+
*/
|
|
366
|
+
timeLimit?: number;
|
|
367
|
+
/**
|
|
368
|
+
* Experimental flag for streaming steps
|
|
369
|
+
*/
|
|
370
|
+
__experimental_streamSteps?: boolean;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
/**
|
|
374
|
+
* Response interface for deep research operations.
|
|
375
|
+
*/
|
|
376
|
+
export interface DeepResearchResponse {
|
|
377
|
+
success: boolean;
|
|
378
|
+
id: string;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
/**
|
|
382
|
+
* Status response interface for deep research operations.
|
|
383
|
+
*/
|
|
384
|
+
export interface DeepResearchStatusResponse {
|
|
385
|
+
success: boolean;
|
|
386
|
+
data: {
|
|
387
|
+
findings: Array<{
|
|
388
|
+
text: string;
|
|
389
|
+
source: string;
|
|
390
|
+
}>;
|
|
391
|
+
finalAnalysis: string;
|
|
392
|
+
analysis: string;
|
|
393
|
+
completedSteps: number;
|
|
394
|
+
totalSteps: number;
|
|
395
|
+
};
|
|
396
|
+
status: "processing" | "completed" | "failed";
|
|
397
|
+
error?: string;
|
|
398
|
+
expiresAt: string;
|
|
399
|
+
currentDepth: number;
|
|
400
|
+
maxDepth: number;
|
|
401
|
+
activities: Array<{
|
|
402
|
+
type: string;
|
|
403
|
+
status: string;
|
|
404
|
+
message: string;
|
|
405
|
+
timestamp: string;
|
|
406
|
+
depth: number;
|
|
407
|
+
}>;
|
|
408
|
+
sources: Array<{
|
|
409
|
+
url: string;
|
|
410
|
+
title: string;
|
|
411
|
+
description: string;
|
|
412
|
+
}>;
|
|
413
|
+
summaries: string[];
|
|
414
|
+
}
|
|
415
|
+
|
|
350
416
|
/**
|
|
351
417
|
* Main class for interacting with the Firecrawl API.
|
|
352
418
|
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
@@ -1280,6 +1346,119 @@ export default class FirecrawlApp {
|
|
|
1280
1346
|
);
|
|
1281
1347
|
}
|
|
1282
1348
|
}
|
|
1349
|
+
|
|
1350
|
+
/**
|
|
1351
|
+
* Initiates a deep research operation on a given topic and polls until completion.
|
|
1352
|
+
* @param params - Parameters for the deep research operation.
|
|
1353
|
+
* @returns The final research results.
|
|
1354
|
+
*/
|
|
1355
|
+
async __deepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchStatusResponse | ErrorResponse> {
|
|
1356
|
+
try {
|
|
1357
|
+
const response = await this.__asyncDeepResearch(topic, params);
|
|
1358
|
+
|
|
1359
|
+
if (!response.success || 'error' in response) {
|
|
1360
|
+
return { success: false, error: 'error' in response ? response.error : 'Unknown error' };
|
|
1361
|
+
}
|
|
1362
|
+
|
|
1363
|
+
if (!response.id) {
|
|
1364
|
+
throw new FirecrawlError(`Failed to start research. No job ID returned.`, 500);
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
const jobId = response.id;
|
|
1368
|
+
let researchStatus;
|
|
1369
|
+
|
|
1370
|
+
while (true) {
|
|
1371
|
+
// console.log("Checking research status...");
|
|
1372
|
+
researchStatus = await this.__checkDeepResearchStatus(jobId);
|
|
1373
|
+
// console.log("Research status:", researchStatus);
|
|
1374
|
+
|
|
1375
|
+
if ('error' in researchStatus && !researchStatus.success) {
|
|
1376
|
+
return researchStatus;
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
if (researchStatus.status === "completed") {
|
|
1380
|
+
return researchStatus;
|
|
1381
|
+
}
|
|
1382
|
+
|
|
1383
|
+
if (researchStatus.status === "failed") {
|
|
1384
|
+
throw new FirecrawlError(
|
|
1385
|
+
`Research job ${researchStatus.status}. Error: ${researchStatus.error}`,
|
|
1386
|
+
500
|
|
1387
|
+
);
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
if (researchStatus.status !== "processing") {
|
|
1391
|
+
break;
|
|
1392
|
+
}
|
|
1393
|
+
|
|
1394
|
+
await new Promise(resolve => setTimeout(resolve, 2000));
|
|
1395
|
+
}
|
|
1396
|
+
// console.log("Research status finished:", researchStatus);
|
|
1397
|
+
|
|
1398
|
+
return { success: false, error: "Research job terminated unexpectedly" };
|
|
1399
|
+
} catch (error: any) {
|
|
1400
|
+
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
|
|
1404
|
+
/**
|
|
1405
|
+
* Initiates a deep research operation on a given topic without polling.
|
|
1406
|
+
* @param params - Parameters for the deep research operation.
|
|
1407
|
+
* @returns The response containing the research job ID.
|
|
1408
|
+
*/
|
|
1409
|
+
async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
|
|
1410
|
+
const headers = this.prepareHeaders();
|
|
1411
|
+
try {
|
|
1412
|
+
const response: AxiosResponse = await this.postRequest(
|
|
1413
|
+
`${this.apiUrl}/v1/deep-research`,
|
|
1414
|
+
{ topic, ...params },
|
|
1415
|
+
headers
|
|
1416
|
+
);
|
|
1417
|
+
|
|
1418
|
+
if (response.status === 200) {
|
|
1419
|
+
return response.data;
|
|
1420
|
+
} else {
|
|
1421
|
+
this.handleError(response, "start deep research");
|
|
1422
|
+
}
|
|
1423
|
+
} catch (error: any) {
|
|
1424
|
+
if (error.response?.data?.error) {
|
|
1425
|
+
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
|
|
1426
|
+
} else {
|
|
1427
|
+
throw new FirecrawlError(error.message, 500);
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
return { success: false, error: "Internal server error." };
|
|
1431
|
+
}
|
|
1432
|
+
|
|
1433
|
+
/**
|
|
1434
|
+
* Checks the status of a deep research operation.
|
|
1435
|
+
* @param id - The ID of the deep research operation.
|
|
1436
|
+
* @returns The current status and results of the research operation.
|
|
1437
|
+
*/
|
|
1438
|
+
async __checkDeepResearchStatus(id: string): Promise<DeepResearchStatusResponse | ErrorResponse> {
|
|
1439
|
+
const headers = this.prepareHeaders();
|
|
1440
|
+
try {
|
|
1441
|
+
const response: AxiosResponse = await this.getRequest(
|
|
1442
|
+
`${this.apiUrl}/v1/deep-research/${id}`,
|
|
1443
|
+
headers
|
|
1444
|
+
);
|
|
1445
|
+
|
|
1446
|
+
if (response.status === 200) {
|
|
1447
|
+
return response.data;
|
|
1448
|
+
} else if (response.status === 404) {
|
|
1449
|
+
throw new FirecrawlError("Deep research job not found", 404);
|
|
1450
|
+
} else {
|
|
1451
|
+
this.handleError(response, "check deep research status");
|
|
1452
|
+
}
|
|
1453
|
+
} catch (error: any) {
|
|
1454
|
+
if (error.response?.data?.error) {
|
|
1455
|
+
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
|
|
1456
|
+
} else {
|
|
1457
|
+
throw new FirecrawlError(error.message, 500);
|
|
1458
|
+
}
|
|
1459
|
+
}
|
|
1460
|
+
return { success: false, error: "Internal server error." };
|
|
1461
|
+
}
|
|
1283
1462
|
}
|
|
1284
1463
|
|
|
1285
1464
|
interface CrawlWatcherEvents {
|
package/dump.rdb
DELETED
|
Binary file
|