@mendable/firecrawl-js 0.0.26 → 0.0.29-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/build/index.js +129 -136
- package/package.json +2 -1
- package/src/__tests__/e2e_withAuth/index.test.ts +16 -15
- package/src/__tests__/index.test.ts +1 -1
- package/src/index.ts +85 -24
- package/tsconfig.json +9 -3
- package/types/index.d.ts +70 -13
package/README.md
CHANGED
|
@@ -176,6 +176,11 @@ async function checkStatusExample(jobId) {
|
|
|
176
176
|
checkStatusExample('your_job_id_here');
|
|
177
177
|
```
|
|
178
178
|
|
|
179
|
+
## Running Locally
|
|
180
|
+
To use the SDK when running Firecrawl locally, you can change the initial Firecrawl app instance to:
|
|
181
|
+
```js
|
|
182
|
+
const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY", apiUrl: "http://localhost:3002" });
|
|
183
|
+
```
|
|
179
184
|
|
|
180
185
|
## Error Handling
|
|
181
186
|
|
package/build/index.js
CHANGED
|
@@ -1,12 +1,3 @@
|
|
|
1
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
-
});
|
|
9
|
-
};
|
|
10
1
|
import axios from "axios";
|
|
11
2
|
import { z } from "zod";
|
|
12
3
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
@@ -18,9 +9,9 @@ export default class FirecrawlApp {
|
|
|
18
9
|
* Initializes a new instance of the FirecrawlApp class.
|
|
19
10
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
|
20
11
|
*/
|
|
21
|
-
constructor({ apiKey = null }) {
|
|
22
|
-
this.apiUrl = "https://api.firecrawl.dev";
|
|
12
|
+
constructor({ apiKey = null, apiUrl = null }) {
|
|
23
13
|
this.apiKey = apiKey || "";
|
|
14
|
+
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
24
15
|
if (!this.apiKey) {
|
|
25
16
|
throw new Error("No API key provided");
|
|
26
17
|
}
|
|
@@ -31,42 +22,46 @@ export default class FirecrawlApp {
|
|
|
31
22
|
* @param {Params | null} params - Additional parameters for the scrape request.
|
|
32
23
|
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
|
33
24
|
*/
|
|
34
|
-
scrapeUrl(
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
let
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
if (schema instanceof z.ZodSchema) {
|
|
46
|
-
schema = zodToJsonSchema(schema);
|
|
47
|
-
}
|
|
48
|
-
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
|
|
25
|
+
async scrapeUrl(url, params = null) {
|
|
26
|
+
const headers = {
|
|
27
|
+
"Content-Type": "application/json",
|
|
28
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
29
|
+
};
|
|
30
|
+
let jsonData = { url, ...params };
|
|
31
|
+
if (params?.extractorOptions?.extractionSchema) {
|
|
32
|
+
let schema = params.extractorOptions.extractionSchema;
|
|
33
|
+
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
|
34
|
+
if (schema instanceof z.ZodSchema) {
|
|
35
|
+
schema = zodToJsonSchema(schema);
|
|
49
36
|
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
37
|
+
jsonData = {
|
|
38
|
+
...jsonData,
|
|
39
|
+
extractorOptions: {
|
|
40
|
+
...params.extractorOptions,
|
|
41
|
+
extractionSchema: schema,
|
|
42
|
+
mode: params.extractorOptions.mode || "llm-extraction",
|
|
43
|
+
},
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
try {
|
|
47
|
+
const response = await axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers });
|
|
48
|
+
if (response.status === 200) {
|
|
49
|
+
const responseData = response.data;
|
|
50
|
+
if (responseData.success) {
|
|
51
|
+
return responseData;
|
|
60
52
|
}
|
|
61
53
|
else {
|
|
62
|
-
|
|
54
|
+
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
|
63
55
|
}
|
|
64
56
|
}
|
|
65
|
-
|
|
66
|
-
|
|
57
|
+
else {
|
|
58
|
+
this.handleError(response, "scrape URL");
|
|
67
59
|
}
|
|
68
|
-
|
|
69
|
-
|
|
60
|
+
}
|
|
61
|
+
catch (error) {
|
|
62
|
+
throw new Error(error.message);
|
|
63
|
+
}
|
|
64
|
+
return { success: false, error: "Internal server error." };
|
|
70
65
|
}
|
|
71
66
|
/**
|
|
72
67
|
* Searches for a query using the Firecrawl API.
|
|
@@ -74,36 +69,34 @@ export default class FirecrawlApp {
|
|
|
74
69
|
* @param {Params | null} params - Additional parameters for the search request.
|
|
75
70
|
* @returns {Promise<SearchResponse>} The response from the search operation.
|
|
76
71
|
*/
|
|
77
|
-
search(
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
return responseData;
|
|
93
|
-
}
|
|
94
|
-
else {
|
|
95
|
-
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
|
96
|
-
}
|
|
72
|
+
async search(query, params = null) {
|
|
73
|
+
const headers = {
|
|
74
|
+
"Content-Type": "application/json",
|
|
75
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
76
|
+
};
|
|
77
|
+
let jsonData = { query };
|
|
78
|
+
if (params) {
|
|
79
|
+
jsonData = { ...jsonData, ...params };
|
|
80
|
+
}
|
|
81
|
+
try {
|
|
82
|
+
const response = await axios.post(this.apiUrl + "/v0/search", jsonData, { headers });
|
|
83
|
+
if (response.status === 200) {
|
|
84
|
+
const responseData = response.data;
|
|
85
|
+
if (responseData.success) {
|
|
86
|
+
return responseData;
|
|
97
87
|
}
|
|
98
88
|
else {
|
|
99
|
-
|
|
89
|
+
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
|
100
90
|
}
|
|
101
91
|
}
|
|
102
|
-
|
|
103
|
-
|
|
92
|
+
else {
|
|
93
|
+
this.handleError(response, "search");
|
|
104
94
|
}
|
|
105
|
-
|
|
106
|
-
|
|
95
|
+
}
|
|
96
|
+
catch (error) {
|
|
97
|
+
throw new Error(error.message);
|
|
98
|
+
}
|
|
99
|
+
return { success: false, error: "Internal server error." };
|
|
107
100
|
}
|
|
108
101
|
/**
|
|
109
102
|
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
@@ -114,73 +107,75 @@ export default class FirecrawlApp {
|
|
|
114
107
|
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
|
115
108
|
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
|
116
109
|
*/
|
|
117
|
-
crawlUrl(
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
return this.monitorJobStatus(jobId, headers, pollInterval);
|
|
130
|
-
}
|
|
131
|
-
else {
|
|
132
|
-
return { success: true, jobId };
|
|
133
|
-
}
|
|
110
|
+
async crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
|
|
111
|
+
const headers = this.prepareHeaders(idempotencyKey);
|
|
112
|
+
let jsonData = { url };
|
|
113
|
+
if (params) {
|
|
114
|
+
jsonData = { ...jsonData, ...params };
|
|
115
|
+
}
|
|
116
|
+
try {
|
|
117
|
+
const response = await this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers);
|
|
118
|
+
if (response.status === 200) {
|
|
119
|
+
const jobId = response.data.jobId;
|
|
120
|
+
if (waitUntilDone) {
|
|
121
|
+
return this.monitorJobStatus(jobId, headers, pollInterval);
|
|
134
122
|
}
|
|
135
123
|
else {
|
|
136
|
-
|
|
124
|
+
return { success: true, jobId };
|
|
137
125
|
}
|
|
138
126
|
}
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
throw new Error(error.message);
|
|
127
|
+
else {
|
|
128
|
+
this.handleError(response, "start crawl job");
|
|
142
129
|
}
|
|
143
|
-
|
|
144
|
-
|
|
130
|
+
}
|
|
131
|
+
catch (error) {
|
|
132
|
+
console.log(error);
|
|
133
|
+
throw new Error(error.message);
|
|
134
|
+
}
|
|
135
|
+
return { success: false, error: "Internal server error." };
|
|
145
136
|
}
|
|
146
137
|
/**
|
|
147
138
|
* Checks the status of a crawl job using the Firecrawl API.
|
|
148
139
|
* @param {string} jobId - The job ID of the crawl operation.
|
|
149
140
|
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
|
150
141
|
*/
|
|
151
|
-
checkCrawlStatus(jobId) {
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
}
|
|
164
|
-
else {
|
|
165
|
-
this.handleError(response, "check crawl status");
|
|
166
|
-
}
|
|
142
|
+
async checkCrawlStatus(jobId) {
|
|
143
|
+
const headers = this.prepareHeaders();
|
|
144
|
+
try {
|
|
145
|
+
const response = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
|
|
146
|
+
if (response.status === 200) {
|
|
147
|
+
return {
|
|
148
|
+
success: true,
|
|
149
|
+
status: response.data.status,
|
|
150
|
+
data: response.data.data,
|
|
151
|
+
partial_data: !response.data.data
|
|
152
|
+
? response.data.partial_data
|
|
153
|
+
: undefined,
|
|
154
|
+
};
|
|
167
155
|
}
|
|
168
|
-
|
|
169
|
-
|
|
156
|
+
else {
|
|
157
|
+
this.handleError(response, "check crawl status");
|
|
170
158
|
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
159
|
+
}
|
|
160
|
+
catch (error) {
|
|
161
|
+
throw new Error(error.message);
|
|
162
|
+
}
|
|
163
|
+
return {
|
|
164
|
+
success: false,
|
|
165
|
+
status: "unknown",
|
|
166
|
+
error: "Internal server error.",
|
|
167
|
+
};
|
|
177
168
|
}
|
|
178
169
|
/**
|
|
179
170
|
* Prepares the headers for an API request.
|
|
180
171
|
* @returns {AxiosRequestHeaders} The prepared headers.
|
|
181
172
|
*/
|
|
182
173
|
prepareHeaders(idempotencyKey) {
|
|
183
|
-
return
|
|
174
|
+
return {
|
|
175
|
+
"Content-Type": "application/json",
|
|
176
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
177
|
+
...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
|
|
178
|
+
};
|
|
184
179
|
}
|
|
185
180
|
/**
|
|
186
181
|
* Sends a POST request to the specified URL.
|
|
@@ -208,35 +203,33 @@ export default class FirecrawlApp {
|
|
|
208
203
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
|
209
204
|
* @returns {Promise<any>} The final job status or data.
|
|
210
205
|
*/
|
|
211
|
-
monitorJobStatus(jobId, headers, checkInterval) {
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
if (
|
|
218
|
-
|
|
219
|
-
return statusData.data;
|
|
220
|
-
}
|
|
221
|
-
else {
|
|
222
|
-
throw new Error("Crawl job completed but no data was returned");
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
|
|
226
|
-
if (checkInterval < 2) {
|
|
227
|
-
checkInterval = 2;
|
|
228
|
-
}
|
|
229
|
-
yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again
|
|
206
|
+
async monitorJobStatus(jobId, headers, checkInterval) {
|
|
207
|
+
while (true) {
|
|
208
|
+
const statusResponse = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
|
|
209
|
+
if (statusResponse.status === 200) {
|
|
210
|
+
const statusData = statusResponse.data;
|
|
211
|
+
if (statusData.status === "completed") {
|
|
212
|
+
if ("data" in statusData) {
|
|
213
|
+
return statusData.data;
|
|
230
214
|
}
|
|
231
215
|
else {
|
|
232
|
-
throw new Error(
|
|
216
|
+
throw new Error("Crawl job completed but no data was returned");
|
|
233
217
|
}
|
|
234
218
|
}
|
|
219
|
+
else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
|
|
220
|
+
if (checkInterval < 2) {
|
|
221
|
+
checkInterval = 2;
|
|
222
|
+
}
|
|
223
|
+
await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again
|
|
224
|
+
}
|
|
235
225
|
else {
|
|
236
|
-
|
|
226
|
+
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
|
|
237
227
|
}
|
|
238
228
|
}
|
|
239
|
-
|
|
229
|
+
else {
|
|
230
|
+
this.handleError(statusResponse, "check crawl status");
|
|
231
|
+
}
|
|
232
|
+
}
|
|
240
233
|
}
|
|
241
234
|
/**
|
|
242
235
|
* Handles errors from API responses.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mendable/firecrawl-js",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.29-beta.1",
|
|
4
4
|
"description": "JavaScript SDK for Firecrawl API",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"types": "types/index.d.ts",
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
"@types/axios": "^0.14.0",
|
|
34
34
|
"@types/dotenv": "^8.2.0",
|
|
35
35
|
"@types/jest": "^29.5.12",
|
|
36
|
+
"@types/mocha": "^10.0.6",
|
|
36
37
|
"@types/node": "^20.12.12",
|
|
37
38
|
"@types/uuid": "^9.0.8",
|
|
38
39
|
"jest": "^29.7.0",
|
|
@@ -2,6 +2,7 @@ import FirecrawlApp from '../../index';
|
|
|
2
2
|
import { v4 as uuidv4 } from 'uuid';
|
|
3
3
|
import dotenv from 'dotenv';
|
|
4
4
|
|
|
5
|
+
|
|
5
6
|
dotenv.config();
|
|
6
7
|
|
|
7
8
|
const TEST_API_KEY = process.env.TEST_API_KEY;
|
|
@@ -29,14 +30,14 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
29
30
|
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
|
30
31
|
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
|
31
32
|
expect(response).not.toBeNull();
|
|
32
|
-
expect(response.data
|
|
33
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
33
34
|
}, 30000); // 30 seconds timeout
|
|
34
35
|
|
|
35
36
|
test.concurrent('should return successful response for valid scrape', async () => {
|
|
36
37
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
37
38
|
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
|
38
39
|
expect(response).not.toBeNull();
|
|
39
|
-
expect(response.data
|
|
40
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
40
41
|
expect(response.data).toHaveProperty('markdown');
|
|
41
42
|
expect(response.data).toHaveProperty('metadata');
|
|
42
43
|
expect(response.data).not.toHaveProperty('html');
|
|
@@ -46,23 +47,23 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
46
47
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
47
48
|
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
|
|
48
49
|
expect(response).not.toBeNull();
|
|
49
|
-
expect(response.data
|
|
50
|
-
expect(response.data
|
|
51
|
-
expect(response.data
|
|
50
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
51
|
+
expect(response.data?.markdown).toContain("_Roast_");
|
|
52
|
+
expect(response.data?.html).toContain("<h1");
|
|
52
53
|
}, 30000); // 30 seconds timeout
|
|
53
54
|
|
|
54
55
|
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
|
55
56
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
56
57
|
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
|
|
57
58
|
expect(response).not.toBeNull();
|
|
58
|
-
expect(response.data
|
|
59
|
+
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
|
59
60
|
}, 30000); // 30 seconds timeout
|
|
60
61
|
|
|
61
62
|
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
|
62
63
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
63
64
|
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
|
|
64
65
|
expect(response).not.toBeNull();
|
|
65
|
-
expect(response.data
|
|
66
|
+
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
|
66
67
|
}, 30000); // 30 seconds timeout
|
|
67
68
|
|
|
68
69
|
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
|
@@ -112,15 +113,15 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
112
113
|
|
|
113
114
|
expect(statusResponse).not.toBeNull();
|
|
114
115
|
expect(statusResponse.status).toBe('completed');
|
|
115
|
-
expect(statusResponse
|
|
116
|
+
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
|
116
117
|
}, 35000); // 35 seconds timeout
|
|
117
118
|
|
|
118
119
|
test.concurrent('should return successful response for search', async () => {
|
|
119
120
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
120
121
|
const response = await app.search("test query");
|
|
121
122
|
expect(response).not.toBeNull();
|
|
122
|
-
expect(response
|
|
123
|
-
expect(response
|
|
123
|
+
expect(response?.data?.[0]?.content).toBeDefined();
|
|
124
|
+
expect(response?.data?.length).toBeGreaterThan(2);
|
|
124
125
|
}, 30000); // 30 seconds timeout
|
|
125
126
|
|
|
126
127
|
test.concurrent('should throw error for invalid API key on search', async () => {
|
|
@@ -146,10 +147,10 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
146
147
|
}
|
|
147
148
|
});
|
|
148
149
|
expect(response).not.toBeNull();
|
|
149
|
-
expect(response.data
|
|
150
|
-
const llmExtraction = response.data
|
|
151
|
-
expect(llmExtraction
|
|
152
|
-
expect(typeof llmExtraction
|
|
153
|
-
expect(typeof llmExtraction
|
|
150
|
+
expect(response.data?.llm_extraction).toBeDefined();
|
|
151
|
+
const llmExtraction = response.data?.llm_extraction;
|
|
152
|
+
expect(llmExtraction?.company_mission).toBeDefined();
|
|
153
|
+
expect(typeof llmExtraction?.supports_sso).toBe('boolean');
|
|
154
|
+
expect(typeof llmExtraction?.is_open_source).toBe('boolean');
|
|
154
155
|
}, 30000); // 30 seconds timeout
|
|
155
156
|
});
|
|
@@ -43,6 +43,6 @@ describe('the firecrawl JS SDK', () => {
|
|
|
43
43
|
expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }),
|
|
44
44
|
)
|
|
45
45
|
expect(scrapedData.success).toBe(true);
|
|
46
|
-
expect(scrapedData
|
|
46
|
+
expect(scrapedData?.data?.metadata.title).toEqual('Mendable');
|
|
47
47
|
});
|
|
48
48
|
})
|
package/src/index.ts
CHANGED
|
@@ -10,15 +10,63 @@ export interface FirecrawlAppConfig {
|
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
-
*
|
|
13
|
+
* Metadata for a Firecrawl document.
|
|
14
14
|
*/
|
|
15
|
-
export interface
|
|
15
|
+
export interface FirecrawlDocumentMetadata {
|
|
16
|
+
title?: string;
|
|
17
|
+
description?: string;
|
|
18
|
+
language?: string;
|
|
19
|
+
keywords?: string;
|
|
20
|
+
robots?: string;
|
|
21
|
+
ogTitle?: string;
|
|
22
|
+
ogDescription?: string;
|
|
23
|
+
ogUrl?: string;
|
|
24
|
+
ogImage?: string;
|
|
25
|
+
ogAudio?: string;
|
|
26
|
+
ogDeterminer?: string;
|
|
27
|
+
ogLocale?: string;
|
|
28
|
+
ogLocaleAlternate?: string[];
|
|
29
|
+
ogSiteName?: string;
|
|
30
|
+
ogVideo?: string;
|
|
31
|
+
dctermsCreated?: string;
|
|
32
|
+
dcDateCreated?: string;
|
|
33
|
+
dcDate?: string;
|
|
34
|
+
dctermsType?: string;
|
|
35
|
+
dcType?: string;
|
|
36
|
+
dctermsAudience?: string;
|
|
37
|
+
dctermsSubject?: string;
|
|
38
|
+
dcSubject?: string;
|
|
39
|
+
dcDescription?: string;
|
|
40
|
+
dctermsKeywords?: string;
|
|
41
|
+
modifiedTime?: string;
|
|
42
|
+
publishedTime?: string;
|
|
43
|
+
articleTag?: string;
|
|
44
|
+
articleSection?: string;
|
|
45
|
+
sourceURL?: string;
|
|
46
|
+
pageStatusCode?: number;
|
|
47
|
+
pageError?: string;
|
|
16
48
|
[key: string]: any;
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Document interface for Firecrawl.
|
|
53
|
+
*/
|
|
54
|
+
export interface FirecrawlDocument {
|
|
55
|
+
id?: string;
|
|
56
|
+
url?: string;
|
|
57
|
+
content: string;
|
|
58
|
+
markdown?: string;
|
|
59
|
+
html?: string;
|
|
60
|
+
llm_extraction?: Record<string, any>;
|
|
61
|
+
createdAt?: Date;
|
|
62
|
+
updatedAt?: Date;
|
|
63
|
+
type?: string;
|
|
64
|
+
metadata: FirecrawlDocumentMetadata;
|
|
65
|
+
childrenLinks?: string[];
|
|
66
|
+
provider?: string;
|
|
67
|
+
warning?: string;
|
|
68
|
+
|
|
69
|
+
index?: number;
|
|
22
70
|
}
|
|
23
71
|
|
|
24
72
|
/**
|
|
@@ -26,16 +74,15 @@ export interface Params {
|
|
|
26
74
|
*/
|
|
27
75
|
export interface ScrapeResponse {
|
|
28
76
|
success: boolean;
|
|
29
|
-
data?:
|
|
77
|
+
data?: FirecrawlDocument;
|
|
30
78
|
error?: string;
|
|
31
79
|
}
|
|
32
|
-
|
|
33
80
|
/**
|
|
34
81
|
* Response interface for searching operations.
|
|
35
82
|
*/
|
|
36
83
|
export interface SearchResponse {
|
|
37
84
|
success: boolean;
|
|
38
|
-
data?:
|
|
85
|
+
data?: FirecrawlDocument[];
|
|
39
86
|
error?: string;
|
|
40
87
|
}
|
|
41
88
|
/**
|
|
@@ -44,10 +91,9 @@ export interface SearchResponse {
|
|
|
44
91
|
export interface CrawlResponse {
|
|
45
92
|
success: boolean;
|
|
46
93
|
jobId?: string;
|
|
47
|
-
data?:
|
|
94
|
+
data?: FirecrawlDocument[];
|
|
48
95
|
error?: string;
|
|
49
96
|
}
|
|
50
|
-
|
|
51
97
|
/**
|
|
52
98
|
* Response interface for job status checks.
|
|
53
99
|
*/
|
|
@@ -55,24 +101,35 @@ export interface JobStatusResponse {
|
|
|
55
101
|
success: boolean;
|
|
56
102
|
status: string;
|
|
57
103
|
jobId?: string;
|
|
58
|
-
data?:
|
|
59
|
-
partial_data?:
|
|
104
|
+
data?: FirecrawlDocument[];
|
|
105
|
+
partial_data?: FirecrawlDocument[];
|
|
60
106
|
error?: string;
|
|
61
107
|
}
|
|
62
|
-
|
|
108
|
+
/**
|
|
109
|
+
* Generic parameter interface.
|
|
110
|
+
*/
|
|
111
|
+
export interface Params {
|
|
112
|
+
[key: string]: any;
|
|
113
|
+
extractorOptions?: {
|
|
114
|
+
extractionSchema: z.ZodSchema | any;
|
|
115
|
+
mode?: "llm-extraction";
|
|
116
|
+
extractionPrompt?: string;
|
|
117
|
+
};
|
|
118
|
+
}
|
|
63
119
|
/**
|
|
64
120
|
* Main class for interacting with the Firecrawl API.
|
|
65
121
|
*/
|
|
66
122
|
export default class FirecrawlApp {
|
|
67
123
|
private apiKey: string;
|
|
68
|
-
private apiUrl: string
|
|
124
|
+
private apiUrl: string;
|
|
69
125
|
|
|
70
126
|
/**
|
|
71
127
|
* Initializes a new instance of the FirecrawlApp class.
|
|
72
128
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
|
73
129
|
*/
|
|
74
|
-
constructor({ apiKey = null }: FirecrawlAppConfig) {
|
|
130
|
+
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
|
75
131
|
this.apiKey = apiKey || "";
|
|
132
|
+
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
76
133
|
if (!this.apiKey) {
|
|
77
134
|
throw new Error("No API key provided");
|
|
78
135
|
}
|
|
@@ -112,7 +169,7 @@ export default class FirecrawlApp {
|
|
|
112
169
|
const response: AxiosResponse = await axios.post(
|
|
113
170
|
this.apiUrl + "/v0/scrape",
|
|
114
171
|
jsonData,
|
|
115
|
-
{ headers }
|
|
172
|
+
{ headers }
|
|
116
173
|
);
|
|
117
174
|
if (response.status === 200) {
|
|
118
175
|
const responseData = response.data;
|
|
@@ -231,7 +288,9 @@ export default class FirecrawlApp {
|
|
|
231
288
|
success: true,
|
|
232
289
|
status: response.data.status,
|
|
233
290
|
data: response.data.data,
|
|
234
|
-
partial_data: !response.data.data
|
|
291
|
+
partial_data: !response.data.data
|
|
292
|
+
? response.data.partial_data
|
|
293
|
+
: undefined,
|
|
235
294
|
};
|
|
236
295
|
} else {
|
|
237
296
|
this.handleError(response, "check crawl status");
|
|
@@ -252,10 +311,10 @@ export default class FirecrawlApp {
|
|
|
252
311
|
*/
|
|
253
312
|
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
|
254
313
|
return {
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
...(idempotencyKey ? {
|
|
258
|
-
} as AxiosRequestHeaders & {
|
|
314
|
+
"Content-Type": "application/json",
|
|
315
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
316
|
+
...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
|
|
317
|
+
} as AxiosRequestHeaders & { "x-idempotency-key"?: string };
|
|
259
318
|
}
|
|
260
319
|
|
|
261
320
|
/**
|
|
@@ -317,7 +376,9 @@ export default class FirecrawlApp {
|
|
|
317
376
|
if (checkInterval < 2) {
|
|
318
377
|
checkInterval = 2;
|
|
319
378
|
}
|
|
320
|
-
await new Promise((resolve) =>
|
|
379
|
+
await new Promise((resolve) =>
|
|
380
|
+
setTimeout(resolve, checkInterval * 1000)
|
|
381
|
+
); // Wait for the specified timeout before checking again
|
|
321
382
|
} else {
|
|
322
383
|
throw new Error(
|
|
323
384
|
`Crawl job failed or was stopped. Status: ${statusData.status}`
|
package/tsconfig.json
CHANGED
|
@@ -11,7 +11,6 @@
|
|
|
11
11
|
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
|
12
12
|
|
|
13
13
|
/* Language and Environment */
|
|
14
|
-
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
|
15
14
|
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
|
16
15
|
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
|
17
16
|
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
|
@@ -25,9 +24,16 @@
|
|
|
25
24
|
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
|
26
25
|
|
|
27
26
|
/* Modules */
|
|
28
|
-
"module": "NodeNext", /* Specify what module code is generated. */
|
|
29
27
|
"rootDir": "./src", /* Specify the root folder within your source files. */
|
|
30
|
-
|
|
28
|
+
|
|
29
|
+
"target": "ES2021",
|
|
30
|
+
"lib": [
|
|
31
|
+
"ES2021",
|
|
32
|
+
"ES2022.Object",
|
|
33
|
+
"DOM"
|
|
34
|
+
],
|
|
35
|
+
"module": "NodeNext",
|
|
36
|
+
"moduleResolution": "nodenext",/* Specify how TypeScript looks up a file from a given module specifier. */
|
|
31
37
|
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
|
32
38
|
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
|
33
39
|
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
|
package/types/index.d.ts
CHANGED
|
@@ -8,22 +8,68 @@ export interface FirecrawlAppConfig {
|
|
|
8
8
|
apiUrl?: string | null;
|
|
9
9
|
}
|
|
10
10
|
/**
|
|
11
|
-
*
|
|
11
|
+
* Metadata for a Firecrawl document.
|
|
12
12
|
*/
|
|
13
|
-
export interface
|
|
13
|
+
export interface FirecrawlDocumentMetadata {
|
|
14
|
+
title?: string;
|
|
15
|
+
description?: string;
|
|
16
|
+
language?: string;
|
|
17
|
+
keywords?: string;
|
|
18
|
+
robots?: string;
|
|
19
|
+
ogTitle?: string;
|
|
20
|
+
ogDescription?: string;
|
|
21
|
+
ogUrl?: string;
|
|
22
|
+
ogImage?: string;
|
|
23
|
+
ogAudio?: string;
|
|
24
|
+
ogDeterminer?: string;
|
|
25
|
+
ogLocale?: string;
|
|
26
|
+
ogLocaleAlternate?: string[];
|
|
27
|
+
ogSiteName?: string;
|
|
28
|
+
ogVideo?: string;
|
|
29
|
+
dctermsCreated?: string;
|
|
30
|
+
dcDateCreated?: string;
|
|
31
|
+
dcDate?: string;
|
|
32
|
+
dctermsType?: string;
|
|
33
|
+
dcType?: string;
|
|
34
|
+
dctermsAudience?: string;
|
|
35
|
+
dctermsSubject?: string;
|
|
36
|
+
dcSubject?: string;
|
|
37
|
+
dcDescription?: string;
|
|
38
|
+
dctermsKeywords?: string;
|
|
39
|
+
modifiedTime?: string;
|
|
40
|
+
publishedTime?: string;
|
|
41
|
+
articleTag?: string;
|
|
42
|
+
articleSection?: string;
|
|
43
|
+
sourceURL?: string;
|
|
44
|
+
pageStatusCode?: number;
|
|
45
|
+
pageError?: string;
|
|
14
46
|
[key: string]: any;
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Document interface for Firecrawl.
|
|
50
|
+
*/
|
|
51
|
+
export interface FirecrawlDocument {
|
|
52
|
+
id?: string;
|
|
53
|
+
url?: string;
|
|
54
|
+
content: string;
|
|
55
|
+
markdown?: string;
|
|
56
|
+
html?: string;
|
|
57
|
+
llm_extraction?: Record<string, any>;
|
|
58
|
+
createdAt?: Date;
|
|
59
|
+
updatedAt?: Date;
|
|
60
|
+
type?: string;
|
|
61
|
+
metadata: FirecrawlDocumentMetadata;
|
|
62
|
+
childrenLinks?: string[];
|
|
63
|
+
provider?: string;
|
|
64
|
+
warning?: string;
|
|
65
|
+
index?: number;
|
|
20
66
|
}
|
|
21
67
|
/**
|
|
22
68
|
* Response interface for scraping operations.
|
|
23
69
|
*/
|
|
24
70
|
export interface ScrapeResponse {
|
|
25
71
|
success: boolean;
|
|
26
|
-
data?:
|
|
72
|
+
data?: FirecrawlDocument;
|
|
27
73
|
error?: string;
|
|
28
74
|
}
|
|
29
75
|
/**
|
|
@@ -31,7 +77,7 @@ export interface ScrapeResponse {
|
|
|
31
77
|
*/
|
|
32
78
|
export interface SearchResponse {
|
|
33
79
|
success: boolean;
|
|
34
|
-
data?:
|
|
80
|
+
data?: FirecrawlDocument[];
|
|
35
81
|
error?: string;
|
|
36
82
|
}
|
|
37
83
|
/**
|
|
@@ -40,7 +86,7 @@ export interface SearchResponse {
|
|
|
40
86
|
export interface CrawlResponse {
|
|
41
87
|
success: boolean;
|
|
42
88
|
jobId?: string;
|
|
43
|
-
data?:
|
|
89
|
+
data?: FirecrawlDocument[];
|
|
44
90
|
error?: string;
|
|
45
91
|
}
|
|
46
92
|
/**
|
|
@@ -50,10 +96,21 @@ export interface JobStatusResponse {
|
|
|
50
96
|
success: boolean;
|
|
51
97
|
status: string;
|
|
52
98
|
jobId?: string;
|
|
53
|
-
data?:
|
|
54
|
-
partial_data?:
|
|
99
|
+
data?: FirecrawlDocument[];
|
|
100
|
+
partial_data?: FirecrawlDocument[];
|
|
55
101
|
error?: string;
|
|
56
102
|
}
|
|
103
|
+
/**
|
|
104
|
+
* Generic parameter interface.
|
|
105
|
+
*/
|
|
106
|
+
export interface Params {
|
|
107
|
+
[key: string]: any;
|
|
108
|
+
extractorOptions?: {
|
|
109
|
+
extractionSchema: z.ZodSchema | any;
|
|
110
|
+
mode?: "llm-extraction";
|
|
111
|
+
extractionPrompt?: string;
|
|
112
|
+
};
|
|
113
|
+
}
|
|
57
114
|
/**
|
|
58
115
|
* Main class for interacting with the Firecrawl API.
|
|
59
116
|
*/
|
|
@@ -64,7 +121,7 @@ export default class FirecrawlApp {
|
|
|
64
121
|
* Initializes a new instance of the FirecrawlApp class.
|
|
65
122
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
|
66
123
|
*/
|
|
67
|
-
constructor({ apiKey }: FirecrawlAppConfig);
|
|
124
|
+
constructor({ apiKey, apiUrl }: FirecrawlAppConfig);
|
|
68
125
|
/**
|
|
69
126
|
* Scrapes a URL using the Firecrawl API.
|
|
70
127
|
* @param {string} url - The URL to scrape.
|