@mendable/firecrawl-js 1.0.4 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -9
- package/build/cjs/index.js +131 -124
- package/build/esm/index.js +129 -124
- package/package.json +3 -1
- package/src/__tests__/e2e_withAuth/index.test.ts +0 -1
- package/src/index.ts +207 -298
- package/types/index.d.ts +48 -163
package/README.md
CHANGED
|
@@ -37,11 +37,9 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
|
|
|
37
37
|
scrapeOptions: {
|
|
38
38
|
formats: ['markdown', 'html'],
|
|
39
39
|
}
|
|
40
|
-
}
|
|
40
|
+
})
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
console.log(crawlResponse)
|
|
44
|
-
}
|
|
42
|
+
console.log(crawlResponse)
|
|
45
43
|
```
|
|
46
44
|
|
|
47
45
|
### Scraping a URL
|
|
@@ -63,16 +61,21 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
|
|
|
63
61
|
scrapeOptions: {
|
|
64
62
|
formats: ['markdown', 'html'],
|
|
65
63
|
}
|
|
66
|
-
}
|
|
64
|
+
})
|
|
65
|
+
```
|
|
67
66
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
67
|
+
|
|
68
|
+
### Asynchronous Crawl
|
|
69
|
+
|
|
70
|
+
To start an asynchronous crawl of a website, use the `asyncCrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument lets you define settings for the asynchronous crawl, such as the maximum number of pages to crawl, permitted domains, and the output format. On successful initiation, the method returns an ID, which you need in order to check the status of the crawl later.
|
|
71
|
+
|
|
72
|
+
```js
|
|
73
|
+
const asyncCrawlResult = await app.asyncCrawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
|
|
71
74
|
```
|
|
72
75
|
|
|
73
76
|
### Checking Crawl Status
|
|
74
77
|
|
|
75
|
-
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job
|
|
78
|
+
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
|
76
79
|
|
|
77
80
|
```js
|
|
78
81
|
const status = await app.checkCrawlStatus(id);
|
|
@@ -121,6 +124,27 @@ const mapResult = await app.mapUrl('https://example.com') as MapResponse;
|
|
|
121
124
|
console.log(mapResult)
|
|
122
125
|
```
|
|
123
126
|
|
|
127
|
+
### Crawl a website with WebSockets
|
|
128
|
+
|
|
129
|
+
To crawl a website with WebSockets, use the `crawlUrlAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
|
130
|
+
|
|
131
|
+
```js
|
|
132
|
+
// Crawl a website with WebSockets:
|
|
133
|
+
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
|
|
134
|
+
|
|
135
|
+
watch.addEventListener("document", doc => {
|
|
136
|
+
console.log("DOC", doc.detail);
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
watch.addEventListener("error", err => {
|
|
140
|
+
console.error("ERR", err.detail.error);
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
watch.addEventListener("done", state => {
|
|
144
|
+
console.log("DONE", state.detail.status);
|
|
145
|
+
});
|
|
146
|
+
```
|
|
147
|
+
|
|
124
148
|
## Error Handling
|
|
125
149
|
|
|
126
150
|
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
|
package/build/cjs/index.js
CHANGED
|
@@ -3,9 +3,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.CrawlWatcher = void 0;
|
|
6
7
|
const axios_1 = __importDefault(require("axios"));
|
|
7
8
|
const zod_1 = require("zod");
|
|
8
9
|
const zod_to_json_schema_1 = require("zod-to-json-schema");
|
|
10
|
+
const isows_1 = require("isows");
|
|
11
|
+
const typescript_event_target_1 = require("typescript-event-target");
|
|
9
12
|
/**
|
|
10
13
|
* Main class for interacting with the Firecrawl API.
|
|
11
14
|
* Provides methods for scraping, searching, crawling, and mapping web content.
|
|
@@ -15,13 +18,9 @@ class FirecrawlApp {
|
|
|
15
18
|
* Initializes a new instance of the FirecrawlApp class.
|
|
16
19
|
* @param config - Configuration options for the FirecrawlApp instance.
|
|
17
20
|
*/
|
|
18
|
-
constructor({ apiKey = null, apiUrl = null
|
|
21
|
+
constructor({ apiKey = null, apiUrl = null }) {
|
|
19
22
|
this.apiKey = apiKey || "";
|
|
20
23
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
|
21
|
-
this.version = version;
|
|
22
|
-
if (!this.apiKey) {
|
|
23
|
-
throw new Error("No API key provided");
|
|
24
|
-
}
|
|
25
24
|
}
|
|
26
25
|
/**
|
|
27
26
|
* Scrapes a URL using the Firecrawl API.
|
|
@@ -51,16 +50,16 @@ class FirecrawlApp {
|
|
|
51
50
|
};
|
|
52
51
|
}
|
|
53
52
|
try {
|
|
54
|
-
const response = await axios_1.default.post(this.apiUrl +
|
|
53
|
+
const response = await axios_1.default.post(this.apiUrl + `/v1/scrape`, jsonData, { headers });
|
|
55
54
|
if (response.status === 200) {
|
|
56
55
|
const responseData = response.data;
|
|
57
56
|
if (responseData.success) {
|
|
58
|
-
return
|
|
57
|
+
return {
|
|
59
58
|
success: true,
|
|
60
59
|
warning: responseData.warning,
|
|
61
60
|
error: responseData.error,
|
|
62
61
|
...responseData.data
|
|
63
|
-
}
|
|
62
|
+
};
|
|
64
63
|
}
|
|
65
64
|
else {
|
|
66
65
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
|
@@ -76,80 +75,52 @@ class FirecrawlApp {
|
|
|
76
75
|
return { success: false, error: "Internal server error." };
|
|
77
76
|
}
|
|
78
77
|
/**
|
|
79
|
-
*
|
|
80
|
-
* @param query - The query
|
|
81
|
-
* @param params - Additional parameters for the search
|
|
82
|
-
* @returns
|
|
78
|
+
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
|
|
79
|
+
* @param query - The search query string.
|
|
80
|
+
* @param params - Additional parameters for the search.
|
|
81
|
+
* @throws Always throws an Error advising callers to use version 0 of the API.
|
|
83
82
|
*/
|
|
84
83
|
async search(query, params) {
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
84
|
+
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
88
|
+
* @param url - The URL to crawl.
|
|
89
|
+
* @param params - Additional parameters for the crawl request.
|
|
90
|
+
* @param pollInterval - Time in seconds for job status checks.
|
|
91
|
+
* @param idempotencyKey - Optional idempotency key for the request.
|
|
92
|
+
* @returns The response from the crawl operation.
|
|
93
|
+
*/
|
|
94
|
+
async crawlUrl(url, params, pollInterval = 2, idempotencyKey) {
|
|
95
|
+
const headers = this.prepareHeaders(idempotencyKey);
|
|
96
|
+
let jsonData = { url, ...params };
|
|
96
97
|
try {
|
|
97
|
-
const response = await
|
|
98
|
+
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
|
|
98
99
|
if (response.status === 200) {
|
|
99
|
-
const
|
|
100
|
-
|
|
101
|
-
return responseData;
|
|
102
|
-
}
|
|
103
|
-
else {
|
|
104
|
-
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
|
105
|
-
}
|
|
100
|
+
const id = response.data.id;
|
|
101
|
+
return this.monitorJobStatus(id, headers, pollInterval);
|
|
106
102
|
}
|
|
107
103
|
else {
|
|
108
|
-
this.handleError(response, "
|
|
104
|
+
this.handleError(response, "start crawl job");
|
|
109
105
|
}
|
|
110
106
|
}
|
|
111
107
|
catch (error) {
|
|
112
|
-
|
|
108
|
+
if (error.response?.data?.error) {
|
|
109
|
+
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
throw new Error(error.message);
|
|
113
|
+
}
|
|
113
114
|
}
|
|
114
115
|
return { success: false, error: "Internal server error." };
|
|
115
116
|
}
|
|
116
|
-
|
|
117
|
-
* Initiates a crawl job for a URL using the Firecrawl API.
|
|
118
|
-
* @param url - The URL to crawl.
|
|
119
|
-
* @param params - Additional parameters for the crawl request.
|
|
120
|
-
* @param waitUntilDone - Whether to wait for the crawl job to complete.
|
|
121
|
-
* @param pollInterval - Time in seconds for job status checks.
|
|
122
|
-
* @param idempotencyKey - Optional idempotency key for the request.
|
|
123
|
-
* @returns The response from the crawl operation.
|
|
124
|
-
*/
|
|
125
|
-
async crawlUrl(url, params, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
|
|
117
|
+
async asyncCrawlUrl(url, params, idempotencyKey) {
|
|
126
118
|
const headers = this.prepareHeaders(idempotencyKey);
|
|
127
119
|
let jsonData = { url, ...params };
|
|
128
120
|
try {
|
|
129
|
-
const response = await this.postRequest(this.apiUrl +
|
|
121
|
+
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
|
|
130
122
|
if (response.status === 200) {
|
|
131
|
-
|
|
132
|
-
let checkUrl = undefined;
|
|
133
|
-
if (waitUntilDone) {
|
|
134
|
-
if (this.version === 'v1') {
|
|
135
|
-
checkUrl = response.data.url;
|
|
136
|
-
}
|
|
137
|
-
return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
|
|
138
|
-
}
|
|
139
|
-
else {
|
|
140
|
-
if (this.version === 'v0') {
|
|
141
|
-
return {
|
|
142
|
-
success: true,
|
|
143
|
-
jobId: id
|
|
144
|
-
};
|
|
145
|
-
}
|
|
146
|
-
else {
|
|
147
|
-
return {
|
|
148
|
-
success: true,
|
|
149
|
-
id: id
|
|
150
|
-
};
|
|
151
|
-
}
|
|
152
|
-
}
|
|
123
|
+
return response.data;
|
|
153
124
|
}
|
|
154
125
|
else {
|
|
155
126
|
this.handleError(response, "start crawl job");
|
|
@@ -176,37 +147,19 @@ class FirecrawlApp {
|
|
|
176
147
|
}
|
|
177
148
|
const headers = this.prepareHeaders();
|
|
178
149
|
try {
|
|
179
|
-
const response = await this.getRequest(this.
|
|
180
|
-
`${this.apiUrl}/${this.version}/crawl/${id}` :
|
|
181
|
-
`${this.apiUrl}/${this.version}/crawl/status/${id}`, headers);
|
|
150
|
+
const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
|
|
182
151
|
if (response.status === 200) {
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
: undefined,
|
|
195
|
-
};
|
|
196
|
-
}
|
|
197
|
-
else {
|
|
198
|
-
return {
|
|
199
|
-
success: true,
|
|
200
|
-
status: response.data.status,
|
|
201
|
-
total: response.data.total,
|
|
202
|
-
completed: response.data.completed,
|
|
203
|
-
creditsUsed: response.data.creditsUsed,
|
|
204
|
-
expiresAt: new Date(response.data.expiresAt),
|
|
205
|
-
next: response.data.next,
|
|
206
|
-
data: response.data.data,
|
|
207
|
-
error: response.data.error
|
|
208
|
-
};
|
|
209
|
-
}
|
|
152
|
+
return ({
|
|
153
|
+
success: true,
|
|
154
|
+
status: response.data.status,
|
|
155
|
+
total: response.data.total,
|
|
156
|
+
completed: response.data.completed,
|
|
157
|
+
creditsUsed: response.data.creditsUsed,
|
|
158
|
+
expiresAt: new Date(response.data.expiresAt),
|
|
159
|
+
next: response.data.next,
|
|
160
|
+
data: response.data.data,
|
|
161
|
+
error: response.data.error
|
|
162
|
+
});
|
|
210
163
|
}
|
|
211
164
|
else {
|
|
212
165
|
this.handleError(response, "check crawl status");
|
|
@@ -215,29 +168,21 @@ class FirecrawlApp {
|
|
|
215
168
|
catch (error) {
|
|
216
169
|
throw new Error(error.message);
|
|
217
170
|
}
|
|
218
|
-
return
|
|
219
|
-
{
|
|
220
|
-
success: false,
|
|
221
|
-
status: "unknown",
|
|
222
|
-
current: 0,
|
|
223
|
-
current_url: "",
|
|
224
|
-
current_step: "",
|
|
225
|
-
total: 0,
|
|
226
|
-
error: "Internal server error.",
|
|
227
|
-
} :
|
|
228
|
-
{
|
|
229
|
-
success: false,
|
|
230
|
-
error: "Internal server error.",
|
|
231
|
-
};
|
|
171
|
+
return { success: false, error: "Internal server error." };
|
|
232
172
|
}
|
|
233
|
-
async
|
|
234
|
-
|
|
235
|
-
|
|
173
|
+
async crawlUrlAndWatch(url, params, idempotencyKey) {
|
|
174
|
+
const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
|
|
175
|
+
if (crawl.success && crawl.id) {
|
|
176
|
+
const id = crawl.id;
|
|
177
|
+
return new CrawlWatcher(id, this);
|
|
236
178
|
}
|
|
179
|
+
throw new Error("Crawl job failed to start");
|
|
180
|
+
}
|
|
181
|
+
async mapUrl(url, params) {
|
|
237
182
|
const headers = this.prepareHeaders();
|
|
238
183
|
let jsonData = { url, ...params };
|
|
239
184
|
try {
|
|
240
|
-
const response = await this.postRequest(this.apiUrl +
|
|
185
|
+
const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers);
|
|
241
186
|
if (response.status === 200) {
|
|
242
187
|
return response.data;
|
|
243
188
|
}
|
|
@@ -289,21 +234,14 @@ class FirecrawlApp {
|
|
|
289
234
|
* @param checkUrl - Optional URL to check the status (used for v1 API)
|
|
290
235
|
* @returns The final job status or data.
|
|
291
236
|
*/
|
|
292
|
-
async monitorJobStatus(id, headers, checkInterval
|
|
293
|
-
let apiUrl = '';
|
|
237
|
+
async monitorJobStatus(id, headers, checkInterval) {
|
|
294
238
|
while (true) {
|
|
295
|
-
|
|
296
|
-
apiUrl = checkUrl ?? `${this.apiUrl}/v1/crawl/${id}`;
|
|
297
|
-
}
|
|
298
|
-
else if (this.version === 'v0') {
|
|
299
|
-
apiUrl = `${this.apiUrl}/v0/crawl/status/${id}`;
|
|
300
|
-
}
|
|
301
|
-
const statusResponse = await this.getRequest(apiUrl, headers);
|
|
239
|
+
const statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
|
|
302
240
|
if (statusResponse.status === 200) {
|
|
303
241
|
const statusData = statusResponse.data;
|
|
304
242
|
if (statusData.status === "completed") {
|
|
305
243
|
if ("data" in statusData) {
|
|
306
|
-
return
|
|
244
|
+
return statusData;
|
|
307
245
|
}
|
|
308
246
|
else {
|
|
309
247
|
throw new Error("Crawl job completed but no data was returned");
|
|
@@ -338,3 +276,72 @@ class FirecrawlApp {
|
|
|
338
276
|
}
|
|
339
277
|
}
|
|
340
278
|
exports.default = FirecrawlApp;
|
|
279
|
+
class CrawlWatcher extends typescript_event_target_1.TypedEventTarget {
|
|
280
|
+
constructor(id, app) {
|
|
281
|
+
super();
|
|
282
|
+
this.ws = new isows_1.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
|
283
|
+
this.status = "scraping";
|
|
284
|
+
this.data = [];
|
|
285
|
+
const messageHandler = (msg) => {
|
|
286
|
+
if (msg.type === "done") {
|
|
287
|
+
this.status = "completed";
|
|
288
|
+
this.dispatchTypedEvent("done", new CustomEvent("done", {
|
|
289
|
+
detail: {
|
|
290
|
+
status: this.status,
|
|
291
|
+
data: this.data,
|
|
292
|
+
},
|
|
293
|
+
}));
|
|
294
|
+
}
|
|
295
|
+
else if (msg.type === "error") {
|
|
296
|
+
this.status = "failed";
|
|
297
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
298
|
+
detail: {
|
|
299
|
+
status: this.status,
|
|
300
|
+
data: this.data,
|
|
301
|
+
error: msg.error,
|
|
302
|
+
},
|
|
303
|
+
}));
|
|
304
|
+
}
|
|
305
|
+
else if (msg.type === "catchup") {
|
|
306
|
+
this.status = msg.data.status;
|
|
307
|
+
this.data.push(...(msg.data.data ?? []));
|
|
308
|
+
for (const doc of this.data) {
|
|
309
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
310
|
+
detail: doc,
|
|
311
|
+
}));
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
else if (msg.type === "document") {
|
|
315
|
+
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
|
316
|
+
detail: msg.data,
|
|
317
|
+
}));
|
|
318
|
+
}
|
|
319
|
+
};
|
|
320
|
+
this.ws.onmessage = ((ev) => {
|
|
321
|
+
if (typeof ev.data !== "string") {
|
|
322
|
+
this.ws.close();
|
|
323
|
+
return;
|
|
324
|
+
}
|
|
325
|
+
const msg = JSON.parse(ev.data);
|
|
326
|
+
messageHandler(msg);
|
|
327
|
+
}).bind(this);
|
|
328
|
+
this.ws.onclose = ((ev) => {
|
|
329
|
+
const msg = JSON.parse(ev.reason);
|
|
330
|
+
messageHandler(msg);
|
|
331
|
+
}).bind(this);
|
|
332
|
+
this.ws.onerror = ((_) => {
|
|
333
|
+
this.status = "failed";
|
|
334
|
+
this.dispatchTypedEvent("error", new CustomEvent("error", {
|
|
335
|
+
detail: {
|
|
336
|
+
status: this.status,
|
|
337
|
+
data: this.data,
|
|
338
|
+
error: "WebSocket error",
|
|
339
|
+
},
|
|
340
|
+
}));
|
|
341
|
+
}).bind(this);
|
|
342
|
+
}
|
|
343
|
+
close() {
|
|
344
|
+
this.ws.close();
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
exports.CrawlWatcher = CrawlWatcher;
|