@mendable/firecrawl 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example ADDED
@@ -0,0 +1,3 @@
+ API_URL=http://localhost:3002
+ TEST_API_KEY=fc-YOUR_API_KEY
+
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Sideguide Technologies Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,160 @@
+ # Firecrawl Node SDK
+
+ The Firecrawl Node SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
+
+ ## Installation
+
+ To install the Firecrawl Node SDK, you can use npm:
+
+ ```bash
+ npm install @mendable/firecrawl-js
+ ```
+
+ ## Usage
+
+ 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
+ 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
+
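+ For example, a minimal sketch of passing the key from the environment through to the client (note that the constructor in this build's compiled output falls back to an empty string rather than reading `FIRECRAWL_API_KEY` itself, so passing it explicitly is the safe route):
+
+ ```js
+ import FirecrawlApp from '@mendable/firecrawl-js';
+
+ // Pass the environment variable from step 2 through to the client.
+ const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
+ ```
+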
+ Here's an example of how to use the SDK:
+
+ ```js
+ import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
+
+ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+ // Scrape a website
+ const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
+   formats: ['markdown', 'html'],
+ });
+
+ if (scrapeResponse) {
+   console.log(scrapeResponse)
+ }
+
+ // Crawl a website
+ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+   limit: 100,
+   scrapeOptions: {
+     formats: ['markdown', 'html'],
+   }
+ })
+
+ console.log(crawlResponse)
+ ```
+
+ ### Scraping a URL
+
+ To scrape a single URL, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as an object.
+
+ ```js
+ const url = "https://example.com";
+ const scrapedData = await app.scrapeUrl(url);
+ ```
+
+ ### Crawling a Website
+
+ To crawl a website, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```js
+ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+   limit: 100,
+   scrapeOptions: {
+     formats: ['markdown', 'html'],
+   }
+ })
+ ```
+
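+ `crawlUrl` waits for the crawl to finish by polling the job status. Based on the method signature in this build (`crawlUrl(url, params, pollInterval = 2, idempotencyKey)`), here is a sketch of tuning the poll interval and supplying an idempotency key; the key value below is a hypothetical placeholder:
+
+ ```js
+ // Poll every 5 seconds; reusing the same idempotency key avoids
+ // accidentally starting the same crawl twice.
+ const result = await app.crawlUrl(
+   'https://firecrawl.dev',
+   { limit: 100 },
+   5,
+   'my-unique-crawl-key-123' // hypothetical key; any unique string works
+ );
+ ```
+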
+ ### Asynchronous Crawl
+
+ To start a crawl without waiting for it to finish, use the `asyncCrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument lets you configure the crawl, such as the maximum number of pages to crawl, permitted domains, and the output format. On success, it returns an ID, which you can use to check the status of the crawl later.
+
+ ```js
+ const asyncCrawlResult = await app.asyncCrawlUrl('mendable.ai', { excludePaths: ['blog/*'], limit: 5 });
+ ```
+
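+ A minimal sketch of pairing the returned ID with `checkCrawlStatus` (covered in the next section):
+
+ ```js
+ const crawl = await app.asyncCrawlUrl('mendable.ai', { limit: 5 });
+
+ if (crawl.success && crawl.id) {
+   // Poll once; call again later (or in a loop) until status is "completed".
+   const status = await app.checkCrawlStatus(crawl.id);
+   console.log(status.status);
+ }
+ ```
+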
+ ### Checking Crawl Status
+
+ To check the status of a crawl job, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
+
+ ```js
+ const status = await app.checkCrawlStatus(id);
+ ```
+
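+ In this build, the resolved status object includes `status`, `total`, `completed`, `creditsUsed`, `expiresAt`, `next`, `data`, and `error`. A small sketch of acting on it:
+
+ ```js
+ const status = await app.checkCrawlStatus(id);
+
+ if (status.status === "completed") {
+   console.log(`Done: ${status.completed}/${status.total} pages, ${status.creditsUsed} credits`);
+ } else {
+   console.log(`Still ${status.status}...`);
+ }
+ ```
+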
+ ### Extracting structured data from a URL
+
+ With LLM extraction, you can easily extract structured data from any URL. We support Zod schemas to make this easier for you, too. Here is how to use it:
+
+ ```js
+ import FirecrawlApp from "@mendable/firecrawl-js";
+ import { z } from "zod";
+
+ const app = new FirecrawlApp({
+   apiKey: "fc-YOUR_API_KEY",
+ });
+
+ // Define schema to extract contents into
+ const schema = z.object({
+   top: z
+     .array(
+       z.object({
+         title: z.string(),
+         points: z.number(),
+         by: z.string(),
+         commentsURL: z.string(),
+       })
+     )
+     .length(5)
+     .describe("Top 5 stories on Hacker News"),
+ });
+
+ const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
+   extractorOptions: { extractionSchema: schema },
+ });
+
+ console.log(scrapeResult.data["llm_extraction"]);
+ ```
+
+ ### Map a Website
+
+ Use `mapUrl` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+ ```js
+ const mapResult = await app.mapUrl('https://example.com') as MapResponse;
+ console.log(mapResult)
+ ```
+
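+ A hedged sketch of passing mapping options. The option names `search`, `includeSubdomains`, and `ignoreSitemap` are assumptions about this build's map parameters, not confirmed by this diff; check the type definitions shipped with the package:
+
+ ```js
+ const mapResult = await app.mapUrl('https://example.com', {
+   search: 'docs',           // assumed: filter results by a search term
+   includeSubdomains: false, // assumed: skip subdomains
+   ignoreSitemap: false,     // assumed: allow sitemap-based discovery
+ });
+ ```
+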
+ ### Crawl a website with WebSockets
+
+ To crawl a website with WebSockets, use the `crawlUrlAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```js
+ // Crawl a website with WebSockets:
+ const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5 });
+
+ watch.addEventListener("document", doc => {
+   console.log("DOC", doc.detail);
+ });
+
+ watch.addEventListener("error", err => {
+   console.error("ERR", err.detail.error);
+ });
+
+ watch.addEventListener("done", state => {
+   console.log("DONE", state.detail.status);
+ });
+ ```
+
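+ The returned watcher also exposes a `close()` method (visible in this build's `CrawlWatcher` class) that closes the underlying WebSocket if you want to stop listening early:
+
+ ```js
+ // Stop receiving events before the crawl finishes.
+ watch.close();
+ ```
+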
+ ## Error Handling
+
+ The SDK surfaces errors returned by the Firecrawl API by throwing exceptions. If an error occurs during a request, an exception is thrown with a descriptive error message. Wrap calls in `try/catch` blocks to handle these errors.
+
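+ A minimal sketch of catching a failed request:
+
+ ```js
+ try {
+   const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev');
+   console.log(scrapeResponse);
+ } catch (error) {
+   // Messages look like: "Failed to scrape URL. Status code: ... Error: ..."
+   console.error(error.message);
+ }
+ ```
+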
+ ## License
+
+ The Firecrawl Node SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
+
+ - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
@@ -0,0 +1,354 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.CrawlWatcher = void 0;
+ const axios_1 = __importDefault(require("axios"));
+ const zod_to_json_schema_1 = require("zod-to-json-schema");
+ const isows_1 = require("isows");
+ const typescript_event_target_1 = require("typescript-event-target");
+ /**
+  * Main class for interacting with the Firecrawl API.
+  * Provides methods for scraping, searching, crawling, and mapping web content.
+  */
+ class FirecrawlApp {
+     /**
+      * Initializes a new instance of the FirecrawlApp class.
+      * @param config - Configuration options for the FirecrawlApp instance.
+      */
+     constructor({ apiKey = null, apiUrl = null }) {
+         this.apiKey = apiKey || "";
+         this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+     }
+     /**
+      * Scrapes a URL using the Firecrawl API.
+      * @param url - The URL to scrape.
+      * @param params - Additional parameters for the scrape request.
+      * @returns The response from the scrape operation.
+      */
+     async scrapeUrl(url, params) {
+         const headers = {
+             "Content-Type": "application/json",
+             Authorization: `Bearer ${this.apiKey}`,
+         };
+         let jsonData = { url, ...params };
+         if (jsonData?.extract?.schema) {
+             let schema = jsonData.extract.schema;
+             // Try parsing the schema as a Zod schema
+             try {
+                 schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
+             }
+             catch (error) {
+             }
+             jsonData = {
+                 ...jsonData,
+                 extract: {
+                     ...jsonData.extract,
+                     schema: schema,
+                 },
+             };
+         }
+         try {
+             const response = await axios_1.default.post(this.apiUrl + `/v1/scrape`, jsonData, { headers });
+             if (response.status === 200) {
+                 const responseData = response.data;
+                 if (responseData.success) {
+                     return {
+                         success: true,
+                         warning: responseData.warning,
+                         error: responseData.error,
+                         ...responseData.data
+                     };
+                 }
+                 else {
+                     throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
+                 }
+             }
+             else {
+                 this.handleError(response, "scrape URL");
+             }
+         }
+         catch (error) {
+             throw new Error(error.message);
+         }
+         return { success: false, error: "Internal server error." };
+     }
+     /**
+      * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
+      * @param query - The search query string.
+      * @param params - Additional parameters for the search.
+      * @returns Throws an error advising to use version 0 of the API.
+      */
+     async search(query, params) {
+         throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
+     }
+     /**
+      * Initiates a crawl job for a URL using the Firecrawl API.
+      * @param url - The URL to crawl.
+      * @param params - Additional parameters for the crawl request.
+      * @param pollInterval - Time in seconds for job status checks.
+      * @param idempotencyKey - Optional idempotency key for the request.
+      * @returns The response from the crawl operation.
+      */
+     async crawlUrl(url, params, pollInterval = 2, idempotencyKey) {
+         const headers = this.prepareHeaders(idempotencyKey);
+         let jsonData = { url, ...params };
+         try {
+             const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
+             if (response.status === 200) {
+                 const id = response.data.id;
+                 return this.monitorJobStatus(id, headers, pollInterval);
+             }
+             else {
+                 this.handleError(response, "start crawl job");
+             }
+         }
+         catch (error) {
+             if (error.response?.data?.error) {
+                 throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
+             }
+             else {
+                 throw new Error(error.message);
+             }
+         }
+         return { success: false, error: "Internal server error." };
+     }
+     async asyncCrawlUrl(url, params, idempotencyKey) {
+         const headers = this.prepareHeaders(idempotencyKey);
+         let jsonData = { url, ...params };
+         try {
+             const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
+             if (response.status === 200) {
+                 return response.data;
+             }
+             else {
+                 this.handleError(response, "start crawl job");
+             }
+         }
+         catch (error) {
+             if (error.response?.data?.error) {
+                 throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
+             }
+             else {
+                 throw new Error(error.message);
+             }
+         }
+         return { success: false, error: "Internal server error." };
+     }
+     /**
+      * Checks the status of a crawl job using the Firecrawl API.
+      * @param id - The ID of the crawl operation.
+      * @returns The response containing the job status.
+      */
+     async checkCrawlStatus(id) {
+         if (!id) {
+             throw new Error("No crawl ID provided");
+         }
+         const headers = this.prepareHeaders();
+         try {
+             const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
+             if (response.status === 200) {
+                 return ({
+                     success: true,
+                     status: response.data.status,
+                     total: response.data.total,
+                     completed: response.data.completed,
+                     creditsUsed: response.data.creditsUsed,
+                     expiresAt: new Date(response.data.expiresAt),
+                     next: response.data.next,
+                     data: response.data.data,
+                     error: response.data.error
+                 });
+             }
+             else {
+                 this.handleError(response, "check crawl status");
+             }
+         }
+         catch (error) {
+             throw new Error(error.message);
+         }
+         return { success: false, error: "Internal server error." };
+     }
+     async crawlUrlAndWatch(url, params, idempotencyKey) {
+         const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
+         if (crawl.success && crawl.id) {
+             const id = crawl.id;
+             return new CrawlWatcher(id, this);
+         }
+         throw new Error("Crawl job failed to start");
+     }
+     async mapUrl(url, params) {
+         const headers = this.prepareHeaders();
+         let jsonData = { url, ...params };
+         try {
+             const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers);
+             if (response.status === 200) {
+                 return response.data;
+             }
+             else {
+                 this.handleError(response, "map");
+             }
+         }
+         catch (error) {
+             throw new Error(error.message);
+         }
+         return { success: false, error: "Internal server error." };
+     }
+     /**
+      * Prepares the headers for an API request.
+      * @param idempotencyKey - Optional key to ensure idempotency.
+      * @returns The prepared headers.
+      */
+     prepareHeaders(idempotencyKey) {
+         return {
+             "Content-Type": "application/json",
+             Authorization: `Bearer ${this.apiKey}`,
+             ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
+         };
+     }
+     /**
+      * Sends a POST request to the specified URL.
+      * @param url - The URL to send the request to.
+      * @param data - The data to send in the request.
+      * @param headers - The headers for the request.
+      * @returns The response from the POST request.
+      */
+     postRequest(url, data, headers) {
+         return axios_1.default.post(url, data, { headers });
+     }
+     /**
+      * Sends a GET request to the specified URL.
+      * @param url - The URL to send the request to.
+      * @param headers - The headers for the request.
+      * @returns The response from the GET request.
+      */
+     getRequest(url, headers) {
+         return axios_1.default.get(url, { headers });
+     }
+     /**
+      * Monitors the status of a crawl job until completion or failure.
+      * @param id - The ID of the crawl operation.
+      * @param headers - The headers for the request.
+      * @param checkInterval - Interval in seconds for job status checks.
235
+ * @returns The final job status or data.
236
+ */
237
+ async monitorJobStatus(id, headers, checkInterval) {
238
+ while (true) {
239
+ let statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
240
+ if (statusResponse.status === 200) {
241
+ let statusData = statusResponse.data;
242
+ if (statusData.status === "completed") {
243
+ if ("data" in statusData) {
244
+ let data = statusData.data;
245
+ while ('next' in statusData) {
246
+ statusResponse = await this.getRequest(statusData.next, headers);
247
+ statusData = statusResponse.data;
248
+ data = data.concat(statusData.data);
249
+ }
250
+ statusData.data = data;
251
+ return statusData;
252
+ }
253
+ else {
254
+ throw new Error("Crawl job completed but no data was returned");
255
+ }
256
+ }
257
+ else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
258
+ checkInterval = Math.max(checkInterval, 2);
259
+ await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000));
260
+ }
261
+ else {
262
+ throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
263
+ }
264
+ }
265
+ else {
266
+ this.handleError(statusResponse, "check crawl status");
267
+ }
268
+ }
269
+ }
270
+ /**
271
+ * Handles errors from API responses.
272
+ * @param {AxiosResponse} response - The response from the API.
273
+ * @param {string} action - The action being performed when the error occurred.
274
+ */
275
+ handleError(response, action) {
276
+ if ([402, 408, 409, 500].includes(response.status)) {
277
+ const errorMessage = response.data.error || "Unknown error occurred";
278
+ throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
279
+ }
280
+ else {
281
+ throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
282
+ }
283
+ }
284
+ }
285
+ exports.default = FirecrawlApp;
286
+ class CrawlWatcher extends typescript_event_target_1.TypedEventTarget {
287
+ constructor(id, app) {
288
+ super();
289
+ this.ws = new isows_1.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
290
+ this.status = "scraping";
291
+ this.data = [];
292
+ const messageHandler = (msg) => {
293
+ if (msg.type === "done") {
294
+ this.status = "completed";
295
+ this.dispatchTypedEvent("done", new CustomEvent("done", {
296
+ detail: {
297
+ status: this.status,
298
+ data: this.data,
299
+ },
300
+ }));
301
+ }
302
+ else if (msg.type === "error") {
303
+ this.status = "failed";
304
+ this.dispatchTypedEvent("error", new CustomEvent("error", {
305
+ detail: {
306
+ status: this.status,
307
+ data: this.data,
308
+ error: msg.error,
309
+ },
310
+ }));
311
+ }
312
+ else if (msg.type === "catchup") {
313
+ this.status = msg.data.status;
314
+ this.data.push(...(msg.data.data ?? []));
315
+ for (const doc of this.data) {
316
+ this.dispatchTypedEvent("document", new CustomEvent("document", {
317
+ detail: doc,
318
+ }));
319
+ }
320
+ }
321
+ else if (msg.type === "document") {
322
+ this.dispatchTypedEvent("document", new CustomEvent("document", {
323
+ detail: msg.data,
324
+ }));
325
+ }
326
+ };
327
+ this.ws.onmessage = ((ev) => {
328
+ if (typeof ev.data !== "string") {
329
+ this.ws.close();
330
+ return;
331
+ }
332
+ const msg = JSON.parse(ev.data);
333
+ messageHandler(msg);
334
+ }).bind(this);
335
+ this.ws.onclose = ((ev) => {
336
+ const msg = JSON.parse(ev.reason);
337
+ messageHandler(msg);
338
+ }).bind(this);
339
+ this.ws.onerror = ((_) => {
340
+ this.status = "failed";
341
+ this.dispatchTypedEvent("error", new CustomEvent("error", {
342
+ detail: {
343
+ status: this.status,
344
+ data: this.data,
345
+ error: "WebSocket error",
346
+ },
347
+ }));
348
+ }).bind(this);
349
+ }
350
+ close() {
351
+ this.ws.close();
352
+ }
353
+ }
354
+ exports.CrawlWatcher = CrawlWatcher;
@@ -0,0 +1 @@
+ {"type": "commonjs"}