firecrawl 0.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +3 -0
- package/README.md +195 -0
- package/build/index.js +257 -0
- package/build_and_publish.sh +34 -0
- package/jest.config.cjs +5 -0
- package/package.json +52 -0
- package/src/__tests__/e2e_withAuth/index.test.ts +156 -0
- package/src/__tests__/fixtures/scrape.json +22 -0
- package/src/__tests__/index.test.ts +48 -0
- package/src/index.ts +411 -0
- package/tsconfig.json +111 -0
- package/types/index.d.ts +189 -0
package/.env.example
ADDED
package/README.md
ADDED
@@ -0,0 +1,195 @@
# Firecrawl JavaScript SDK

The Firecrawl JavaScript SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl JavaScript SDK, you can use npm:

```bash
npm install @mendable/firecrawl-js
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

Here's an example of how to use the SDK with error handling:

```js
import FirecrawlApp from '@mendable/firecrawl-js';

async function main() {
  try {
    // Initialize the FirecrawlApp with your API key
    const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });

    // Scrape a single URL
    const url = 'https://mendable.ai';
    const scrapedData = await app.scrapeUrl(url);
    console.log(scrapedData);

    // Crawl a website
    const crawlUrl = 'https://mendable.ai';
    const params = {
      crawlerOptions: {
        excludes: ['blog/'],
        includes: [], // leave empty for all pages
        limit: 1000,
      },
      pageOptions: {
        onlyMainContent: true
      }
    };

    const crawlResult = await app.crawlUrl(crawlUrl, params);
    console.log(crawlResult);
  } catch (error) {
    console.error('An error occurred:', error.message);
  }
}

main();
```

### Scraping a URL

To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as an object.

```js
async function scrapeExample() {
  try {
    const url = 'https://example.com';
    const scrapedData = await app.scrapeUrl(url);
    console.log(scrapedData);
  } catch (error) {
    console.error('Error occurred while scraping:', error.message);
  }
}

scrapeExample();
```
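
The compiled client (`build/index.js`, shown later in this diff) gives `scrapeUrl` an optional second `params` argument that is merged into the request body. A minimal sketch; the `pageOptions` field is borrowed from the crawl examples in this README, and treating it as valid for scrape requests is an assumption:

```js
// Hedged sketch: scrapeUrl(url, params) merges params into the request body.
// pageOptions is borrowed from the crawl examples; whether the scrape
// endpoint honors it is an assumption, not something this README states.
const mainContentOnly = await app.scrapeUrl('https://example.com', {
  pageOptions: { onlyMainContent: true },
});
console.log(mainContentOnly);
```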

### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. Zod schemas are also supported to make this easier. Here is how to use it:

```js
import { z } from "zod";

const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: zodSchema },
});

console.log(llmExtractionResult.data.llm_extraction);
```

### Search for a query

Use `search` to search the web, get the most relevant results, scrape each page, and return the markdown.

```js
const query = 'what is mendable?';
const searchResult = await app.search(query);
```
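
The compiled client shows that `search` likewise accepts an optional `params` object that is merged into the request body. A hedged sketch; only the merge behavior is confirmed by `build/index.js`, and the specific `pageOptions` field is an assumption carried over from the crawl examples:

```js
// Hypothetical params for search: the merge into the request body is
// confirmed by build/index.js, but pageOptions being accepted by
// /v0/search is an assumption.
const searchResult = await app.search('what is mendable?', {
  pageOptions: { onlyMainContent: true },
});
console.log(searchResult);
```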

### Crawling a Website

To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

```js
async function crawlExample() {
  try {
    const crawlUrl = 'https://example.com';
    const params = {
      crawlerOptions: {
        excludes: ['blog/'],
        includes: [], // leave empty for all pages
        limit: 1000,
      },
      pageOptions: {
        onlyMainContent: true
      }
    };
    const waitUntilDone = true;
    const pollInterval = 5; // seconds between status checks while waiting
    const crawlResult = await app.crawlUrl(
      crawlUrl,
      params,
      waitUntilDone,
      pollInterval
    );

    console.log(crawlResult);
  } catch (error) {
    console.error('Error occurred while crawling:', error.message);
  }
}

crawlExample();
```

### Checking Crawl Status

To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```js
async function checkStatusExample(jobId) {
  try {
    const status = await app.checkCrawlStatus(jobId);
    console.log(status);
  } catch (error) {
    console.error('Error occurred while checking crawl status:', error.message);
  }
}

// Example usage, assuming you have a jobId
checkStatusExample('your_job_id_here');
```
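
If you start a crawl with `waitUntilDone` set to `false`, `crawlUrl` returns `{ success, jobId }` immediately (see `build/index.js` later in this diff), and you can poll `checkCrawlStatus` yourself. A minimal sketch of that workflow; the status strings mirror the ones the compiled `monitorJobStatus` treats as "still running":

```js
async function asyncCrawlExample() {
  try {
    // Start the crawl without waiting; the third argument is waitUntilDone.
    const { jobId } = await app.crawlUrl('https://example.com', null, false);

    // Poll until the job finishes.
    while (true) {
      const status = await app.checkCrawlStatus(jobId);
      if (status.status === 'completed') {
        console.log(status.data);
        break;
      }
      if (!['active', 'paused', 'pending', 'queued'].includes(status.status)) {
        throw new Error(`Crawl failed or was stopped. Status: ${status.status}`);
      }
      await new Promise((resolve) => setTimeout(resolve, 2000)); // wait 2s between checks
    }
  } catch (error) {
    console.error('Error during async crawl:', error.message);
  }
}

asyncCrawlExample();
```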

## Running Locally

To use the SDK against a locally running Firecrawl instance, point the app at your local API URL when you construct it:

```js
const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY", apiUrl: "http://localhost:3002" });
```

## Error Handling

The SDK surfaces errors returned by the Firecrawl API by throwing JavaScript `Error`s with descriptive messages. The examples above demonstrate how to handle these using `try/catch` blocks.

## Contributing

Contributions to the Firecrawl JavaScript SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).

package/build/index.js
ADDED
@@ -0,0 +1,257 @@
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
import axios from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
/**
 * Main class for interacting with the Firecrawl API.
 */
export default class FirecrawlApp {
    /**
     * Initializes a new instance of the FirecrawlApp class.
     * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
     */
    constructor({ apiKey = null, apiUrl = null }) {
        this.apiKey = apiKey || "";
        this.apiUrl = apiUrl || "https://api.firecrawl.dev";
        if (!this.apiKey) {
            throw new Error("No API key provided");
        }
    }
    /**
     * Scrapes a URL using the Firecrawl API.
     * @param {string} url - The URL to scrape.
     * @param {Params | null} params - Additional parameters for the scrape request.
     * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
     */
    scrapeUrl(url, params = null) {
        var _a;
        return __awaiter(this, void 0, void 0, function* () {
            const headers = {
                "Content-Type": "application/json",
                Authorization: `Bearer ${this.apiKey}`,
            };
            let jsonData = Object.assign({ url }, params);
            if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
                let schema = params.extractorOptions.extractionSchema;
                // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
                if (schema instanceof z.ZodSchema) {
                    schema = zodToJsonSchema(schema);
                }
                jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
            }
            try {
                const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers });
                if (response.status === 200) {
                    const responseData = response.data;
                    if (responseData.success) {
                        return responseData;
                    }
                    else {
                        throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
                    }
                }
                else {
                    this.handleError(response, "scrape URL");
                }
            }
            catch (error) {
                throw new Error(error.message);
            }
            return { success: false, error: "Internal server error." };
        });
    }
    /**
     * Searches for a query using the Firecrawl API.
     * @param {string} query - The query to search for.
     * @param {Params | null} params - Additional parameters for the search request.
     * @returns {Promise<SearchResponse>} The response from the search operation.
     */
    search(query, params = null) {
        return __awaiter(this, void 0, void 0, function* () {
            const headers = {
                "Content-Type": "application/json",
                Authorization: `Bearer ${this.apiKey}`,
            };
            let jsonData = { query };
            if (params) {
                jsonData = Object.assign(Object.assign({}, jsonData), params);
            }
            try {
                const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers });
                if (response.status === 200) {
                    const responseData = response.data;
                    if (responseData.success) {
                        return responseData;
                    }
                    else {
                        throw new Error(`Failed to search. Error: ${responseData.error}`);
                    }
                }
                else {
                    this.handleError(response, "search");
                }
            }
            catch (error) {
                throw new Error(error.message);
            }
            return { success: false, error: "Internal server error." };
        });
    }
    /**
     * Initiates a crawl job for a URL using the Firecrawl API.
     * @param {string} url - The URL to crawl.
     * @param {Params | null} params - Additional parameters for the crawl request.
     * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
     * @param {number} pollInterval - Time in seconds for job status checks.
     * @param {string} idempotencyKey - Optional idempotency key for the request.
     * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
     */
    crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
        return __awaiter(this, void 0, void 0, function* () {
            const headers = this.prepareHeaders(idempotencyKey);
            let jsonData = { url };
            if (params) {
                jsonData = Object.assign(Object.assign({}, jsonData), params);
            }
            try {
                const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers);
                if (response.status === 200) {
                    const jobId = response.data.jobId;
                    if (waitUntilDone) {
                        return this.monitorJobStatus(jobId, headers, pollInterval);
                    }
                    else {
                        return { success: true, jobId };
                    }
                }
                else {
                    this.handleError(response, "start crawl job");
                }
            }
            catch (error) {
                console.log(error);
                throw new Error(error.message);
            }
            return { success: false, error: "Internal server error." };
        });
    }
    /**
     * Checks the status of a crawl job using the Firecrawl API.
     * @param {string} jobId - The job ID of the crawl operation.
     * @returns {Promise<JobStatusResponse>} The response containing the job status.
     */
    checkCrawlStatus(jobId) {
        return __awaiter(this, void 0, void 0, function* () {
            const headers = this.prepareHeaders();
            try {
                const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
                if (response.status === 200) {
                    return {
                        success: true,
                        status: response.data.status,
                        data: response.data.data,
                        partial_data: !response.data.data
                            ? response.data.partial_data
                            : undefined,
                    };
                }
                else {
                    this.handleError(response, "check crawl status");
                }
            }
            catch (error) {
                throw new Error(error.message);
            }
            return {
                success: false,
                status: "unknown",
                error: "Internal server error.",
            };
        });
    }
    /**
     * Prepares the headers for an API request.
     * @returns {AxiosRequestHeaders} The prepared headers.
     */
    prepareHeaders(idempotencyKey) {
        return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}));
    }
    /**
     * Sends a POST request to the specified URL.
     * @param {string} url - The URL to send the request to.
     * @param {Params} data - The data to send in the request.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @returns {Promise<AxiosResponse>} The response from the POST request.
     */
    postRequest(url, data, headers) {
        return axios.post(url, data, { headers });
    }
    /**
     * Sends a GET request to the specified URL.
     * @param {string} url - The URL to send the request to.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @returns {Promise<AxiosResponse>} The response from the GET request.
     */
    getRequest(url, headers) {
        return axios.get(url, { headers });
    }
    /**
     * Monitors the status of a crawl job until completion or failure.
     * @param {string} jobId - The job ID of the crawl operation.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @param {number} checkInterval - Time in seconds between job status checks.
     * @returns {Promise<any>} The final job status or data.
     */
    monitorJobStatus(jobId, headers, checkInterval) {
        return __awaiter(this, void 0, void 0, function* () {
            while (true) {
                const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
                if (statusResponse.status === 200) {
                    const statusData = statusResponse.data;
                    if (statusData.status === "completed") {
                        if ("data" in statusData) {
                            return statusData.data;
                        }
                        else {
                            throw new Error("Crawl job completed but no data was returned");
                        }
                    }
                    else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
                        if (checkInterval < 2) {
                            checkInterval = 2;
                        }
                        yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified interval before checking again
                    }
                    else {
                        throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
                    }
                }
                else {
                    this.handleError(statusResponse, "check crawl status");
                }
            }
        });
    }
    /**
     * Handles errors from API responses.
     * @param {AxiosResponse} response - The response from the API.
     * @param {string} action - The action being performed when the error occurred.
     */
    handleError(response, action) {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage = response.data.error || "Unknown error occurred";
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
        }
        else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
        }
    }
}
package/build_and_publish.sh
ADDED
@@ -0,0 +1,34 @@
#!/bin/bash

function build_and_publish {
    PACKAGE_NAME=$1

    # Replace placeholder with the package name in package.json
    sed -i '' "s/\"name\": \"PLACEHOLDER_NAME\"/\"name\": \"$PACKAGE_NAME\"/" package.json

    # Debug: show modified state
    echo "Modified package.json for $PACKAGE_NAME:"
    cat package.json

    # Publish the package using npm
    npm publish

    # Check if publish was successful
    if [ $? -ne 0 ]; then
        echo "Publish failed for $PACKAGE_NAME"
        exit 1
    fi

    # Revert the changes to the original placeholder in package.json
    sed -i '' "s/\"name\": \"$PACKAGE_NAME\"/\"name\": \"PLACEHOLDER_NAME\"/" package.json

    # Debug: show reverted state
    echo "Reverted package.json to placeholder:"
    cat package.json
}

# Build and publish the first package to npm
build_and_publish "@mendable/firecrawl-js"

# Build and publish the second package to npm
build_and_publish "firecrawl"
package/jest.config.cjs
ADDED
package/package.json
ADDED
@@ -0,0 +1,52 @@
{
  "name": "firecrawl",
  "version": "0.0.28",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "build/index.js",
  "types": "types/index.d.ts",
  "type": "module",
  "scripts": {
    "build": "tsc",
    "build-and-publish": "npm run build && npm publish --access public",
    "publish-beta": "npm run build && npm publish --access public --tag beta",
    "test": "jest src/__tests__/**/*.test.ts"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/mendableai/firecrawl.git"
  },
  "author": "Mendable.ai",
  "license": "MIT",
  "dependencies": {
    "axios": "^1.6.8",
    "dotenv": "^16.4.5",
    "uuid": "^9.0.1",
    "zod": "^3.23.8",
    "zod-to-json-schema": "^3.23.0"
  },
  "bugs": {
    "url": "https://github.com/mendableai/firecrawl/issues"
  },
  "homepage": "https://github.com/mendableai/firecrawl#readme",
  "devDependencies": {
    "@jest/globals": "^29.7.0",
    "@types/axios": "^0.14.0",
    "@types/dotenv": "^8.2.0",
    "@types/jest": "^29.5.12",
    "@types/mocha": "^10.0.6",
    "@types/node": "^20.12.12",
    "@types/uuid": "^9.0.8",
    "jest": "^29.7.0",
    "ts-jest": "^29.1.2",
    "typescript": "^5.4.5"
  },
  "keywords": [
    "firecrawl",
    "mendable",
    "crawler",
    "web",
    "scraper",
    "api",
    "sdk"
  ]
}