@mendable/firecrawl-js 0.0.29-beta.3 → 0.0.29-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
"use strict";
// tsc interop helper for default imports: plain CommonJS modules export
// their API directly (no `.default` property), so they are wrapped;
// real ES modules (flagged `__esModule`) pass through unchanged.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const index_js_1 = require("../../index.js");
const uuid_1 = require("uuid");
// BUG FIX: dotenv is a plain CJS module with no `.default` export;
// without __importDefault, `dotenv_1.default.config()` throws a
// TypeError at load time and every test in this file fails.
const dotenv_1 = __importDefault(require("dotenv"));
dotenv_1.default.config();
// E2E tests run against a locally hosted Firecrawl instance.
// TEST_API_KEY must be provided via the environment (loaded from .env above).
const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = "http://127.0.0.1:3002";
describe('FirecrawlApp E2E Tests', () => {
    test.concurrent('should throw error for no API key', () => {
        expect(() => {
            new index_js_1.default({ apiKey: null, apiUrl: API_URL });
        }).toThrow("No API key provided");
    });
    test.concurrent('should throw error for invalid API key on scrape', async () => {
        const invalidApp = new index_js_1.default({ apiKey: "invalid_api_key", apiUrl: API_URL });
        await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
    });
    test.concurrent('should throw error for blocklisted URL on scrape', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const blocklistedUrl = "https://facebook.com/fake-test";
        await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
    });
    test.concurrent('should return successful response with valid preview token', async () => {
        const app = new index_js_1.default({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
        const response = await app.scrapeUrl('https://roastmywebsite.ai');
        expect(response).not.toBeNull();
        expect(response.data?.content).toContain("_Roast_");
    }, 30000); // 30 seconds timeout
    test.concurrent('should return successful response for valid scrape', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.scrapeUrl('https://roastmywebsite.ai');
        expect(response).not.toBeNull();
        expect(response.data?.content).toContain("_Roast_");
        expect(response.data).toHaveProperty('markdown');
        expect(response.data).toHaveProperty('metadata');
        // html is only returned when explicitly requested via pageOptions.
        expect(response.data).not.toHaveProperty('html');
    }, 30000); // 30 seconds timeout
    test.concurrent('should return successful response with valid API key and include HTML', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
        expect(response).not.toBeNull();
        expect(response.data?.content).toContain("_Roast_");
        expect(response.data?.markdown).toContain("_Roast_");
        expect(response.data?.html).toContain("<h1");
    }, 30000); // 30 seconds timeout
    test.concurrent('should return successful response for valid scrape with PDF file', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
        expect(response).not.toBeNull();
        expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
    }, 30000); // 30 seconds timeout
    test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
        expect(response).not.toBeNull();
        expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
    }, 30000); // 30 seconds timeout
    test.concurrent('should throw error for invalid API key on crawl', async () => {
        const invalidApp = new index_js_1.default({ apiKey: "invalid_api_key", apiUrl: API_URL });
        await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
    });
    test.concurrent('should throw error for blocklisted URL on crawl', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const blocklistedUrl = "https://twitter.com/fake-test";
        await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
    });
    test.concurrent('should return successful response for crawl and wait for completion', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
        expect(response).not.toBeNull();
        expect(response[0].content).toContain("_Roast_");
    }, 60000); // 60 seconds timeout
    test.concurrent('should handle idempotency key for crawl', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const uniqueIdempotencyKey = (0, uuid_1.v4)();
        const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
        expect(response).not.toBeNull();
        expect(response.jobId).toBeDefined();
        // Re-using the same idempotency key must be rejected with 409 Conflict.
        await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
    });
    test.concurrent('should check crawl status', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
        expect(response).not.toBeNull();
        expect(response.jobId).toBeDefined();
        // Poll the job up to maxChecks times while it is still active.
        let statusResponse = await app.checkCrawlStatus(response.jobId);
        const maxChecks = 15;
        let checks = 0;
        while (statusResponse.status === 'active' && checks < maxChecks) {
            await new Promise(resolve => setTimeout(resolve, 1000));
            expect(statusResponse.partial_data).not.toBeNull();
            statusResponse = await app.checkCrawlStatus(response.jobId);
            checks++;
        }
        expect(statusResponse).not.toBeNull();
        expect(statusResponse.status).toBe('completed');
        expect(statusResponse?.data?.length).toBeGreaterThan(0);
    }, 35000); // 35 seconds timeout
    test.concurrent('should return successful response for search', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.search("test query");
        expect(response).not.toBeNull();
        expect(response?.data?.[0]?.content).toBeDefined();
        expect(response?.data?.length).toBeGreaterThan(2);
    }, 30000); // 30 seconds timeout
    test.concurrent('should throw error for invalid API key on search', async () => {
        const invalidApp = new index_js_1.default({ apiKey: "invalid_api_key", apiUrl: API_URL });
        await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
    });
    test.concurrent('should perform LLM extraction', async () => {
        const app = new index_js_1.default({ apiKey: TEST_API_KEY, apiUrl: API_URL });
        const response = await app.scrapeUrl("https://mendable.ai", {
            extractorOptions: {
                mode: 'llm-extraction',
                extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
                extractionSchema: {
                    type: 'object',
                    properties: {
                        company_mission: { type: 'string' },
                        supports_sso: { type: 'boolean' },
                        is_open_source: { type: 'boolean' }
                    },
                    required: ['company_mission', 'supports_sso', 'is_open_source']
                }
            }
        });
        expect(response).not.toBeNull();
        expect(response.data?.llm_extraction).toBeDefined();
        const llmExtraction = response.data?.llm_extraction;
        expect(llmExtraction?.company_mission).toBeDefined();
        expect(typeof llmExtraction?.supports_sso).toBe('boolean');
        expect(typeof llmExtraction?.is_open_source).toBe('boolean');
    }, 30000); // 30 seconds timeout
});
@@ -0,0 +1,38 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const globals_1 = require("@jest/globals");
const axios_1 = require("axios");
const index_js_1 = require("../index.js");
const promises_1 = require("fs/promises");
const path_1 = require("path");
// Replace axios with a jest mock so no real HTTP requests are issued.
globals_1.jest.mock('axios');
const mockedAxios = axios_1.default;
// Load a canned API response from the JSON fixtures in ./fixtures.
async function loadFixture(name) {
    const fixturePath = (0, path_1.join)(__dirname, 'fixtures', `${name}.json`);
    return await (0, promises_1.readFile)(fixturePath, 'utf-8');
}
(0, globals_1.describe)('the firecrawl JS SDK', () => {
    (0, globals_1.test)('Should require an API key to instantiate FirecrawlApp', async () => {
        const construct = () => {
            new index_js_1.default({ apiKey: undefined });
        };
        (0, globals_1.expect)(construct).toThrow('No API key provided');
    });
    (0, globals_1.test)('Should return scraped data from a /scrape API call', async () => {
        // Arrange: axios resolves with the recorded /scrape response.
        const mockData = await loadFixture('scrape');
        mockedAxios.post.mockResolvedValue({
            status: 200,
            data: JSON.parse(mockData),
        });
        const apiKey = 'YOUR_API_KEY';
        const app = new index_js_1.default({ apiKey });
        // Act: scrape a single URL.
        const url = 'https://mendable.ai';
        const scrapedData = await app.scrapeUrl(url);
        // Assert: one POST to the production endpoint carrying the URL and
        // the bearer token, and the fixture payload passed back verbatim.
        (0, globals_1.expect)(mockedAxios.post).toHaveBeenCalledTimes(1);
        (0, globals_1.expect)(mockedAxios.post).toHaveBeenCalledWith(globals_1.expect.stringMatching(/^https:\/\/api.firecrawl.dev/), globals_1.expect.objectContaining({ url }), globals_1.expect.objectContaining({ headers: globals_1.expect.objectContaining({ 'Authorization': `Bearer ${apiKey}` }) }));
        (0, globals_1.expect)(scrapedData.success).toBe(true);
        (0, globals_1.expect)(scrapedData?.data?.metadata.title).toEqual('Mendable');
    });
});
@@ -0,0 +1,263 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const axios_1 = require("axios");
const zod_1 = require("zod");
const zod_to_json_schema_1 = require("zod-to-json-schema");
/**
 * Main class for interacting with the Firecrawl API.
 */
class FirecrawlApp {
    /**
     * Initializes a new instance of the FirecrawlApp class.
     * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
     * @throws {Error} If no API key is provided.
     */
    constructor({ apiKey = null, apiUrl = null }) {
        // Plain assignments: the previous Object.defineProperty boilerplate
        // created identical writable/enumerable/configurable properties.
        this.apiKey = apiKey || "";
        this.apiUrl = apiUrl || "https://api.firecrawl.dev";
        if (!this.apiKey) {
            throw new Error("No API key provided");
        }
    }
    /**
     * Scrapes a URL using the Firecrawl API.
     * @param {string} url - The URL to scrape.
     * @param {Params | null} params - Additional parameters for the scrape request.
     * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
     * @throws {Error} If the request fails or the API reports an error.
     */
    async scrapeUrl(url, params = null) {
        // Consistency: reuse the shared header/request helpers instead of
        // duplicating header construction and calling axios directly.
        const headers = this.prepareHeaders();
        let jsonData = { url, ...params };
        if (params?.extractorOptions?.extractionSchema) {
            let schema = params.extractorOptions.extractionSchema;
            // A Zod schema cannot be serialized as-is; convert it to plain
            // JSON Schema before putting it in the request body.
            if (schema instanceof zod_1.z.ZodSchema) {
                schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
            }
            jsonData = {
                ...jsonData,
                extractorOptions: {
                    ...params.extractorOptions,
                    extractionSchema: schema,
                    mode: params.extractorOptions.mode || "llm-extraction",
                },
            };
        }
        try {
            const response = await this.postRequest(this.apiUrl + "/v0/scrape", jsonData, headers);
            if (response.status === 200) {
                const responseData = response.data;
                if (responseData.success) {
                    return responseData;
                }
                else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
                }
            }
            else {
                this.handleError(response, "scrape URL");
            }
        }
        catch (error) {
            throw new Error(error.message);
        }
        // Unreachable in practice (handleError always throws); kept as a
        // typed fallback so the method always returns a ScrapeResponse.
        return { success: false, error: "Internal server error." };
    }
    /**
     * Searches for a query using the Firecrawl API.
     * @param {string} query - The query to search for.
     * @param {Params | null} params - Additional parameters for the search request.
     * @returns {Promise<SearchResponse>} The response from the search operation.
     * @throws {Error} If the request fails or the API reports an error.
     */
    async search(query, params = null) {
        const headers = this.prepareHeaders();
        let jsonData = { query };
        if (params) {
            jsonData = { ...jsonData, ...params };
        }
        try {
            const response = await this.postRequest(this.apiUrl + "/v0/search", jsonData, headers);
            if (response.status === 200) {
                const responseData = response.data;
                if (responseData.success) {
                    return responseData;
                }
                else {
                    throw new Error(`Failed to search. Error: ${responseData.error}`);
                }
            }
            else {
                this.handleError(response, "search");
            }
        }
        catch (error) {
            throw new Error(error.message);
        }
        // Unreachable in practice; typed fallback (see scrapeUrl).
        return { success: false, error: "Internal server error." };
    }
    /**
     * Initiates a crawl job for a URL using the Firecrawl API.
     * @param {string} url - The URL to crawl.
     * @param {Params | null} params - Additional parameters for the crawl request.
     * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
     * @param {number} pollInterval - Time in seconds for job status checks.
     * @param {string} idempotencyKey - Optional idempotency key for the request.
     * @returns {Promise<CrawlResponse | any>} The crawled data when waiting, otherwise `{ success, jobId }`.
     * @throws {Error} If the request fails or the API reports an error.
     */
    async crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
        const headers = this.prepareHeaders(idempotencyKey);
        let jsonData = { url };
        if (params) {
            jsonData = { ...jsonData, ...params };
        }
        try {
            const response = await this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers);
            if (response.status === 200) {
                const jobId = response.data.jobId;
                if (waitUntilDone) {
                    return this.monitorJobStatus(jobId, headers, pollInterval);
                }
                else {
                    return { success: true, jobId };
                }
            }
            else {
                this.handleError(response, "start crawl job");
            }
        }
        catch (error) {
            // Removed leftover `console.log(error)` debug output; rethrow
            // like every other method so callers see a consistent Error.
            throw new Error(error.message);
        }
        // Unreachable in practice; typed fallback (see scrapeUrl).
        return { success: false, error: "Internal server error." };
    }
    /**
     * Checks the status of a crawl job using the Firecrawl API.
     * @param {string} jobId - The job ID of the crawl operation.
     * @returns {Promise<JobStatusResponse>} The response containing the job status.
     * @throws {Error} If the request fails or the API reports an error.
     */
    async checkCrawlStatus(jobId) {
        const headers = this.prepareHeaders();
        try {
            const response = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
            if (response.status === 200) {
                return {
                    success: true,
                    status: response.data.status,
                    data: response.data.data,
                    // partial_data is only meaningful while full data is absent.
                    partial_data: !response.data.data
                        ? response.data.partial_data
                        : undefined,
                };
            }
            else {
                this.handleError(response, "check crawl status");
            }
        }
        catch (error) {
            throw new Error(error.message);
        }
        // Unreachable in practice; typed fallback (see scrapeUrl).
        return {
            success: false,
            status: "unknown",
            error: "Internal server error.",
        };
    }
    /**
     * Prepares the headers for an API request.
     * @param {string} [idempotencyKey] - Optional idempotency key to attach.
     * @returns {AxiosRequestHeaders} The prepared headers.
     */
    prepareHeaders(idempotencyKey) {
        return {
            "Content-Type": "application/json",
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
        };
    }
    /**
     * Sends a POST request to the specified URL.
     * @param {string} url - The URL to send the request to.
     * @param {Params} data - The data to send in the request.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @returns {Promise<AxiosResponse>} The response from the POST request.
     */
    postRequest(url, data, headers) {
        return axios_1.default.post(url, data, { headers });
    }
    /**
     * Sends a GET request to the specified URL.
     * @param {string} url - The URL to send the request to.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @returns {Promise<AxiosResponse>} The response from the GET request.
     */
    getRequest(url, headers) {
        return axios_1.default.get(url, { headers });
    }
    /**
     * Monitors the status of a crawl job until completion or failure.
     * @param {string} jobId - The job ID of the crawl operation.
     * @param {AxiosRequestHeaders} headers - The headers for the request.
     * @param {number} checkInterval - Seconds to wait between status checks (minimum 2).
     * @returns {Promise<any>} The crawled data once the job completes.
     * @throws {Error} If the job fails, is stopped, or completes without data.
     */
    async monitorJobStatus(jobId, headers, checkInterval) {
        while (true) {
            const statusResponse = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
            if (statusResponse.status === 200) {
                const statusData = statusResponse.data;
                if (statusData.status === "completed") {
                    if ("data" in statusData) {
                        return statusData.data;
                    }
                    else {
                        throw new Error("Crawl job completed but no data was returned");
                    }
                }
                else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
                    // Clamp the poll interval to avoid hammering the API.
                    if (checkInterval < 2) {
                        checkInterval = 2;
                    }
                    await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again
                }
                else {
                    throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
                }
            }
            else {
                this.handleError(statusResponse, "check crawl status");
            }
        }
    }
    /**
     * Handles errors from API responses.
     * @param {AxiosResponse} response - The response from the API.
     * @param {string} action - The action being performed when the error occurred.
     * @throws {Error} Always throws, embedding the status code and server message.
     */
    handleError(response, action) {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage = response.data.error || "Unknown error occurred";
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
        }
        else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
        }
    }
}
exports.default = FirecrawlApp;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mendable/firecrawl-js",
3
- "version": "0.0.29-beta.3",
3
+ "version": "0.0.29-beta.4",
4
4
  "description": "JavaScript SDK for Firecrawl API",
5
5
  "main": "build/index.js",
6
6
  "types": "types/index.d.ts",
@@ -8,7 +8,7 @@
8
8
  "scripts": {
9
9
  "build": "tsc",
10
10
  "build-and-publish": "npm run build && npm publish --access public",
11
- "build-cjs": "tsc --project tsconfig.cjs.json",
11
+ "build-cjs": "tsc --outDir dist-cjs/ --project tsconfig.cjs.json",
12
12
  "publish-beta": "npm run build && npm publish --access public --tag beta",
13
13
  "test": "jest src/__tests__/**/*.test.ts"
14
14
  },
package/tsconfig.json CHANGED
@@ -29,5 +29,9 @@
29
29
  "node_modules",
30
30
  "dist",
31
31
  "docs"
32
- ]
32
+ ]
33
37
  }