@mendable/firecrawl 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,312 @@
1
+ import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
2
+ import { v4 as uuidv4 } from 'uuid';
3
+ import dotenv from 'dotenv';
4
+ import { describe, test, expect } from '@jest/globals';
5
+
// Load variables from a local .env file into process.env before they are read below.
dotenv.config();

// API key used by the authenticated tests; undefined when TEST_API_KEY is not set.
const TEST_API_KEY = process.env.TEST_API_KEY;
// Base URL of the API under test. Defaults to the local instance used so far and can be
// redirected to another deployment through the API_URL environment variable.
const API_URL = process.env.API_URL ?? "http://127.0.0.1:3002";
10
+
11
+ describe('FirecrawlApp E2E Tests', () => {
// The constructor is expected to throw synchronously when apiKey is null.
test.concurrent('should throw error for no API key', async () => {
  const buildWithoutKey = () => new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
  expect(buildWithoutKey).toThrow("No API key provided");
});
17
+
// Scraping with an unrecognised API key is expected to reject with a 401 error message.
test.concurrent('should throw error for invalid API key on scrape', async () => {
  const unauthorizedApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
  const scrapeAttempt = unauthorizedApp.scrapeUrl('https://roastmywebsite.ai');
  await expect(scrapeAttempt).rejects.toThrow("Request failed with status code 401");
});
22
+
// Scraping a blocklisted URL is expected to reject with a 403 error message.
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const scrapeAttempt = client.scrapeUrl("https://facebook.com/fake-test");
  await expect(scrapeAttempt).rejects.toThrow("Request failed with status code 403");
});
28
+
// A scrape made with the preview token should succeed and return the page markdown.
test.concurrent('should return successful response with valid preview token', async () => {
  const previewApp = new FirecrawlApp({
    apiKey: "this_is_just_a_preview_token",
    apiUrl: API_URL,
  });
  const scraped = await previewApp.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;

  expect(scraped).not.toBeNull();
  expect(scraped?.markdown).toContain("_Roast_");
}, 30000); // 30 seconds timeout
35
+
// A scrape without options should return only markdown and metadata.
test.concurrent('should return successful response for valid scrape', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const scraped = await client.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
  expect(scraped).not.toBeNull();

  // 'content' is the v0 field; the remaining keys are formats this call did not ask for.
  for (const absentKey of ['content', 'html', 'rawHtml', 'screenshot', 'links']) {
    expect(scraped).not.toHaveProperty(absentKey);
  }

  for (const presentKey of ['markdown', 'metadata']) {
    expect(scraped).toHaveProperty(presentKey);
  }
}, 30000); // 30 seconds timeout
49
+
// Requests every format with extra scrape options and checks each returned field,
// including the exact metadata values of the target page.
test.concurrent('should return successful response with valid API key and options', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const scraped = await client.scrapeUrl('https://roastmywebsite.ai', {
    formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
    headers: { "x-key": "test" },
    includeTags: ['h1'],
    excludeTags: ['h2'],
    onlyMainContent: true,
    timeout: 30000,
    waitFor: 1000
  }) as ScrapeResponse;

  // Top-level payload: one entry per requested format.
  expect(scraped).not.toBeNull();
  expect(scraped).not.toHaveProperty('content'); // v0
  expect(scraped.markdown).toContain("_Roast_");
  expect(scraped.html).toContain("<h1");
  expect(scraped.rawHtml).toContain("<h1");
  expect(scraped.screenshot).not.toBeUndefined();
  expect(scraped.screenshot).not.toBeNull();
  expect(scraped.screenshot).toContain("https://");
  expect(scraped.links).not.toBeNull();
  expect(scraped.links?.length).toBeGreaterThan(0);
  expect(scraped.links?.[0]).toContain("https://");

  // Metadata: first the expected keys, then their exact values.
  const meta = scraped.metadata;
  expect(meta).not.toBeNull();
  const expectedMetaKeys = [
    "title",
    "description",
    "keywords",
    "robots",
    "ogTitle",
    "ogDescription",
    "ogUrl",
    "ogImage",
    "ogLocaleAlternate",
    "ogSiteName",
    "sourceURL",
  ];
  for (const metaKey of expectedMetaKeys) {
    expect(meta).toHaveProperty(metaKey);
  }
  expect(meta).not.toHaveProperty("pageStatusCode");
  expect(meta).toHaveProperty("statusCode");
  expect(meta).not.toHaveProperty("pageError");
  expect(meta.error).toBeUndefined();

  // The same text is expected for both description and ogDescription.
  const siteDescription = "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️";
  expect(meta.title).toBe("Roast My Website");
  expect(meta.description).toBe(siteDescription);
  expect(meta.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
  expect(meta.robots).toBe("follow, index");
  expect(meta.ogTitle).toBe("Roast My Website");
  expect(meta.ogDescription).toBe(siteDescription);
  expect(meta.ogUrl).toBe("https://www.roastmywebsite.ai");
  expect(meta.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
  expect(meta.ogLocaleAlternate).toStrictEqual([]);
  expect(meta.ogSiteName).toBe("Roast My Website");
  expect(meta.sourceURL).toBe("https://roastmywebsite.ai");
  expect(meta.statusCode).toBe(200);
}, 30000); // 30 seconds timeout
102
+
// Scraping a URL that ends in .pdf should yield the document text as markdown.
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const pdfUrl = 'https://arxiv.org/pdf/astro-ph/9301001.pdf';
  const scraped = await client.scrapeUrl(pdfUrl) as ScrapeResponse;

  expect(scraped).not.toBeNull();
  expect(scraped?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds timeout
109
+
// Same PDF as the previous case, addressed through a URL without the .pdf suffix.
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const pdfUrl = 'https://arxiv.org/pdf/astro-ph/9301001';
  const scraped = await client.scrapeUrl(pdfUrl) as ScrapeResponse;

  expect(scraped).not.toBeNull();
  expect(scraped?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds timeout
116
+
// Crawling with an unrecognised API key is expected to reject with a 401 error message.
test.concurrent('should throw error for invalid API key on crawl', async () => {
  const unauthorizedApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
  const crawlAttempt = unauthorizedApp.crawlUrl('https://roastmywebsite.ai');
  await expect(crawlAttempt).rejects.toThrow("Request failed with status code 401");
});
121
+
// Crawling a blocklisted URL is expected to reject with the policy-restriction message.
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const crawlAttempt = client.crawlUrl("https://twitter.com/fake-test");
  await expect(crawlAttempt).rejects.toThrow(
    "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
  );
});
127
+
// Crawls the site, waits for completion, then validates the finished job summary
// and the shape of the first returned document.
test.concurrent('should return successful response for crawl and wait for completion', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const crawl = await client.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;

  // Job-level fields.
  expect(crawl).not.toBeNull();
  expect(crawl).toHaveProperty("total");
  expect(crawl.total).toBeGreaterThan(0);
  expect(crawl).toHaveProperty("creditsUsed");
  expect(crawl.creditsUsed).toBeGreaterThan(0);
  expect(crawl).toHaveProperty("expiresAt");
  const expiryMs = new Date(crawl.expiresAt).getTime();
  expect(expiryMs).toBeGreaterThan(Date.now());
  expect(crawl).toHaveProperty("status");
  expect(crawl.status).toBe("completed");
  expect(crawl).not.toHaveProperty("next"); // wait until done

  // First document: markdown only, since no other format was requested.
  expect(crawl.data?.length).toBeGreaterThan(0);
  const firstDoc = crawl.data?.[0];
  expect(firstDoc).toHaveProperty("markdown");
  expect(firstDoc?.markdown).toContain("_Roast_");
  // 'content' is the v0 field.
  for (const absentKey of ['content', "html", "rawHtml", "screenshot", "links"]) {
    expect(firstDoc).not.toHaveProperty(absentKey);
  }

  expect(firstDoc).toHaveProperty("metadata");
  const docMeta = firstDoc?.metadata;
  for (const metaKey of ["title", "description", "language", "sourceURL", "statusCode"]) {
    expect(docMeta).toHaveProperty(metaKey);
  }
  expect(docMeta).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout
157
+
// Crawls with path filters, limits and every scrape format requested, waits for
// completion, then validates the job summary and the first returned document.
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const crawl = await client.crawlUrl('https://roastmywebsite.ai', {
    excludePaths: ['blog/*'],
    includePaths: ['/'],
    maxDepth: 2,
    ignoreSitemap: true,
    limit: 10,
    allowBackwardLinks: true,
    allowExternalLinks: true,
    scrapeOptions: {
      formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
      headers: { "x-key": "test" },
      includeTags: ['h1'],
      excludeTags: ['h2'],
      onlyMainContent: true,
      waitFor: 1000
    }
  } as CrawlParams, true, 30) as CrawlStatusResponse;

  // Job-level fields.
  expect(crawl).not.toBeNull();
  expect(crawl).toHaveProperty("total");
  expect(crawl.total).toBeGreaterThan(0);
  expect(crawl).toHaveProperty("creditsUsed");
  expect(crawl.creditsUsed).toBeGreaterThan(0);
  expect(crawl).toHaveProperty("expiresAt");
  const expiryMs = new Date(crawl.expiresAt).getTime();
  expect(expiryMs).toBeGreaterThan(Date.now());
  expect(crawl).toHaveProperty("status");
  expect(crawl.status).toBe("completed");
  expect(crawl).not.toHaveProperty("next");

  // First document: every requested format should be present.
  expect(crawl.data?.length).toBeGreaterThan(0);
  const firstDoc = crawl.data?.[0];
  expect(firstDoc).toHaveProperty("markdown");
  expect(firstDoc?.markdown).toContain("_Roast_");
  expect(firstDoc).not.toHaveProperty('content'); // v0
  expect(firstDoc).toHaveProperty("html");
  expect(firstDoc?.html).toContain("<h1");
  expect(firstDoc).toHaveProperty("rawHtml");
  expect(firstDoc?.rawHtml).toContain("<h1");
  expect(firstDoc).toHaveProperty("screenshot");
  expect(firstDoc?.screenshot).toContain("https://");
  expect(firstDoc).toHaveProperty("links");
  expect(firstDoc?.links).not.toBeNull();
  expect(firstDoc?.links?.length).toBeGreaterThan(0);

  expect(firstDoc).toHaveProperty("metadata");
  const docMeta = firstDoc?.metadata;
  for (const metaKey of ["title", "description", "language", "sourceURL", "statusCode"]) {
    expect(docMeta).toHaveProperty(metaKey);
  }
  expect(docMeta).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout
208
+
// Submits a crawl with a fresh idempotency key, then re-submits with the same key
// and expects the second request to be rejected with a 409 error message.
test.concurrent('should handle idempotency key for crawl', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const idempotencyKey = uuidv4();
  const targetUrl = 'https://roastmywebsite.ai';

  const firstSubmission = await client.crawlUrl(targetUrl, {}, false, 2, idempotencyKey) as CrawlResponse;
  expect(firstSubmission).not.toBeNull();
  expect(firstSubmission.id).toBeDefined();

  const repeatSubmission = client.crawlUrl(targetUrl, {}, true, 2, idempotencyKey);
  await expect(repeatSubmission).rejects.toThrow("Request failed with status code 409");
});
218
+
/**
 * Starts a crawl without waiting for it, then polls checkCrawlStatus until the
 * job leaves the 'scraping' state or the poll budget is used up. Each in-progress
 * payload is validated, and the final payload must be a completed crawl whose
 * first document carries every requested format.
 */
test.concurrent('should check crawl status', async () => {
  const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
  expect(response).not.toBeNull();
  expect(response.id).toBeDefined();

  // Cast here exactly like the re-poll inside the loop so both assignments agree on one type.
  let statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
  const maxChecks = 15;
  const pollDelayMs = 5000;
  let checks = 0;

  while (statusResponse.status === 'scraping' && checks < maxChecks) {
    await new Promise(resolve => setTimeout(resolve, pollDelayMs));
    // Validate the in-progress payload fetched before the pause, then poll again.
    expect(statusResponse).not.toHaveProperty("partial_data"); // v0
    expect(statusResponse).not.toHaveProperty("current"); // v0
    expect(statusResponse).toHaveProperty("data");
    expect(statusResponse).toHaveProperty("total");
    expect(statusResponse).toHaveProperty("creditsUsed");
    expect(statusResponse).toHaveProperty("expiresAt");
    expect(statusResponse).toHaveProperty("status");
    expect(statusResponse).toHaveProperty("next");
    expect(statusResponse.total).toBeGreaterThan(0);
    expect(statusResponse.creditsUsed).toBeGreaterThan(0);
    expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
    expect(statusResponse.status).toBe("scraping");
    expect(statusResponse.next).toContain("/v1/crawl/");
    statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
    checks++;
  }

  // Final payload: if the poll budget ran out, the status assertion below reports it.
  expect(statusResponse).not.toBeNull();
  expect(statusResponse).toHaveProperty("total");
  expect(statusResponse.total).toBeGreaterThan(0);
  expect(statusResponse).toHaveProperty("creditsUsed");
  expect(statusResponse.creditsUsed).toBeGreaterThan(0);
  expect(statusResponse).toHaveProperty("expiresAt");
  expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
  expect(statusResponse).toHaveProperty("status");
  expect(statusResponse.status).toBe("completed");
  expect(statusResponse.data?.length).toBeGreaterThan(0);
  expect(statusResponse.data?.[0]).toHaveProperty("markdown");
  expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
  expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
  expect(statusResponse.data?.[0]).toHaveProperty("html");
  expect(statusResponse.data?.[0].html).toContain("<div");
  expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
  expect(statusResponse.data?.[0].rawHtml).toContain("<div");
  expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
  expect(statusResponse.data?.[0].screenshot).toContain("https://");
  expect(statusResponse.data?.[0]).toHaveProperty("links");
  expect(statusResponse.data?.[0].links).not.toBeNull();
  expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
  expect(statusResponse.data?.[0]).toHaveProperty("metadata");
  expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
  expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
  expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
  expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
  expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
  expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
}, 120000); // 15 polls x 5 s = 75 s of waiting in the worst case, plus headroom for the requests
278
+
// Mapping with an unrecognised API key is expected to reject with a 401 error message.
test.concurrent('should throw error for invalid API key on map', async () => {
  const unauthorizedApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
  const mapAttempt = unauthorizedApp.mapUrl('https://roastmywebsite.ai');
  await expect(mapAttempt).rejects.toThrow("Request failed with status code 401");
});
283
+
// Mapping a blocklisted URL is expected to reject with a 403 error message.
test.concurrent('should throw error for blocklisted URL on map', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const mapAttempt = client.mapUrl("https://facebook.com/fake-test");
  await expect(mapAttempt).rejects.toThrow("Request failed with status code 403");
});
289
+
// A map request made with the preview token should succeed and return at least one link.
// The title carries an "on map" suffix so it does not collide with the scrape test of the
// same name in this describe block.
test.concurrent('should return successful response with valid preview token on map', async () => {
  const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
  const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
  expect(response).not.toBeNull();
  expect(response.links?.length).toBeGreaterThan(0);
}, 30000); // 30 seconds timeout
296
+
// A map request should return https links, at least one of which points at the mapped site.
test.concurrent('should return successful response for valid map', async () => {
  const client = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
  const mapped = await client.mapUrl('https://roastmywebsite.ai') as MapResponse;
  expect(mapped).not.toBeNull();

  const links = mapped.links;
  expect(links?.length).toBeGreaterThan(0);
  expect(links?.[0]).toContain("https://");

  const sameSiteLinks = links?.filter((link: string) => link.includes("roastmywebsite.ai"));
  expect(sameSiteLinks?.length).toBeGreaterThan(0);
}, 30000); // 30 seconds timeout
307
+
// search() is expected to reject with a "not supported in v1" message.
test('should throw NotImplementedError for search on v1', async () => {
  const client = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
  const searchAttempt = client.search("test query");
  await expect(searchAttempt).rejects.toThrow("Search is not supported in v1");
});
312
+ });