@mendable/firecrawl-js 0.0.36 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/cjs/index.js +216 -147
- package/build/esm/index.js +216 -147
- package/package.json +2 -2
- package/src/__tests__/e2e_withAuth/index.test.ts +299 -128
- package/src/__tests__/index.test.ts +1 -1
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +312 -0
- package/src/index.ts +385 -108
- package/tsconfig.json +3 -3
- package/types/index.d.ts +232 -53
|
@@ -1,160 +1,331 @@
|
|
|
1
|
-
import FirecrawlApp
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import FirecrawlApp, {
|
|
2
|
+
CrawlResponseV0,
|
|
3
|
+
CrawlStatusResponse,
|
|
4
|
+
CrawlStatusResponseV0,
|
|
5
|
+
FirecrawlDocumentV0,
|
|
6
|
+
ScrapeResponseV0,
|
|
7
|
+
SearchResponseV0,
|
|
8
|
+
} from "../../index";
|
|
9
|
+
import { v4 as uuidv4 } from "uuid";
|
|
10
|
+
import dotenv from "dotenv";
|
|
11
|
+
import { describe, test, expect } from "@jest/globals";
|
|
5
12
|
|
|
6
13
|
dotenv.config();
|
|
7
14
|
|
|
8
15
|
const TEST_API_KEY = process.env.TEST_API_KEY;
|
|
9
16
|
const API_URL = "http://127.0.0.1:3002";
|
|
10
17
|
|
|
11
|
-
describe('FirecrawlApp E2E Tests', () => {
|
|
12
|
-
test.concurrent(
|
|
18
|
+
describe('FirecrawlApp<"v0"> E2E Tests', () => {
|
|
19
|
+
test.concurrent("should throw error for no API key", async () => {
|
|
13
20
|
expect(() => {
|
|
14
|
-
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
|
21
|
+
new FirecrawlApp<"v0">({ apiKey: null, apiUrl: API_URL, version: "v0" });
|
|
15
22
|
}).toThrow("No API key provided");
|
|
16
23
|
});
|
|
17
24
|
|
|
18
|
-
test.concurrent(
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
25
|
+
test.concurrent(
|
|
26
|
+
"should throw error for invalid API key on scrape",
|
|
27
|
+
async () => {
|
|
28
|
+
const invalidApp = new FirecrawlApp<"v0">({
|
|
29
|
+
apiKey: "invalid_api_key",
|
|
30
|
+
apiUrl: API_URL,
|
|
31
|
+
version: "v0",
|
|
32
|
+
});
|
|
33
|
+
await expect(
|
|
34
|
+
invalidApp.scrapeUrl("https://roastmywebsite.ai")
|
|
35
|
+
).rejects.toThrow("Request failed with status code 401");
|
|
36
|
+
}
|
|
37
|
+
);
|
|
22
38
|
|
|
23
|
-
test.concurrent(
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
39
|
+
test.concurrent(
|
|
40
|
+
"should throw error for blocklisted URL on scrape",
|
|
41
|
+
async () => {
|
|
42
|
+
const app = new FirecrawlApp<"v0">({
|
|
43
|
+
apiKey: TEST_API_KEY,
|
|
44
|
+
apiUrl: API_URL,
|
|
45
|
+
version: "v0",
|
|
46
|
+
});
|
|
47
|
+
const blocklistedUrl = "https://facebook.com/fake-test";
|
|
48
|
+
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow(
|
|
49
|
+
"Request failed with status code 403"
|
|
50
|
+
);
|
|
51
|
+
}
|
|
52
|
+
);
|
|
28
53
|
|
|
29
|
-
test.concurrent(
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
54
|
+
test.concurrent(
|
|
55
|
+
"should return successful response with valid preview token",
|
|
56
|
+
async () => {
|
|
57
|
+
const app = new FirecrawlApp<"v0">({
|
|
58
|
+
apiKey: "this_is_just_a_preview_token",
|
|
59
|
+
apiUrl: API_URL,
|
|
60
|
+
version: "v0",
|
|
61
|
+
});
|
|
62
|
+
const response = (await app.scrapeUrl(
|
|
63
|
+
"https://roastmywebsite.ai"
|
|
64
|
+
)) as ScrapeResponseV0;
|
|
65
|
+
expect(response).not.toBeNull();
|
|
66
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
67
|
+
},
|
|
68
|
+
30000
|
|
69
|
+
); // 30 seconds timeout
|
|
35
70
|
|
|
36
|
-
test.concurrent(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
|
56
|
-
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
57
|
-
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
|
|
58
|
-
expect(response).not.toBeNull();
|
|
59
|
-
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
|
60
|
-
}, 30000); // 30 seconds timeout
|
|
71
|
+
test.concurrent(
|
|
72
|
+
"should return successful response for valid scrape",
|
|
73
|
+
async () => {
|
|
74
|
+
const app = new FirecrawlApp<"v0">({
|
|
75
|
+
apiKey: TEST_API_KEY,
|
|
76
|
+
apiUrl: API_URL,
|
|
77
|
+
version: "v0",
|
|
78
|
+
});
|
|
79
|
+
const response = (await app.scrapeUrl(
|
|
80
|
+
"https://roastmywebsite.ai"
|
|
81
|
+
)) as ScrapeResponseV0;
|
|
82
|
+
expect(response).not.toBeNull();
|
|
83
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
84
|
+
expect(response.data).toHaveProperty("markdown");
|
|
85
|
+
expect(response.data).toHaveProperty("metadata");
|
|
86
|
+
expect(response.data).not.toHaveProperty("html");
|
|
87
|
+
},
|
|
88
|
+
30000
|
|
89
|
+
); // 30 seconds timeout
|
|
61
90
|
|
|
62
|
-
test.concurrent(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
91
|
+
test.concurrent(
|
|
92
|
+
"should return successful response with valid API key and include HTML",
|
|
93
|
+
async () => {
|
|
94
|
+
const app = new FirecrawlApp<"v0">({
|
|
95
|
+
apiKey: TEST_API_KEY,
|
|
96
|
+
apiUrl: API_URL,
|
|
97
|
+
version: "v0",
|
|
98
|
+
});
|
|
99
|
+
const response = (await app.scrapeUrl("https://roastmywebsite.ai", {
|
|
100
|
+
pageOptions: { includeHtml: true },
|
|
101
|
+
})) as ScrapeResponseV0;
|
|
102
|
+
expect(response).not.toBeNull();
|
|
103
|
+
expect(response.data?.content).toContain("_Roast_");
|
|
104
|
+
expect(response.data?.markdown).toContain("_Roast_");
|
|
105
|
+
expect(response.data?.html).toContain("<h1");
|
|
106
|
+
},
|
|
107
|
+
30000
|
|
108
|
+
); // 30 seconds timeout
|
|
68
109
|
|
|
69
|
-
test.concurrent(
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
110
|
+
test.concurrent(
|
|
111
|
+
"should return successful response for valid scrape with PDF file",
|
|
112
|
+
async () => {
|
|
113
|
+
const app = new FirecrawlApp<"v0">({
|
|
114
|
+
apiKey: TEST_API_KEY,
|
|
115
|
+
apiUrl: API_URL,
|
|
116
|
+
version: "v0",
|
|
117
|
+
});
|
|
118
|
+
const response = (await app.scrapeUrl(
|
|
119
|
+
"https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
|
120
|
+
)) as ScrapeResponseV0;
|
|
121
|
+
expect(response).not.toBeNull();
|
|
122
|
+
expect(response.data?.content).toContain(
|
|
123
|
+
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
|
124
|
+
);
|
|
125
|
+
},
|
|
126
|
+
30000
|
|
127
|
+
); // 30 seconds timeout
|
|
73
128
|
|
|
74
|
-
test.concurrent(
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
129
|
+
test.concurrent(
|
|
130
|
+
"should return successful response for valid scrape with PDF file without explicit extension",
|
|
131
|
+
async () => {
|
|
132
|
+
const app = new FirecrawlApp<"v0">({
|
|
133
|
+
apiKey: TEST_API_KEY,
|
|
134
|
+
apiUrl: API_URL,
|
|
135
|
+
version: "v0",
|
|
136
|
+
});
|
|
137
|
+
const response = (await app.scrapeUrl(
|
|
138
|
+
"https://arxiv.org/pdf/astro-ph/9301001"
|
|
139
|
+
)) as ScrapeResponseV0;
|
|
140
|
+
expect(response).not.toBeNull();
|
|
141
|
+
expect(response.data?.content).toContain(
|
|
142
|
+
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
|
143
|
+
);
|
|
144
|
+
},
|
|
145
|
+
30000
|
|
146
|
+
); // 30 seconds timeout
|
|
79
147
|
|
|
80
|
-
test.concurrent(
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
148
|
+
test.concurrent(
|
|
149
|
+
"should throw error for invalid API key on crawl",
|
|
150
|
+
async () => {
|
|
151
|
+
const invalidApp = new FirecrawlApp<"v0">({
|
|
152
|
+
apiKey: "invalid_api_key",
|
|
153
|
+
apiUrl: API_URL,
|
|
154
|
+
version: "v0",
|
|
155
|
+
});
|
|
156
|
+
await expect(
|
|
157
|
+
invalidApp.crawlUrl("https://roastmywebsite.ai")
|
|
158
|
+
).rejects.toThrow("Request failed with status code 401");
|
|
159
|
+
}
|
|
160
|
+
);
|
|
161
|
+
|
|
162
|
+
test.concurrent(
|
|
163
|
+
"should throw error for blocklisted URL on crawl",
|
|
164
|
+
async () => {
|
|
165
|
+
const app = new FirecrawlApp<"v0">({
|
|
166
|
+
apiKey: TEST_API_KEY,
|
|
167
|
+
apiUrl: API_URL,
|
|
168
|
+
version: "v0",
|
|
169
|
+
});
|
|
170
|
+
const blocklistedUrl = "https://twitter.com/fake-test";
|
|
171
|
+
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow(
|
|
172
|
+
"Request failed with status code 403"
|
|
173
|
+
);
|
|
174
|
+
}
|
|
175
|
+
);
|
|
86
176
|
|
|
87
|
-
test.concurrent(
|
|
88
|
-
|
|
177
|
+
test.concurrent(
|
|
178
|
+
"should return successful response for crawl and wait for completion",
|
|
179
|
+
async () => {
|
|
180
|
+
const app = new FirecrawlApp<"v0">({
|
|
181
|
+
apiKey: TEST_API_KEY,
|
|
182
|
+
apiUrl: API_URL,
|
|
183
|
+
version: "v0",
|
|
184
|
+
});
|
|
185
|
+
const response = (await app.crawlUrl(
|
|
186
|
+
"https://roastmywebsite.ai",
|
|
187
|
+
{ crawlerOptions: { excludes: ["blog/*"] } },
|
|
188
|
+
true,
|
|
189
|
+
10
|
|
190
|
+
)) as FirecrawlDocumentV0[];
|
|
191
|
+
expect(response).not.toBeNull();
|
|
192
|
+
console.log({ response });
|
|
193
|
+
expect(response[0].content).toContain("_Roast_");
|
|
194
|
+
},
|
|
195
|
+
60000
|
|
196
|
+
); // 60 seconds timeout
|
|
197
|
+
|
|
198
|
+
test.concurrent("should handle idempotency key for crawl", async () => {
|
|
199
|
+
const app = new FirecrawlApp<"v0">({
|
|
200
|
+
apiKey: TEST_API_KEY,
|
|
201
|
+
apiUrl: API_URL,
|
|
202
|
+
version: "v0",
|
|
203
|
+
});
|
|
89
204
|
const uniqueIdempotencyKey = uuidv4();
|
|
90
|
-
const response = await app.crawlUrl(
|
|
205
|
+
const response = (await app.crawlUrl(
|
|
206
|
+
"https://roastmywebsite.ai",
|
|
207
|
+
{ crawlerOptions: { excludes: ["blog/*"] } },
|
|
208
|
+
false,
|
|
209
|
+
2,
|
|
210
|
+
uniqueIdempotencyKey
|
|
211
|
+
)) as CrawlResponseV0;
|
|
91
212
|
expect(response).not.toBeNull();
|
|
92
213
|
expect(response.jobId).toBeDefined();
|
|
93
214
|
|
|
94
|
-
await expect(
|
|
215
|
+
await expect(
|
|
216
|
+
app.crawlUrl(
|
|
217
|
+
"https://roastmywebsite.ai",
|
|
218
|
+
{ crawlerOptions: { excludes: ["blog/*"] } },
|
|
219
|
+
true,
|
|
220
|
+
2,
|
|
221
|
+
uniqueIdempotencyKey
|
|
222
|
+
)
|
|
223
|
+
).rejects.toThrow("Request failed with status code 409");
|
|
95
224
|
});
|
|
96
225
|
|
|
97
|
-
test.concurrent(
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
226
|
+
test.concurrent(
|
|
227
|
+
"should check crawl status",
|
|
228
|
+
async () => {
|
|
229
|
+
const app = new FirecrawlApp<"v0">({
|
|
230
|
+
apiKey: TEST_API_KEY,
|
|
231
|
+
apiUrl: API_URL,
|
|
232
|
+
version: "v0",
|
|
233
|
+
});
|
|
234
|
+
const response: any = (await app.crawlUrl(
|
|
235
|
+
"https://roastmywebsite.ai",
|
|
236
|
+
{ crawlerOptions: { excludes: ["blog/*"] } },
|
|
237
|
+
false
|
|
238
|
+
)) as CrawlResponseV0;
|
|
239
|
+
expect(response).not.toBeNull();
|
|
240
|
+
expect(response.jobId).toBeDefined();
|
|
102
241
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
242
|
+
let statusResponse = await app.checkCrawlStatus(response.jobId);
|
|
243
|
+
const maxChecks = 15;
|
|
244
|
+
let checks = 0;
|
|
106
245
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
246
|
+
while (statusResponse.status === "active" && checks < maxChecks) {
|
|
247
|
+
await new Promise((resolve) => setTimeout(resolve, 5000));
|
|
248
|
+
expect(statusResponse.partial_data).not.toBeNull();
|
|
249
|
+
// expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
|
250
|
+
statusResponse = (await app.checkCrawlStatus(
|
|
251
|
+
response.jobId
|
|
252
|
+
)) as CrawlStatusResponseV0;
|
|
253
|
+
checks++;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
expect(statusResponse).not.toBeNull();
|
|
257
|
+
expect(statusResponse.success).toBe(true);
|
|
258
|
+
expect(statusResponse.status).toBe("completed");
|
|
259
|
+
expect(statusResponse.total).toEqual(statusResponse.current);
|
|
260
|
+
expect(statusResponse.current_step).not.toBeNull();
|
|
110
261
|
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
|
111
|
-
statusResponse = await app.checkCrawlStatus(response.jobId);
|
|
112
|
-
checks++;
|
|
113
|
-
}
|
|
114
262
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
expect(statusResponse.current_step).not.toBeNull();
|
|
120
|
-
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
|
121
|
-
}, 35000); // 35 seconds timeout
|
|
122
|
-
|
|
123
|
-
test.concurrent('should return successful response for search', async () => {
|
|
124
|
-
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
125
|
-
const response = await app.search("test query");
|
|
126
|
-
expect(response).not.toBeNull();
|
|
127
|
-
expect(response?.data?.[0]?.content).toBeDefined();
|
|
128
|
-
expect(response?.data?.length).toBeGreaterThan(2);
|
|
129
|
-
}, 30000); // 30 seconds timeout
|
|
263
|
+
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
|
264
|
+
},
|
|
265
|
+
35000
|
|
266
|
+
); // 35 seconds timeout
|
|
130
267
|
|
|
131
|
-
test.concurrent(
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
268
|
+
test.concurrent(
|
|
269
|
+
"should return successful response for search",
|
|
270
|
+
async () => {
|
|
271
|
+
const app = new FirecrawlApp<"v0">({
|
|
272
|
+
apiKey: TEST_API_KEY,
|
|
273
|
+
apiUrl: API_URL,
|
|
274
|
+
version: "v0",
|
|
275
|
+
});
|
|
276
|
+
const response = (await app.search("test query")) as SearchResponseV0;
|
|
277
|
+
expect(response).not.toBeNull();
|
|
278
|
+
expect(response?.data?.[0]?.content).toBeDefined();
|
|
279
|
+
expect(response?.data?.length).toBeGreaterThan(2);
|
|
280
|
+
},
|
|
281
|
+
30000
|
|
282
|
+
); // 30 seconds timeout
|
|
283
|
+
|
|
284
|
+
test.concurrent(
|
|
285
|
+
"should throw error for invalid API key on search",
|
|
286
|
+
async () => {
|
|
287
|
+
const invalidApp = new FirecrawlApp<"v0">({
|
|
288
|
+
apiKey: "invalid_api_key",
|
|
289
|
+
apiUrl: API_URL,
|
|
290
|
+
version: "v0",
|
|
291
|
+
});
|
|
292
|
+
await expect(invalidApp.search("test query")).rejects.toThrow(
|
|
293
|
+
"Request failed with status code 401"
|
|
294
|
+
);
|
|
295
|
+
}
|
|
296
|
+
);
|
|
135
297
|
|
|
136
|
-
test.concurrent(
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
298
|
+
test.concurrent(
|
|
299
|
+
"should perform LLM extraction",
|
|
300
|
+
async () => {
|
|
301
|
+
const app = new FirecrawlApp<"v0">({
|
|
302
|
+
apiKey: TEST_API_KEY,
|
|
303
|
+
apiUrl: API_URL,
|
|
304
|
+
version: "v0",
|
|
305
|
+
});
|
|
306
|
+
const response = (await app.scrapeUrl("https://mendable.ai", {
|
|
307
|
+
extractorOptions: {
|
|
308
|
+
mode: "llm-extraction",
|
|
309
|
+
extractionPrompt:
|
|
310
|
+
"Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
|
311
|
+
extractionSchema: {
|
|
312
|
+
type: "object",
|
|
313
|
+
properties: {
|
|
314
|
+
company_mission: { type: "string" },
|
|
315
|
+
supports_sso: { type: "boolean" },
|
|
316
|
+
is_open_source: { type: "boolean" },
|
|
317
|
+
},
|
|
318
|
+
required: ["company_mission", "supports_sso", "is_open_source"],
|
|
148
319
|
},
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
320
|
+
},
|
|
321
|
+
})) as ScrapeResponseV0;
|
|
322
|
+
expect(response).not.toBeNull();
|
|
323
|
+
expect(response.data?.llm_extraction).toBeDefined();
|
|
324
|
+
const llmExtraction = response.data?.llm_extraction;
|
|
325
|
+
expect(llmExtraction?.company_mission).toBeDefined();
|
|
326
|
+
expect(typeof llmExtraction?.supports_sso).toBe("boolean");
|
|
327
|
+
expect(typeof llmExtraction?.is_open_source).toBe("boolean");
|
|
328
|
+
},
|
|
329
|
+
30000
|
|
330
|
+
); // 30 seconds timeout
|
|
160
331
|
});
|
|
@@ -31,7 +31,7 @@ describe('the firecrawl JS SDK', () => {
|
|
|
31
31
|
});
|
|
32
32
|
|
|
33
33
|
const apiKey = 'YOUR_API_KEY'
|
|
34
|
-
const app = new FirecrawlApp({ apiKey });
|
|
34
|
+
const app = new FirecrawlApp<"v0">({ apiKey });
|
|
35
35
|
// Scrape a single URL
|
|
36
36
|
const url = 'https://mendable.ai';
|
|
37
37
|
const scrapedData = await app.scrapeUrl(url);
|