firecrawl 1.29.2 → 3.0.2
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- package/.env.example +4 -2
- package/LICENSE +0 -0
- package/README.md +85 -78
- package/audit-ci.jsonc +4 -0
- package/dist/chunk-JFWW4BWA.js +85 -0
- package/dist/index.cjs +1005 -42
- package/dist/index.d.cts +535 -11
- package/dist/index.d.ts +535 -11
- package/dist/index.js +994 -32
- package/dist/package-KYZ3HXR5.js +4 -0
- package/dump.rdb +0 -0
- package/jest.config.js +0 -0
- package/package.json +6 -5
- package/src/__tests__/{v1/e2e_withAuth → e2e/v1}/index.test.ts +1 -0
- package/src/__tests__/e2e/v2/batch.test.ts +74 -0
- package/src/__tests__/e2e/v2/crawl.test.ts +182 -0
- package/src/__tests__/e2e/v2/extract.test.ts +70 -0
- package/src/__tests__/e2e/v2/map.test.ts +55 -0
- package/src/__tests__/e2e/v2/scrape.test.ts +130 -0
- package/src/__tests__/e2e/v2/search.test.ts +247 -0
- package/src/__tests__/e2e/v2/usage.test.ts +36 -0
- package/src/__tests__/e2e/v2/utils/idmux.ts +58 -0
- package/src/__tests__/e2e/v2/watcher.test.ts +96 -0
- package/src/__tests__/unit/v1/monitor-job-status-retry.test.ts +154 -0
- package/src/__tests__/unit/v2/errorHandler.test.ts +19 -0
- package/src/__tests__/unit/v2/scrape.unit.test.ts +11 -0
- package/src/__tests__/unit/v2/validation.test.ts +59 -0
- package/src/index.backup.ts +2146 -0
- package/src/index.ts +27 -2071
- package/src/v1/index.ts +2158 -0
- package/src/v2/client.ts +281 -0
- package/src/v2/methods/batch.ts +131 -0
- package/src/v2/methods/crawl.ts +160 -0
- package/src/v2/methods/extract.ts +86 -0
- package/src/v2/methods/map.ts +37 -0
- package/src/v2/methods/scrape.ts +26 -0
- package/src/v2/methods/search.ts +69 -0
- package/src/v2/methods/usage.ts +39 -0
- package/src/v2/types.ts +308 -0
- package/src/v2/utils/errorHandler.ts +18 -0
- package/src/v2/utils/getVersion.ts +14 -0
- package/src/v2/utils/httpClient.ts +99 -0
- package/src/v2/utils/validation.ts +50 -0
- package/src/v2/watcher.ts +159 -0
- package/tsconfig.json +2 -1
- package/tsup.config.ts +0 -0
- package/dist/package-E7ICGMY6.js +0 -110
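
The headline change in this diff is the v1/v2 split: the old client moves to package/src/v1/index.ts, a new v2 client lands under package/src/v2/, and package/src/index.ts shrinks to a thin entry point. As a reading aid, here is a minimal sketch of the v2 surface pieced together from the e2e tests reproduced below; the "firecrawl" import specifier and the main() wrapper are illustrative assumptions, not taken from this diff.

import Firecrawl from "firecrawl"; // assumed specifier; the tests below import the package entry via a relative path

async function main() {
  const client = new Firecrawl({
    apiKey: process.env.FIRECRAWL_API_KEY ?? "",
    apiUrl: "https://api.firecrawl.dev",
  });

  // Search, optionally scraping each hit (option shapes as exercised in search.test.ts)
  const results = await client.search("firecrawl", {
    sources: ["web", "news"],
    limit: 3,
    scrapeOptions: { formats: ["markdown"] },
  });
  console.log((results.web ?? []).length, "web results");

  // Start a crawl and stream progress through a watcher (as in watcher.test.ts)
  const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
  const watcher = client.watcher(start.id, { pollInterval: 2 });
  watcher.on("snapshot", (snap: any) => console.log(snap.status, snap.completed, snap.total));
  watcher.on("done", (payload: any) => {
    console.log("final status:", payload.status);
    watcher.close();
  });
  await watcher.start();
}

main().catch(console.error);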
--- /dev/null
+++ b/package/src/__tests__/e2e/v2/search.test.ts
@@ -0,0 +1,247 @@
+/**
+ * E2E tests for v2 search (translated from Python tests)
+ */
+import Firecrawl from "../../../index";
+import type { Document, SearchResult } from "../../../index";
+import { config } from "dotenv";
+import { getIdentity, getApiUrl } from "./utils/idmux";
+import { describe, test, expect, beforeAll } from "@jest/globals";
+
+config();
+
+const API_URL = getApiUrl();
+let client: Firecrawl;
+
+beforeAll(async () => {
+  const { apiKey } = await getIdentity({ name: "js-e2e-search" });
+  client = new Firecrawl({ apiKey, apiUrl: API_URL });
+});
+
+function collectTexts(entries: any[] | undefined): string[] {
+  const texts: string[] = [];
+  for (const r of entries || []) {
+    const title = (r && typeof r === 'object') ? (r.title as unknown as string | undefined) : undefined;
+    const desc = (r && typeof r === 'object') ? (r.description as unknown as string | undefined) : undefined;
+    if (title) texts.push(String(title).toLowerCase());
+    if (desc) texts.push(String(desc).toLowerCase());
+  }
+  return texts;
+}
+
+function isDocument(entry: Document | SearchResult | undefined | null): entry is Document {
+  if (!entry) return false;
+  const d = entry as Document;
+  return (
+    typeof d.markdown === 'string' ||
+    typeof d.rawHtml === 'string' ||
+    typeof d.html === 'string' ||
+    typeof d.links === 'object' ||
+    typeof d.screenshot === 'string' ||
+    typeof d.changeTracking === 'object' ||
+    typeof d.summary === 'string' ||
+    typeof d.json === 'object'
+  );
+}
+
+describe("v2.search e2e", () => {
+
+  test("minimal request", async () => {
+    if (!client) throw new Error();
+    const results = await client.search("What is the capital of France?");
+    expect(results).toBeTruthy();
+    expect(results).toHaveProperty("web");
+    expect(results).not.toHaveProperty("news");
+    expect(results).not.toHaveProperty("images");
+
+    expect(results.web).toBeTruthy();
+    expect((results.web || []).length).toBeGreaterThan(0);
+
+    for (const result of results.web || []) {
+      if (isDocument(result)) {
+        // documents appear if scraping happens
+        continue;
+      }
+      expect(typeof result.url).toBe("string");
+      expect(result.url.startsWith("http")).toBe(true);
+      expect(typeof result.title === "string" || result.title == null).toBe(true);
+      expect(typeof result.description === "string" || result.description == null).toBe(true);
+    }
+
+    const allText = collectTexts(results.web).join(" ");
+    expect(allText.includes("paris")).toBe(true);
+
+    expect(results.news == null).toBe(true);
+    expect(results.images == null).toBe(true);
+  }, 90_000);
+
+  test("with sources web+news and limit", async () => {
+    if (!client) throw new Error();
+    const results = await client.search("firecrawl", { sources: ["web", "news"], limit: 3 });
+    expect(results).toBeTruthy();
+    expect(results.web).toBeTruthy();
+    expect((results.web || []).length).toBeLessThanOrEqual(3);
+    if (results.news != null) {
+      expect((results.news || []).length).toBeLessThanOrEqual(3);
+    }
+    expect(results.images == null).toBe(true);
+
+    const webTitles = (results.web || [])
+      .filter((r): r is SearchResult => !isDocument(r))
+      .map(r => (r.title || "").toString().toLowerCase());
+    const webDescriptions = (results.web || [])
+      .filter((r): r is SearchResult => !isDocument(r))
+      .map(r => (r.description || "").toString().toLowerCase());
+    const allWebText = (webTitles.concat(webDescriptions)).join(" ");
+    expect(allWebText.includes("firecrawl")).toBe(true);
+  }, 90_000);
+
+  test("result structure", async () => {
+    if (!client) throw new Error();
+    const results = await client.search("test query", { limit: 1 });
+    if (results.web && results.web.length > 0) {
+      const result: any = results.web[0];
+      expect(result).toHaveProperty("url");
+      expect(result).toHaveProperty("title");
+      expect(result).toHaveProperty("description");
+      expect(typeof result.url).toBe("string");
+      expect(typeof result.title === "string" || result.title == null).toBe(true);
+      expect(typeof result.description === "string" || result.description == null).toBe(true);
+      expect(result.url.startsWith("http")).toBe(true);
+    }
+  }, 90_000);
+
+  test("all parameters (comprehensive)", async () => {
+    if (!client) throw new Error();
+    const schema = {
+      type: "object",
+      properties: {
+        title: { type: "string" },
+        description: { type: "string" },
+        url: { type: "string" },
+      },
+      required: ["title", "description"],
+    } as const;
+
+    const results = await client.search("artificial intelligence", {
+      sources: [ "web", "news", "images" ],
+      limit: 3,
+      tbs: "qdr:m",
+      location: "US",
+      ignoreInvalidURLs: true,
+      timeout: 60_000,
+      scrapeOptions: {
+        formats: [
+          "markdown",
+          "html",
+          { type: "json", prompt: "Extract the title and description from the page", schema },
+        ],
+        headers: { "User-Agent": "Firecrawl-Test/1.0" },
+        includeTags: ["h1", "h2", "p"],
+        excludeTags: ["nav", "footer"],
+        onlyMainContent: true,
+        waitFor: 2000,
+        mobile: false,
+        skipTlsVerification: false,
+        removeBase64Images: true,
+        blockAds: true,
+        proxy: "basic",
+        maxAge: 3_600_000,
+        storeInCache: true,
+        location: { country: "US", languages: ["en"] },
+        actions: [{ type: "wait", milliseconds: 1000 }],
+      },
+    });
+
+    expect(results).toBeTruthy();
+    expect(results).toHaveProperty("web");
+    expect(results).toHaveProperty("news");
+    expect(results).toHaveProperty("images");
+
+    expect(results.web).toBeTruthy();
+    expect((results.web || []).length).toBeLessThanOrEqual(3);
+
+    const nonDocEntries = (results.web || []).filter(r => !isDocument(r));
+    if (nonDocEntries.length > 0) {
+      const allWebText = collectTexts(nonDocEntries).join(" ");
+      const aiTerms = ["artificial", "intelligence", "ai", "machine", "learning"];
+      expect(aiTerms.some(t => allWebText.includes(t))).toBe(true);
+    }
+
+    for (const result of results.web || []) {
+      if (isDocument(result)) {
+        expect(Boolean(result.markdown) || Boolean(result.html)).toBe(true);
+      } else {
+        expect(typeof result.url).toBe("string");
+        expect(result.url.startsWith("http")).toBe(true);
+      }
+    }
+
+    if (results.news != null) {
+      expect((results.news || []).length).toBeLessThanOrEqual(3);
+      for (const result of results.news || []) {
+        if (isDocument(result)) {
+          expect(Boolean(result.markdown) || Boolean(result.html)).toBe(true);
+        } else {
+          expect(typeof result.url).toBe("string");
+          expect(result.url.startsWith("http")).toBe(true);
+        }
+      }
+    }
+
+    expect(results.images).toBeTruthy();
+    expect((results.images || []).length).toBeLessThanOrEqual(3);
+    for (const result of results.images || []) {
+      if (!isDocument(result)) {
+        expect(typeof result.url).toBe("string");
+        expect(result.url.startsWith("http")).toBe(true);
+      }
+    }
+  }, 120_000);
+
+  test("formats flexibility: list vs object", async () => {
+    if (!client) throw new Error();
+    const results1 = await client.search("python programming", {
+      limit: 1,
+      scrapeOptions: { formats: ["markdown"] },
+    });
+    const results2 = await client.search("python programming", {
+      limit: 1,
+      scrapeOptions: { formats: ["markdown"] },
+    });
+    expect(results1).toBeTruthy();
+    expect(results2).toBeTruthy();
+    expect(results1.web).toBeTruthy();
+    expect(results2.web).toBeTruthy();
+  }, 90_000);
+
+  test("with json format object", async () => {
+    if (!client) throw new Error();
+    const jsonSchema = {
+      type: "object",
+      properties: { title: { type: "string" } },
+      required: ["title"],
+    } as const;
+    const results = await client.search("site:docs.firecrawl.dev", {
+      limit: 1,
+      scrapeOptions: {
+        formats: [{ type: "json", prompt: "Extract page title", schema: jsonSchema }],
+      },
+    });
+    expect(results).toBeTruthy();
+    expect(Array.isArray(results.web) || results.web == null).toBe(true);
+  }, 90_000);
+
+  test("with summary format, documents include summary when present", async () => {
+    if (!client) throw new Error();
+    const results = await client.search("site:firecrawl.dev", {
+      limit: 1,
+      scrapeOptions: { formats: ["summary"] },
+    });
+    const docs = (results.web || []).filter(r => isDocument(r)) as Document[];
+    if (docs.length > 0) {
+      expect(typeof docs[0].summary).toBe("string");
+      expect((docs[0].summary || "").length).toBeGreaterThan(5);
+    }
+  }, 90_000);
+});
+
--- /dev/null
+++ b/package/src/__tests__/e2e/v2/usage.test.ts
@@ -0,0 +1,36 @@
+/**
+ * E2E tests for v2 usage endpoints (translated from Python tests)
+ */
+import Firecrawl from "../../../index";
+import { config } from "dotenv";
+import { getIdentity, getApiUrl } from "./utils/idmux";
+import { describe, test, expect, beforeAll } from "@jest/globals";
+
+config();
+
+const API_URL = getApiUrl();
+let client: Firecrawl;
+
+beforeAll(async () => {
+  const { apiKey } = await getIdentity({ name: "js-e2e-usage" });
+  client = new Firecrawl({ apiKey, apiUrl: API_URL });
+});
+
+describe("v2.usage e2e", () => {
+  test("get_concurrency", async () => {
+    const resp = await client.getConcurrency();
+    expect(typeof resp.concurrency).toBe("number");
+    expect(typeof resp.maxConcurrency).toBe("number");
+  }, 60_000);
+
+  test("get_credit_usage", async () => {
+    const resp = await client.getCreditUsage();
+    expect(typeof resp.remainingCredits).toBe("number");
+  }, 60_000);
+
+  test("get_token_usage", async () => {
+    const resp = await client.getTokenUsage();
+    expect(typeof resp.remainingTokens).toBe("number");
+  }, 60_000);
+});
+
--- /dev/null
+++ b/package/src/__tests__/e2e/v2/utils/idmux.ts
@@ -0,0 +1,58 @@
+export type IdmuxRequest = {
+  name?: string;
+  concurrency?: number;
+  credits?: number;
+  tokens?: number;
+  teamId?: string;
+  flags?: Record<string, unknown>;
+};
+
+export type Identity = {
+  apiKey: string;
+  teamId: string;
+};
+
+let cachedIdentity: Identity | null = null;
+
+export function getApiUrl(): string {
+  return process.env.TEST_URL ?? process.env.FIRECRAWL_API_URL ?? "https://api.firecrawl.dev";
+}
+
+export async function getIdentity(req: IdmuxRequest = {}): Promise<Identity> {
+  if (cachedIdentity) return cachedIdentity;
+
+  const idmuxUrl = process.env.IDMUX_URL;
+  if (!idmuxUrl) {
+    const fallback: Identity = {
+      apiKey: process.env.TEST_API_KEY ?? process.env.FIRECRAWL_API_KEY ?? "",
+      teamId: process.env.TEST_TEAM_ID ?? "",
+    };
+    cachedIdentity = fallback;
+    return fallback;
+  }
+
+  const runNumberRaw = process.env.GITHUB_RUN_NUMBER;
+  const runNumber = runNumberRaw ? Number(runNumberRaw) : 0;
+  const body = {
+    refName: process.env.GITHUB_REF_NAME ?? "local",
+    runNumber: Number.isFinite(runNumber) ? runNumber : 0,
+    concurrency: req.concurrency ?? 100,
+    ...req,
+  };
+
+  const res = await fetch(`${idmuxUrl}/`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(body),
+  });
+
+  if (!res.ok) {
+    const text = await res.text();
+    throw new Error(`idmux request failed: ${res.status} ${text}`);
+  }
+
+  const identity = (await res.json()) as Identity;
+  cachedIdentity = identity;
+  return identity;
+}
+
--- /dev/null
+++ b/package/src/__tests__/e2e/v2/watcher.test.ts
@@ -0,0 +1,96 @@
+import Firecrawl from "../../../index";
+import { config } from "dotenv";
+import { describe, test, expect, beforeAll } from "@jest/globals";
+import { getIdentity } from "./utils/idmux";
+
+config();
+
+const API_URL = process.env.FIRECRAWL_API_URL ?? "https://api.firecrawl.dev";
+let client: Firecrawl;
+
+beforeAll(async () => {
+  const { apiKey } = await getIdentity({ name: "js-e2e-watcher" });
+  client = new Firecrawl({ apiKey, apiUrl: API_URL });
+});
+
+describe("v2.watcher e2e", () => {
+  test("crawl watcher minimal", async () => {
+    // client is initialized in beforeAll
+    const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
+
+    expect(typeof start.id).toBe("string");
+
+    const watcher = client.watcher(start.id, { pollInterval: 2 });
+
+    let snapshots = 0;
+    let documents = 0;
+
+    watcher.on("snapshot", (snap: any) => {
+      snapshots += 1;
+      expect(["scraping", "completed", "failed", "cancelled"]).toContain(snap.status);
+      expect(typeof snap.completed).toBe("number");
+      expect(typeof snap.total).toBe("number");
+    });
+
+    watcher.on("document", (_doc: any) => {
+      documents += 1;
+    });
+
+    const final = await new Promise<any>(async (resolve) => {
+      watcher.on("done", (payload: any) => {
+        resolve(payload);
+      });
+      watcher.on("error", (err: any) => {
+        resolve(err);
+      });
+      await watcher.start();
+    });
+
+    expect(["completed", "failed", "cancelled"]).toContain(final.status);
+    expect(Array.isArray(final.data)).toBe(true);
+    expect(typeof final.id).toBe("string");
+    expect(snapshots).toBeGreaterThanOrEqual(1);
+    expect(documents).toBeGreaterThanOrEqual(0);
+    watcher.close();
+  }, 240_000);
+
+  test("batch watcher with options (kind, pollInterval, timeout)", async () => {
+    // client is initialized in beforeAll
+    const urls = [
+      "https://docs.firecrawl.dev",
+      "https://firecrawl.dev",
+    ];
+
+    const start = await client.startBatchScrape(urls, { options: { formats: ["markdown"] }, ignoreInvalidURLs: true });
+    expect(typeof start.id).toBe("string");
+
+    const watcher = client.watcher(start.id, { kind: "batch", pollInterval: 2, timeout: 180 });
+
+    let snapshots = 0;
+    let gotCompleted = false;
+
+    watcher.on("snapshot", (snap: any) => {
+      snapshots += 1;
+      if (snap.status === "completed") gotCompleted = true;
+      expect(["scraping", "completed", "failed", "cancelled"]).toContain(snap.status);
+    });
+
+    const final = await new Promise<any>(async (resolve) => {
+      watcher.on("done", (payload: any) => {
+        resolve(payload);
+      });
+      watcher.on("error", (err: any) => {
+        resolve(err);
+      });
+      await watcher.start();
+    });
+
+    expect(["completed", "failed", "cancelled"]).toContain(final.status);
+    expect(Array.isArray(final.data)).toBe(true);
+    expect(typeof final.id).toBe("string");
+    expect(snapshots).toBeGreaterThanOrEqual(1);
+    expect(gotCompleted || final.status !== "completed").toBe(true);
+    watcher.close();
+  }, 300_000);
+});
+
--- /dev/null
+++ b/package/src/__tests__/unit/v1/monitor-job-status-retry.test.ts
@@ -0,0 +1,154 @@
+import FirecrawlApp from '../../../index';
+import { describe, test, expect, jest, beforeEach, afterEach } from '@jest/globals';
+
+describe('monitorJobStatus retry logic', () => {
+  let app: FirecrawlApp;
+  let originalConsoleWarn: typeof console.warn;
+
+  beforeEach(() => {
+    app = new FirecrawlApp({ apiKey: 'test-key', apiUrl: 'https://test.com' });
+    originalConsoleWarn = console.warn;
+    console.warn = jest.fn();
+  });
+
+  afterEach(() => {
+    console.warn = originalConsoleWarn;
+    jest.clearAllMocks();
+  });
+
+  test('should retry on socket hang up error', async () => {
+    const socketHangUpError = new Error('socket hang up') as any;
+    socketHangUpError.code = 'ECONNRESET';
+
+    const successResponse = {
+      status: 200,
+      data: { status: 'completed', data: [{ url: 'test.com', markdown: 'test' }] }
+    };
+
+    const originalGetRequest = app.getRequest;
+    let callCount = 0;
+
+    app.getRequest = async function(url: string, headers: any) {
+      callCount++;
+      if (callCount === 1) {
+        throw socketHangUpError;
+      }
+      return successResponse;
+    };
+
+    const result = await app.monitorJobStatus('test-id', {}, 1);
+
+    expect(callCount).toBe(2);
+    expect(result).toEqual(successResponse.data);
+    expect(console.warn).toHaveBeenCalledWith(
+      expect.stringContaining('Network error during job status check (attempt 1/3): socket hang up')
+    );
+  });
+
+  test('should retry on ETIMEDOUT error', async () => {
+    const timeoutError = new Error('timeout') as any;
+    timeoutError.code = 'ETIMEDOUT';
+
+    const successResponse = {
+      status: 200,
+      data: { status: 'completed', data: [{ url: 'test.com', markdown: 'test' }] }
+    };
+
+    const originalGetRequest = app.getRequest;
+    let callCount = 0;
+
+    app.getRequest = async function(url: string, headers: any) {
+      callCount++;
+      if (callCount === 1) {
+        throw timeoutError;
+      }
+      return successResponse;
+    };
+
+    const result = await app.monitorJobStatus('test-id', {}, 1);
+
+    expect(callCount).toBe(2);
+    expect(result).toEqual(successResponse.data);
+  });
+
+  test('should fail after max retries exceeded', async () => {
+    const socketHangUpError = new Error('socket hang up') as any;
+    socketHangUpError.code = 'ECONNRESET';
+
+    app.getRequest = async function(url: string, headers: any) {
+      throw socketHangUpError;
+    };
+
+    await expect(app.monitorJobStatus('test-id', {}, 1)).rejects.toThrow('socket hang up');
+
+    expect(console.warn).toHaveBeenCalledTimes(3);
+  }, 15000);
+
+  test('should not retry on non-retryable errors', async () => {
+    const authError = new Error('Unauthorized') as any;
+    authError.response = { status: 401, data: { error: 'Unauthorized' } };
+
+    app.getRequest = async function(url: string, headers: any) {
+      throw authError;
+    };
+
+    await expect(app.monitorJobStatus('test-id', {}, 1)).rejects.toThrow('Unauthorized');
+
+    expect(console.warn).not.toHaveBeenCalled();
+  });
+
+  test('should retry on HTTP timeout status codes', async () => {
+    const timeoutError = new Error('Request timeout') as any;
+    timeoutError.response = { status: 408, data: { error: 'Request timeout' } };
+
+    const successResponse = {
+      status: 200,
+      data: { status: 'completed', data: [{ url: 'test.com', markdown: 'test' }] }
+    };
+
+    const originalGetRequest = app.getRequest;
+    let callCount = 0;
+
+    app.getRequest = async function(url: string, headers: any) {
+      callCount++;
+      if (callCount === 1) {
+        throw timeoutError;
+      }
+      return successResponse;
+    };
+
+    const result = await app.monitorJobStatus('test-id', {}, 1);
+
+    expect(callCount).toBe(2);
+    expect(result).toEqual(successResponse.data);
+  });
+
+  test('should use exponential backoff for retries', async () => {
+    const socketHangUpError = new Error('socket hang up') as any;
+    socketHangUpError.code = 'ECONNRESET';
+
+    const successResponse = {
+      status: 200,
+      data: { status: 'completed', data: [{ url: 'test.com', markdown: 'test' }] }
+    };
+
+    const originalGetRequest = app.getRequest;
+    let callCount = 0;
+
+    app.getRequest = async function(url: string, headers: any) {
+      callCount++;
+      if (callCount <= 2) {
+        throw socketHangUpError;
+      }
+      return successResponse;
+    };
+
+    const startTime = Date.now();
+    const result = await app.monitorJobStatus('test-id', {}, 1);
+    const endTime = Date.now();
+
+    expect(callCount).toBe(3);
+    expect(result).toEqual(successResponse.data);
+    expect(endTime - startTime).toBeGreaterThan(3000);
+  });
+});
--- /dev/null
+++ b/package/src/__tests__/unit/v2/errorHandler.test.ts
@@ -0,0 +1,19 @@
+import { describe, test, expect } from "@jest/globals";
+import { throwForBadResponse, normalizeAxiosError } from "../../../v2/utils/errorHandler";
+
+describe("v2 utils: errorHandler", () => {
+  test("throwForBadResponse: throws SdkError with message from body.error", () => {
+    const resp: any = { status: 400, data: { error: "bad" } };
+    expect(() => throwForBadResponse(resp, "do thing")).toThrow(/bad/);
+  });
+
+  test("normalizeAxiosError: prefers body.error then err.message", () => {
+    const err: any = {
+      isAxiosError: true,
+      response: { status: 402, data: { error: "payment required" } },
+      message: "network",
+    };
+    expect(() => normalizeAxiosError(err, "action")).toThrow(/payment required/);
+  });
+});
+
--- /dev/null
+++ b/package/src/__tests__/unit/v2/scrape.unit.test.ts
@@ -0,0 +1,11 @@
+/**
+ * Minimal unit test for v2 scrape (no mocking; sanity check payload path)
+ */
+import { FirecrawlClient } from "../../../v2/client";
+
+describe("v2.scrape unit", () => {
+  test("constructor requires apiKey", () => {
+    expect(() => new FirecrawlClient({ apiKey: "", apiUrl: "https://api.firecrawl.dev" })).toThrow();
+  });
+});
+
--- /dev/null
+++ b/package/src/__tests__/unit/v2/validation.test.ts
@@ -0,0 +1,59 @@
+import { describe, test, expect } from "@jest/globals";
+import { ensureValidFormats, ensureValidScrapeOptions } from "../../../v2/utils/validation";
+import type { FormatOption } from "../../../v2/types";
+import { z } from "zod";
+
+describe("v2 utils: validation", () => {
+  test("ensureValidFormats: plain 'json' string is invalid", () => {
+    const formats: FormatOption[] = ["markdown", "json"] as unknown as FormatOption[];
+    expect(() => ensureValidFormats(formats)).toThrow(/json format must be an object/i);
+  });
+
+  test("ensureValidFormats: json format requires prompt and schema", () => {
+    const bad1: FormatOption[] = [{ type: "json", prompt: "p" } as any];
+    const bad2: FormatOption[] = [{ type: "json", schema: {} } as any];
+    expect(() => ensureValidFormats(bad1)).toThrow(/requires 'prompt' and 'schema'/i);
+    expect(() => ensureValidFormats(bad2)).toThrow(/requires 'prompt' and 'schema'/i);
+  });
+
+  test("ensureValidFormats: converts zod schema to JSON schema", () => {
+    const schema = z.object({ title: z.string() });
+    const formats: FormatOption[] = [
+      { type: "json", prompt: "extract", schema } as any,
+    ];
+    ensureValidFormats(formats);
+    const jsonFmt = formats[0] as any;
+    expect(typeof jsonFmt.schema).toBe("object");
+    expect(jsonFmt.schema?.properties).toBeTruthy();
+  });
+
+  test("ensureValidFormats: screenshot quality must be non-negative number", () => {
+    const formats: FormatOption[] = [
+      { type: "screenshot", quality: -1 } as any,
+    ];
+    expect(() => ensureValidFormats(formats)).toThrow(/non-negative number/i);
+  });
+
+  test("ensureValidScrapeOptions: validates timeout and waitFor bounds", () => {
+    expect(() => ensureValidScrapeOptions({ timeout: 0 })).toThrow(/timeout must be positive/i);
+    expect(() => ensureValidScrapeOptions({ waitFor: -1 })).toThrow(/waitFor must be non-negative/i);
+    // valid
+    expect(() => ensureValidScrapeOptions({ timeout: 1000, waitFor: 0 })).not.toThrow();
+  });
+
+  test("ensureValidFormats: accepts screenshot viewport width/height", () => {
+    const formats: FormatOption[] = [
+      { type: "screenshot", viewport: { width: 800, height: 600 } } as any,
+    ];
+    expect(() => ensureValidFormats(formats)).not.toThrow();
+    expect((formats[0] as any).viewport).toEqual({ width: 800, height: 600 });
+  });
+
+  test("ensureValidScrapeOptions: leaves parsers untouched", () => {
+    const options = { parsers: ["pdf", "images"] as string[] } as any;
+    const before = [...options.parsers];
+    expect(() => ensureValidScrapeOptions(options)).not.toThrow();
+    expect(options.parsers).toEqual(before);
+  });
+});
+