firecrawl 1.29.3 → 3.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/README.md +85 -78
- package/audit-ci.jsonc +4 -0
- package/dist/chunk-OIZ6OKY4.js +85 -0
- package/dist/index.cjs +961 -35
- package/dist/index.d.cts +524 -11
- package/dist/index.d.ts +524 -11
- package/dist/index.js +953 -27
- package/dist/package-V5IPFKBE.js +4 -0
- package/package.json +6 -6
- package/src/__tests__/e2e/v2/batch.test.ts +74 -0
- package/src/__tests__/e2e/v2/crawl.test.ts +182 -0
- package/src/__tests__/e2e/v2/extract.test.ts +70 -0
- package/src/__tests__/e2e/v2/map.test.ts +55 -0
- package/src/__tests__/e2e/v2/scrape.test.ts +130 -0
- package/src/__tests__/e2e/v2/search.test.ts +247 -0
- package/src/__tests__/e2e/v2/usage.test.ts +36 -0
- package/src/__tests__/e2e/v2/utils/idmux.ts +58 -0
- package/src/__tests__/e2e/v2/watcher.test.ts +96 -0
- package/src/__tests__/unit/v2/errorHandler.test.ts +19 -0
- package/src/__tests__/unit/v2/scrape.unit.test.ts +11 -0
- package/src/__tests__/unit/v2/validation.test.ts +59 -0
- package/src/index.backup.ts +2146 -0
- package/src/index.ts +27 -2134
- package/src/v1/index.ts +2158 -0
- package/src/v2/client.ts +283 -0
- package/src/v2/methods/batch.ts +119 -0
- package/src/v2/methods/crawl.ts +144 -0
- package/src/v2/methods/extract.ts +86 -0
- package/src/v2/methods/map.ts +37 -0
- package/src/v2/methods/scrape.ts +26 -0
- package/src/v2/methods/search.ts +69 -0
- package/src/v2/methods/usage.ts +39 -0
- package/src/v2/types.ts +337 -0
- package/src/v2/utils/errorHandler.ts +18 -0
- package/src/v2/utils/getVersion.ts +14 -0
- package/src/v2/utils/httpClient.ts +99 -0
- package/src/v2/utils/validation.ts +50 -0
- package/src/v2/watcher.ts +159 -0
- package/tsconfig.json +2 -1
- package/dist/package-Z6F7JDXI.js +0 -111
- /package/src/__tests__/{v1/e2e_withAuth → e2e/v1}/index.test.ts +0 -0
- /package/src/__tests__/{v1/unit → unit/v1}/monitor-job-status-retry.test.ts +0 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "firecrawl",
|
|
3
|
-
"version": "1.29.3",
|
|
3
|
+
"version": "3.0.3",
|
|
4
4
|
"description": "JavaScript SDK for Firecrawl API",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
"build": "tsup",
|
|
17
17
|
"build-and-publish": "npm run build && npm publish --access public",
|
|
18
18
|
"publish-beta": "npm run build && npm publish --access public --tag beta",
|
|
19
|
-
"test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/
|
|
20
|
-
"test:unit": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/
|
|
19
|
+
"test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/e2e/v2/*.test.ts --detectOpenHandles",
|
|
20
|
+
"test:unit": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/unit/v2/*.test.ts"
|
|
21
21
|
},
|
|
22
22
|
"repository": {
|
|
23
23
|
"type": "git",
|
|
24
|
-
"url": "git+https://github.com/
|
|
24
|
+
"url": "git+https://github.com/firecrawl/firecrawl.git"
|
|
25
25
|
},
|
|
26
26
|
"author": "Mendable.ai",
|
|
27
27
|
"license": "MIT",
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
"zod-to-json-schema": "^3.23.0"
|
|
33
33
|
},
|
|
34
34
|
"bugs": {
|
|
35
|
-
"url": "https://github.com/
|
|
35
|
+
"url": "https://github.com/firecrawl/firecrawl/issues"
|
|
36
36
|
},
|
|
37
|
-
"homepage": "https://github.com/
|
|
37
|
+
"homepage": "https://github.com/firecrawl/firecrawl#readme",
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"@jest/globals": "^30.0.5",
|
|
40
40
|
"@types/dotenv": "^8.2.0",
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2E tests for v2 batch scrape (translated from Python tests)
|
|
3
|
+
*/
|
|
4
|
+
import Firecrawl from "../../../index";
|
|
5
|
+
import { config } from "dotenv";
|
|
6
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
7
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
8
|
+
|
|
9
|
+
config();
|
|
10
|
+
|
|
11
|
+
const API_URL = getApiUrl();
|
|
12
|
+
let client: Firecrawl;
|
|
13
|
+
|
|
14
|
+
beforeAll(async () => {
|
|
15
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-batch" });
|
|
16
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
describe("v2.batch e2e", () => {
|
|
20
|
+
test("batch scrape minimal (wait)", async () => {
|
|
21
|
+
const urls = [
|
|
22
|
+
"https://docs.firecrawl.dev",
|
|
23
|
+
"https://firecrawl.dev",
|
|
24
|
+
];
|
|
25
|
+
const job = await client.batchScrape(urls, { options: { formats: ["markdown"] }, pollInterval: 1, timeout: 180 });
|
|
26
|
+
expect(["completed", "failed"]).toContain(job.status);
|
|
27
|
+
expect(job.completed).toBeGreaterThanOrEqual(0);
|
|
28
|
+
expect(job.total).toBeGreaterThanOrEqual(0);
|
|
29
|
+
expect(Array.isArray(job.data)).toBe(true);
|
|
30
|
+
}, 240_000);
|
|
31
|
+
|
|
32
|
+
test("start batch minimal and status", async () => {
|
|
33
|
+
const urls = ["https://docs.firecrawl.dev", "https://firecrawl.dev"];
|
|
34
|
+
const start = await client.startBatchScrape(urls, { options: { formats: ["markdown"] }, ignoreInvalidURLs: true });
|
|
35
|
+
expect(typeof start.id).toBe("string");
|
|
36
|
+
expect(typeof start.url).toBe("string");
|
|
37
|
+
const status = await client.getBatchScrapeStatus(start.id);
|
|
38
|
+
expect(["scraping", "completed", "failed", "cancelled"]).toContain(status.status);
|
|
39
|
+
expect(status.total).toBeGreaterThanOrEqual(0);
|
|
40
|
+
}, 120_000);
|
|
41
|
+
|
|
42
|
+
test("wait batch with all params", async () => {
|
|
43
|
+
const urls = ["https://docs.firecrawl.dev", "https://firecrawl.dev"];
|
|
44
|
+
const job = await client.batchScrape(urls, {
|
|
45
|
+
options: {
|
|
46
|
+
formats: [
|
|
47
|
+
"markdown",
|
|
48
|
+
{ type: "json", prompt: "Extract page title", schema: { type: "object", properties: { title: { type: "string" } }, required: ["title"] } },
|
|
49
|
+
{ type: "changeTracking", prompt: "Track changes", modes: ["json"] },
|
|
50
|
+
],
|
|
51
|
+
onlyMainContent: true,
|
|
52
|
+
mobile: false,
|
|
53
|
+
},
|
|
54
|
+
ignoreInvalidURLs: true,
|
|
55
|
+
maxConcurrency: 2,
|
|
56
|
+
zeroDataRetention: false,
|
|
57
|
+
pollInterval: 1,
|
|
58
|
+
timeout: 180,
|
|
59
|
+
});
|
|
60
|
+
expect(["completed", "failed", "cancelled"]).toContain(job.status);
|
|
61
|
+
expect(job.completed).toBeGreaterThanOrEqual(0);
|
|
62
|
+
expect(job.total).toBeGreaterThanOrEqual(0);
|
|
63
|
+
expect(Array.isArray(job.data)).toBe(true);
|
|
64
|
+
}, 300_000);
|
|
65
|
+
|
|
66
|
+
test("cancel batch", async () => {
|
|
67
|
+
const urls = ["https://docs.firecrawl.dev", "https://firecrawl.dev"];
|
|
68
|
+
const start = await client.startBatchScrape(urls, { options: { formats: ["markdown"] }, maxConcurrency: 1 });
|
|
69
|
+
expect(typeof start.id).toBe("string");
|
|
70
|
+
const cancelled = await client.cancelBatchScrape(start.id);
|
|
71
|
+
expect(cancelled).toBe(true);
|
|
72
|
+
}, 120_000);
|
|
73
|
+
});
|
|
74
|
+
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2E tests for v2 crawl (translated from Python tests)
|
|
3
|
+
*/
|
|
4
|
+
import Firecrawl from "../../../index";
|
|
5
|
+
import { config } from "dotenv";
|
|
6
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
7
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
8
|
+
|
|
9
|
+
config();
|
|
10
|
+
|
|
11
|
+
const API_URL = getApiUrl();
|
|
12
|
+
let client: Firecrawl;
|
|
13
|
+
|
|
14
|
+
beforeAll(async () => {
|
|
15
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-crawl" });
|
|
16
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
describe("v2.crawl e2e", () => {
|
|
20
|
+
|
|
21
|
+
test("start crawl minimal request", async () => {
|
|
22
|
+
if (!client) throw new Error();
|
|
23
|
+
const job = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
|
|
24
|
+
expect(typeof job.id).toBe("string");
|
|
25
|
+
expect(typeof job.url).toBe("string");
|
|
26
|
+
}, 90_000);
|
|
27
|
+
|
|
28
|
+
test("start crawl with options", async () => {
|
|
29
|
+
if (!client) throw new Error();
|
|
30
|
+
const job = await client.startCrawl("https://docs.firecrawl.dev", { limit: 5, maxDiscoveryDepth: 2 });
|
|
31
|
+
expect(typeof job.id).toBe("string");
|
|
32
|
+
expect(typeof job.url).toBe("string");
|
|
33
|
+
}, 90_000);
|
|
34
|
+
|
|
35
|
+
test("start crawl with prompt", async () => {
|
|
36
|
+
if (!client) throw new Error();
|
|
37
|
+
const job = await client.startCrawl("https://firecrawl.dev", { prompt: "Extract all blog posts", limit: 3 });
|
|
38
|
+
expect(typeof job.id).toBe("string");
|
|
39
|
+
expect(typeof job.url).toBe("string");
|
|
40
|
+
}, 90_000);
|
|
41
|
+
|
|
42
|
+
test("get crawl status", async () => {
|
|
43
|
+
if (!client) throw new Error();
|
|
44
|
+
const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
|
|
45
|
+
const status = await client.getCrawlStatus(start.id);
|
|
46
|
+
expect(["scraping", "completed", "failed", "cancelled"]).toContain(status.status);
|
|
47
|
+
expect(status.completed).toBeGreaterThanOrEqual(0);
|
|
48
|
+
// next/expiresAt may be null/undefined depending on state; check shape
|
|
49
|
+
expect(Array.isArray(status.data)).toBe(true);
|
|
50
|
+
}, 120_000);
|
|
51
|
+
|
|
52
|
+
test("cancel crawl", async () => {
|
|
53
|
+
if (!client) throw new Error();
|
|
54
|
+
const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
|
|
55
|
+
const ok = await client.cancelCrawl(start.id);
|
|
56
|
+
expect(ok).toBe(true);
|
|
57
|
+
}, 120_000);
|
|
58
|
+
|
|
59
|
+
test("get crawl errors", async () => {
|
|
60
|
+
if (!client) throw new Error();
|
|
61
|
+
const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 3 });
|
|
62
|
+
const resp = await client.getCrawlErrors(start.id);
|
|
63
|
+
expect(resp).toHaveProperty("errors");
|
|
64
|
+
expect(resp).toHaveProperty("robotsBlocked");
|
|
65
|
+
expect(Array.isArray(resp.errors)).toBe(true);
|
|
66
|
+
expect(Array.isArray(resp.robotsBlocked)).toBe(true);
|
|
67
|
+
for (const e of resp.errors) {
|
|
68
|
+
expect(typeof e.id === "string" || e.id == null).toBe(true);
|
|
69
|
+
expect(typeof e.timestamp === "string" || e.timestamp == null).toBe(true);
|
|
70
|
+
expect(typeof e.url === "string" || e.url == null).toBe(true);
|
|
71
|
+
expect(typeof e.error === "string" || e.error == null).toBe(true);
|
|
72
|
+
}
|
|
73
|
+
}, 120_000);
|
|
74
|
+
|
|
75
|
+
test("get crawl errors with invalid id should throw", async () => {
|
|
76
|
+
if (!client) throw new Error();
|
|
77
|
+
await expect(client.getCrawlErrors("invalid-job-id-12345")).rejects.toThrow();
|
|
78
|
+
}, 60_000);
|
|
79
|
+
|
|
80
|
+
test("get active crawls", async () => {
|
|
81
|
+
if (!client) throw new Error();
|
|
82
|
+
const active = await client.getActiveCrawls();
|
|
83
|
+
expect(typeof active.success).toBe("boolean");
|
|
84
|
+
expect(Array.isArray(active.crawls)).toBe(true);
|
|
85
|
+
for (const c of active.crawls) {
|
|
86
|
+
expect(typeof c.id).toBe("string");
|
|
87
|
+
expect(typeof c.teamId).toBe("string");
|
|
88
|
+
expect(typeof c.url).toBe("string");
|
|
89
|
+
if (c.options != null) {
|
|
90
|
+
expect(typeof c.options === "object").toBe(true);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}, 90_000);
|
|
94
|
+
|
|
95
|
+
test("get active crawls with running crawl", async () => {
|
|
96
|
+
if (!client) throw new Error();
|
|
97
|
+
const start = await client.startCrawl("https://docs.firecrawl.dev", { limit: 5 });
|
|
98
|
+
await new Promise(resolve => setTimeout(resolve, 300));
|
|
99
|
+
const active = await client.getActiveCrawls();
|
|
100
|
+
expect(Array.isArray(active.crawls)).toBe(true);
|
|
101
|
+
const ids = active.crawls.map(c => c.id);
|
|
102
|
+
expect(ids.includes(start.id)).toBe(true);
|
|
103
|
+
await client.cancelCrawl(start.id);
|
|
104
|
+
}, 120_000);
|
|
105
|
+
|
|
106
|
+
test("crawl with wait", async () => {
|
|
107
|
+
if (!client) throw new Error();
|
|
108
|
+
const job = await client.crawl("https://docs.firecrawl.dev", { limit: 3, maxDiscoveryDepth: 2, pollInterval: 1, timeout: 120 });
|
|
109
|
+
expect(["completed", "failed"]).toContain(job.status);
|
|
110
|
+
expect(job.completed).toBeGreaterThanOrEqual(0);
|
|
111
|
+
expect(job.total).toBeGreaterThanOrEqual(0);
|
|
112
|
+
expect(Array.isArray(job.data)).toBe(true);
|
|
113
|
+
}, 180_000);
|
|
114
|
+
|
|
115
|
+
test("crawl with prompt and wait", async () => {
|
|
116
|
+
if (!client) throw new Error();
|
|
117
|
+
const job = await client.crawl("https://docs.firecrawl.dev", { prompt: "Extract all blog posts", limit: 3, pollInterval: 1, timeout: 120 });
|
|
118
|
+
expect(["completed", "failed"]).toContain(job.status);
|
|
119
|
+
expect(job.completed).toBeGreaterThanOrEqual(0);
|
|
120
|
+
expect(job.total).toBeGreaterThanOrEqual(0);
|
|
121
|
+
expect(Array.isArray(job.data)).toBe(true);
|
|
122
|
+
}, 180_000);
|
|
123
|
+
|
|
124
|
+
test("crawl with scrape options", async () => {
|
|
125
|
+
if (!client) throw new Error();
|
|
126
|
+
const job = await client.startCrawl("https://docs.firecrawl.dev", {
|
|
127
|
+
limit: 2,
|
|
128
|
+
scrapeOptions: { formats: ["markdown", "links"], onlyMainContent: false, mobile: true },
|
|
129
|
+
});
|
|
130
|
+
expect(typeof job.id).toBe("string");
|
|
131
|
+
}, 120_000);
|
|
132
|
+
|
|
133
|
+
test("crawl with json format object", async () => {
|
|
134
|
+
if (!client) throw new Error();
|
|
135
|
+
const job = await client.startCrawl("https://docs.firecrawl.dev", {
|
|
136
|
+
limit: 2,
|
|
137
|
+
scrapeOptions: { formats: [{ type: "json", prompt: "Extract page title", schema: { type: "object", properties: { title: { type: "string" } }, required: ["title"] } }] },
|
|
138
|
+
});
|
|
139
|
+
expect(typeof job.id).toBe("string");
|
|
140
|
+
}, 120_000);
|
|
141
|
+
|
|
142
|
+
test("crawl all parameters", async () => {
|
|
143
|
+
if (!client) throw new Error();
|
|
144
|
+
const job = await client.startCrawl("https://docs.firecrawl.dev", {
|
|
145
|
+
prompt: "Extract all blog posts and documentation",
|
|
146
|
+
includePaths: ["/blog/*", "/docs/*"],
|
|
147
|
+
excludePaths: ["/admin/*"],
|
|
148
|
+
maxDiscoveryDepth: 3,
|
|
149
|
+
sitemap: "skip",
|
|
150
|
+
ignoreQueryParameters: true,
|
|
151
|
+
limit: 5,
|
|
152
|
+
crawlEntireDomain: true,
|
|
153
|
+
allowExternalLinks: false,
|
|
154
|
+
allowSubdomains: true,
|
|
155
|
+
delay: 1,
|
|
156
|
+
maxConcurrency: 2,
|
|
157
|
+
webhook: "https://example.com/hook",
|
|
158
|
+
scrapeOptions: {
|
|
159
|
+
formats: ["markdown", "html"],
|
|
160
|
+
headers: { "User-Agent": "Test Bot" },
|
|
161
|
+
includeTags: ["h1", "h2"],
|
|
162
|
+
excludeTags: ["nav"],
|
|
163
|
+
onlyMainContent: false,
|
|
164
|
+
timeout: 15_000,
|
|
165
|
+
waitFor: 2000,
|
|
166
|
+
mobile: true,
|
|
167
|
+
skipTlsVerification: true,
|
|
168
|
+
removeBase64Images: false,
|
|
169
|
+
},
|
|
170
|
+
zeroDataRetention: false,
|
|
171
|
+
});
|
|
172
|
+
expect(typeof job.id).toBe("string");
|
|
173
|
+
}, 180_000);
|
|
174
|
+
|
|
175
|
+
test("crawl params preview", async () => {
|
|
176
|
+
if (!client) throw new Error();
|
|
177
|
+
const params = await client.crawlParamsPreview("https://docs.firecrawl.dev", "Extract all blog posts and documentation");
|
|
178
|
+
expect(params && typeof params === "object").toBe(true);
|
|
179
|
+
// Optional fields may or may not be present; just assert object shape
|
|
180
|
+
}, 60_000);
|
|
181
|
+
});
|
|
182
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2E tests for v2 extract (proxied to v1), translated from Python tests
|
|
3
|
+
*/
|
|
4
|
+
import Firecrawl from "../../../index";
|
|
5
|
+
import { config } from "dotenv";
|
|
6
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
7
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
8
|
+
import { z } from "zod";
|
|
9
|
+
|
|
10
|
+
config();
|
|
11
|
+
|
|
12
|
+
const API_URL = getApiUrl();
|
|
13
|
+
let client: Firecrawl;
|
|
14
|
+
|
|
15
|
+
beforeAll(async () => {
|
|
16
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-extract" });
|
|
17
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
describe("v2.extract e2e", () => {
|
|
21
|
+
test("extract minimal with prompt", async () => {
|
|
22
|
+
const resp = await client.extract({ urls: ["https://docs.firecrawl.dev"], prompt: "Extract the main page title" });
|
|
23
|
+
expect(typeof resp.success === "boolean" || resp.success == null).toBe(true);
|
|
24
|
+
}, 120_000);
|
|
25
|
+
|
|
26
|
+
test("extract with schema", async () => {
|
|
27
|
+
const schema = {
|
|
28
|
+
type: "object",
|
|
29
|
+
properties: { title: { type: "string" } },
|
|
30
|
+
required: ["title"],
|
|
31
|
+
} as const;
|
|
32
|
+
const resp = await client.extract({
|
|
33
|
+
urls: ["https://docs.firecrawl.dev"],
|
|
34
|
+
schema,
|
|
35
|
+
prompt: "Extract the main page title",
|
|
36
|
+
showSources: true,
|
|
37
|
+
enableWebSearch: false,
|
|
38
|
+
});
|
|
39
|
+
expect(typeof resp.success === "boolean" || resp.success == null).toBe(true);
|
|
40
|
+
if ((resp as any).sources != null) {
|
|
41
|
+
expect(typeof (resp as any).sources).toBe("object");
|
|
42
|
+
}
|
|
43
|
+
if (resp.data != null) {
|
|
44
|
+
expect(typeof resp.data).toBe("object");
|
|
45
|
+
expect((resp.data as any).title).toBeTruthy();
|
|
46
|
+
}
|
|
47
|
+
}, 180_000);
|
|
48
|
+
|
|
49
|
+
test("extract with zod schema", async () => {
|
|
50
|
+
const schema = z.object({
|
|
51
|
+
title: z.string(),
|
|
52
|
+
});
|
|
53
|
+
const resp = await client.extract({
|
|
54
|
+
urls: ["https://docs.firecrawl.dev"],
|
|
55
|
+
schema: schema,
|
|
56
|
+
prompt: "Extract the main page title",
|
|
57
|
+
showSources: true,
|
|
58
|
+
enableWebSearch: false,
|
|
59
|
+
});
|
|
60
|
+
expect(typeof resp.success === "boolean" || resp.success == null).toBe(true);
|
|
61
|
+
if ((resp as any).sources != null) {
|
|
62
|
+
expect(typeof (resp as any).sources).toBe("object");
|
|
63
|
+
}
|
|
64
|
+
if (resp.data != null) {
|
|
65
|
+
expect(typeof resp.data).toBe("object");
|
|
66
|
+
expect(schema.safeParse(resp.data).success).toBe(true);
|
|
67
|
+
}
|
|
68
|
+
}, 180_000);
|
|
69
|
+
});
|
|
70
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2E tests for v2 map (translated from Python tests)
|
|
3
|
+
*/
|
|
4
|
+
import Firecrawl from "../../../index";
|
|
5
|
+
import { config } from "dotenv";
|
|
6
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
7
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
8
|
+
|
|
9
|
+
config();
|
|
10
|
+
|
|
11
|
+
const API_URL = getApiUrl();
|
|
12
|
+
let client: Firecrawl;
|
|
13
|
+
|
|
14
|
+
beforeAll(async () => {
|
|
15
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-map" });
|
|
16
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
describe("v2.map e2e", () => {
|
|
20
|
+
|
|
21
|
+
test("minimal request", async () => {
|
|
22
|
+
if (!client) throw new Error();
|
|
23
|
+
const resp = await client.map("https://docs.firecrawl.dev");
|
|
24
|
+
|
|
25
|
+
expect(resp).toBeTruthy();
|
|
26
|
+
expect(Array.isArray(resp.links)).toBe(true);
|
|
27
|
+
|
|
28
|
+
if (resp.links.length > 0) {
|
|
29
|
+
const first: any = resp.links[0];
|
|
30
|
+
expect(typeof first.url).toBe("string");
|
|
31
|
+
expect(first.url.startsWith("http")).toBe(true);
|
|
32
|
+
}
|
|
33
|
+
}, 90_000);
|
|
34
|
+
|
|
35
|
+
test.each(["only", "skip", "include"]) ("with options sitemap=%s", async (sitemap) => {
|
|
36
|
+
if (!client) throw new Error();
|
|
37
|
+
const resp = await client.map("https://docs.firecrawl.dev", {
|
|
38
|
+
search: "docs",
|
|
39
|
+
includeSubdomains: true,
|
|
40
|
+
limit: 10,
|
|
41
|
+
sitemap: sitemap as any,
|
|
42
|
+
timeout: 15_000,
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
expect(resp).toBeTruthy();
|
|
46
|
+
expect(Array.isArray(resp.links)).toBe(true);
|
|
47
|
+
expect(resp.links.length).toBeLessThanOrEqual(10);
|
|
48
|
+
|
|
49
|
+
for (const link of resp.links as any[]) {
|
|
50
|
+
expect(typeof link.url).toBe("string");
|
|
51
|
+
expect(link.url.startsWith("http")).toBe(true);
|
|
52
|
+
}
|
|
53
|
+
}, 120_000);
|
|
54
|
+
});
|
|
55
|
+
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2E tests for v2 scrape
|
|
3
|
+
*/
|
|
4
|
+
import Firecrawl from "../../../index";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
import { config } from "dotenv";
|
|
7
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
8
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
9
|
+
|
|
10
|
+
config();
|
|
11
|
+
|
|
12
|
+
const API_URL = getApiUrl();
|
|
13
|
+
let client: Firecrawl;
|
|
14
|
+
|
|
15
|
+
beforeAll(async () => {
|
|
16
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-scrape" });
|
|
17
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
describe("v2.scrape e2e", () => {
|
|
21
|
+
|
|
22
|
+
const assertValidDocument = (doc: any) => {
|
|
23
|
+
expect(doc).toBeTruthy();
|
|
24
|
+
const hasContent = Boolean(doc.markdown?.length) || Boolean(doc.html?.length) || Boolean(doc.rawHtml?.length);
|
|
25
|
+
expect(hasContent).toBe(true);
|
|
26
|
+
expect(doc.metadata).toBeTruthy();
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
test("minimal: scrape only required params", async () => {
|
|
30
|
+
if (!client) throw new Error();
|
|
31
|
+
const doc = await client.scrape("https://docs.firecrawl.dev");
|
|
32
|
+
assertValidDocument(doc);
|
|
33
|
+
}, 60_000);
|
|
34
|
+
|
|
35
|
+
test("maximal: scrape with all options", async () => {
|
|
36
|
+
if (!client) throw new Error();
|
|
37
|
+
const doc = await client.scrape("https://docs.firecrawl.dev", {
|
|
38
|
+
formats: [
|
|
39
|
+
"markdown",
|
|
40
|
+
"html",
|
|
41
|
+
"rawHtml",
|
|
42
|
+
"links",
|
|
43
|
+
{ type: "screenshot", fullPage: true, quality: 80, viewport: { width: 1280, height: 800 } },
|
|
44
|
+
{
|
|
45
|
+
type: "json",
|
|
46
|
+
prompt: "Summarize the page and list links",
|
|
47
|
+
schema: {
|
|
48
|
+
type: "object",
|
|
49
|
+
properties: {
|
|
50
|
+
summary: { type: "string" },
|
|
51
|
+
links: { type: "array", items: { type: "string", format: "uri" } },
|
|
52
|
+
},
|
|
53
|
+
required: ["summary"],
|
|
54
|
+
},
|
|
55
|
+
},
|
|
56
|
+
],
|
|
57
|
+
parsers: ["pdf"],
|
|
58
|
+
headers: { "User-Agent": "firecrawl-tests" },
|
|
59
|
+
includeTags: ["article"],
|
|
60
|
+
excludeTags: ["nav"],
|
|
61
|
+
onlyMainContent: true,
|
|
62
|
+
waitFor: 1000,
|
|
63
|
+
timeout: 30_000,
|
|
64
|
+
location: { country: "us", languages: ["en"] },
|
|
65
|
+
mobile: false,
|
|
66
|
+
skipTlsVerification: false,
|
|
67
|
+
removeBase64Images: true,
|
|
68
|
+
blockAds: true,
|
|
69
|
+
proxy: "auto",
|
|
70
|
+
storeInCache: true,
|
|
71
|
+
maxAge: 60_000,
|
|
72
|
+
});
|
|
73
|
+
assertValidDocument(doc);
|
|
74
|
+
}, 90_000);
|
|
75
|
+
|
|
76
|
+
test("json format with zod schema (auto-converted internally)", async () => {
|
|
77
|
+
if (!client) throw new Error();
|
|
78
|
+
const zodSchema = z.object({
|
|
79
|
+
title: z.string().min(1),
|
|
80
|
+
items: z.array(z.string().url()).optional(),
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
const doc = await client.scrape("https://docs.firecrawl.dev", {
|
|
84
|
+
formats: [
|
|
85
|
+
{
|
|
86
|
+
type: "json",
|
|
87
|
+
prompt: "Extract title and items",
|
|
88
|
+
schema: zodSchema,
|
|
89
|
+
},
|
|
90
|
+
],
|
|
91
|
+
});
|
|
92
|
+
expect(doc).toBeTruthy();
|
|
93
|
+
}, 90_000);
|
|
94
|
+
|
|
95
|
+
test("summary format returns summary string", async () => {
|
|
96
|
+
if (!client) throw new Error();
|
|
97
|
+
const doc = await client.scrape("https://firecrawl.dev", { formats: ["summary"] });
|
|
98
|
+
expect(typeof doc.summary).toBe("string");
|
|
99
|
+
expect((doc.summary || "").length).toBeGreaterThan(10);
|
|
100
|
+
}, 90_000);
|
|
101
|
+
|
|
102
|
+
test.each([
|
|
103
|
+
["markdown", "markdown"],
|
|
104
|
+
["html", "html"],
|
|
105
|
+
["rawHtml", "rawHtml"],
|
|
106
|
+
["links", "links"],
|
|
107
|
+
["screenshot", "screenshot"],
|
|
108
|
+
])("basic format: %s", async (fmt, expectField) => {
|
|
109
|
+
if (!client) throw new Error();
|
|
110
|
+
const doc = await client.scrape("https://docs.firecrawl.dev", { formats: [fmt as any] });
|
|
111
|
+
if (expectField !== "links" && expectField !== "screenshot") {
|
|
112
|
+
assertValidDocument(doc);
|
|
113
|
+
}
|
|
114
|
+
if (expectField === "markdown") expect(doc.markdown).toBeTruthy();
|
|
115
|
+
if (expectField === "html") expect(doc.html).toBeTruthy();
|
|
116
|
+
if (expectField === "rawHtml") expect(doc.rawHtml).toBeTruthy();
|
|
117
|
+
if (expectField === "screenshot") expect(doc.screenshot).toBeTruthy();
|
|
118
|
+
if (expectField === "links") {
|
|
119
|
+
expect(Array.isArray(doc.links)).toBe(true);
|
|
120
|
+
expect((doc.links || []).length).toBeGreaterThan(0);
|
|
121
|
+
}
|
|
122
|
+
}, 90_000);
|
|
123
|
+
|
|
124
|
+
test("invalid url should throw", async () => {
|
|
125
|
+
if (!client) throw new Error();
|
|
126
|
+
await expect(client.scrape("")).rejects.toThrow("URL cannot be empty");
|
|
127
|
+
await expect(client.scrape(" ")).rejects.toThrow("URL cannot be empty");
|
|
128
|
+
});
|
|
129
|
+
});
|
|
130
|
+
|