firecrawl 4.18.3 → 4.18.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/dist/{chunk-XC6YCBFX.js → chunk-SEIHZPTI.js} +3 -2
- package/dist/index.cjs +153 -4
- package/dist/index.d.cts +29 -1
- package/dist/index.d.ts +29 -1
- package/dist/index.js +152 -4
- package/dist/{package-PW6FMSAE.js → package-ASKBBK6V.js} +1 -1
- package/package.json +1 -1
- package/src/__tests__/e2e/v2/parse.test.ts +67 -0
- package/src/__tests__/unit/v2/parse.unit.test.ts +40 -0
- package/src/v2/client.ts +21 -0
- package/src/v2/methods/crawl.ts +1 -0
- package/src/v2/methods/parse.ts +90 -0
- package/src/v2/types.ts +46 -0
- package/src/v2/utils/httpClient.ts +37 -4
- package/src/v2/utils/validation.ts +88 -1
package/README.md
CHANGED
|
@@ -46,6 +46,26 @@ const url = 'https://example.com';
|
|
|
46
46
|
const scrapedData = await app.scrape(url);
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
+
### Parsing uploaded files
|
|
50
|
+
|
|
51
|
+
Use `parse` to upload a file (`html`, `pdf`, `docx`, etc.) as multipart form data and process it through the same parsing pipeline.
|
|
52
|
+
Parse does not support browser-only formats/options like `changeTracking`, `screenshot`, `branding`, `actions`, `waitFor`, `location`, or `mobile`.
|
|
53
|
+
|
|
54
|
+
```js
|
|
55
|
+
const parsed = await app.parse(
|
|
56
|
+
{
|
|
57
|
+
data: '<html><body><h1>Hello parse</h1></body></html>',
|
|
58
|
+
filename: 'upload.html',
|
|
59
|
+
contentType: 'text/html',
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
formats: ['markdown'],
|
|
63
|
+
}
|
|
64
|
+
);
|
|
65
|
+
|
|
66
|
+
console.log(parsed.markdown);
|
|
67
|
+
```
|
|
68
|
+
|
|
49
69
|
### Crawling a Website
|
|
50
70
|
|
|
51
71
|
To crawl a website with error handling, use the `crawl` method. It takes the starting URL and optional parameters, including limits and per‑page `scrapeOptions`.
|
|
@@ -8,7 +8,7 @@ var require_package = __commonJS({
|
|
|
8
8
|
"package.json"(exports, module) {
|
|
9
9
|
module.exports = {
|
|
10
10
|
name: "@mendable/firecrawl-js",
|
|
11
|
-
version: "4.18.3",
|
|
11
|
+
version: "4.18.5",
|
|
12
12
|
description: "JavaScript SDK for Firecrawl API",
|
|
13
13
|
main: "dist/index.js",
|
|
14
14
|
types: "dist/index.d.ts",
|
|
@@ -78,7 +78,8 @@ var require_package = __commonJS({
|
|
|
78
78
|
"picomatch@<4.0.4": ">=4.0.4",
|
|
79
79
|
handlebars: ">=4.7.9",
|
|
80
80
|
"brace-expansion": ">=5.0.5",
|
|
81
|
-
"axios@<1.15.0": "1.15.0"
|
|
81
|
+
"axios@<1.15.0": "1.15.0",
|
|
82
|
+
"follow-redirects@<1.16.0": ">=1.16.0 <2.0.0"
|
|
82
83
|
}
|
|
83
84
|
}
|
|
84
85
|
};
|
package/dist/index.cjs
CHANGED
|
@@ -35,7 +35,7 @@ var require_package = __commonJS({
|
|
|
35
35
|
"package.json"(exports2, module2) {
|
|
36
36
|
module2.exports = {
|
|
37
37
|
name: "@mendable/firecrawl-js",
|
|
38
|
-
version: "4.18.3",
|
|
38
|
+
version: "4.18.5",
|
|
39
39
|
description: "JavaScript SDK for Firecrawl API",
|
|
40
40
|
main: "dist/index.js",
|
|
41
41
|
types: "dist/index.d.ts",
|
|
@@ -105,7 +105,8 @@ var require_package = __commonJS({
|
|
|
105
105
|
"picomatch@<4.0.4": ">=4.0.4",
|
|
106
106
|
handlebars: ">=4.7.9",
|
|
107
107
|
"brace-expansion": ">=5.0.5",
|
|
108
|
-
"axios@<1.15.0": "1.15.0"
|
|
108
|
+
"axios@<1.15.0": "1.15.0",
|
|
109
|
+
"follow-redirects@<1.16.0": ">=1.16.0 <2.0.0"
|
|
109
110
|
}
|
|
110
111
|
}
|
|
111
112
|
};
|
|
@@ -157,7 +158,6 @@ var HttpClient = class {
|
|
|
157
158
|
baseURL: this.apiUrl,
|
|
158
159
|
timeout: options.timeoutMs ?? 3e5,
|
|
159
160
|
headers: {
|
|
160
|
-
"Content-Type": "application/json",
|
|
161
161
|
Authorization: `Bearer ${this.apiKey}`
|
|
162
162
|
},
|
|
163
163
|
transitional: { clarifyTimeoutError: true }
|
|
@@ -178,13 +178,20 @@ var HttpClient = class {
|
|
|
178
178
|
for (let attempt = 0; attempt < this.maxRetries; attempt++) {
|
|
179
179
|
try {
|
|
180
180
|
const cfg = { ...config };
|
|
181
|
-
|
|
181
|
+
const isFormDataBody = typeof FormData !== "undefined" && cfg.data instanceof FormData;
|
|
182
|
+
const isPlainObjectBody = !isFormDataBody && cfg.data != null && typeof cfg.data === "object" && !Array.isArray(cfg.data);
|
|
183
|
+
if (isPlainObjectBody && cfg.method && ["post", "put", "patch"].includes(cfg.method.toLowerCase())) {
|
|
182
184
|
const data = cfg.data ?? {};
|
|
183
185
|
cfg.data = { ...data, origin: typeof data.origin === "string" && data.origin.includes("mcp") ? data.origin : `js-sdk@${version}` };
|
|
184
186
|
if (typeof data.timeout === "number") {
|
|
185
187
|
cfg.timeout = data.timeout + 5e3;
|
|
186
188
|
}
|
|
187
189
|
}
|
|
190
|
+
if (isFormDataBody) {
|
|
191
|
+
cfg.headers = { ...cfg.headers || {} };
|
|
192
|
+
delete cfg.headers["Content-Type"];
|
|
193
|
+
delete cfg.headers["content-type"];
|
|
194
|
+
}
|
|
188
195
|
const res = await this.instance.request(cfg);
|
|
189
196
|
if (res.status === 502 && attempt < this.maxRetries - 1) {
|
|
190
197
|
await this.sleep(this.backoffFactor * Math.pow(2, attempt));
|
|
@@ -209,6 +216,15 @@ var HttpClient = class {
|
|
|
209
216
|
post(endpoint, body, headers) {
|
|
210
217
|
return this.request({ method: "post", url: endpoint, data: body, headers });
|
|
211
218
|
}
|
|
219
|
+
postMultipart(endpoint, formData, headers, timeoutMs) {
|
|
220
|
+
return this.request({
|
|
221
|
+
method: "post",
|
|
222
|
+
url: endpoint,
|
|
223
|
+
data: formData,
|
|
224
|
+
headers,
|
|
225
|
+
timeout: timeoutMs
|
|
226
|
+
});
|
|
227
|
+
}
|
|
212
228
|
get(endpoint, headers) {
|
|
213
229
|
return this.request({ method: "get", url: endpoint, headers });
|
|
214
230
|
}
|
|
@@ -355,6 +371,76 @@ function ensureValidScrapeOptions(options) {
|
|
|
355
371
|
}
|
|
356
372
|
ensureValidFormats(options.formats);
|
|
357
373
|
}
|
|
374
|
+
function ensureValidParseFormats(formats) {
|
|
375
|
+
if (!formats) return;
|
|
376
|
+
for (const fmt of formats) {
|
|
377
|
+
if (typeof fmt === "string") {
|
|
378
|
+
if (fmt === "json") {
|
|
379
|
+
throw new Error("json format must be an object with { type: 'json', prompt, schema }");
|
|
380
|
+
}
|
|
381
|
+
if (fmt === "screenshot") {
|
|
382
|
+
throw new Error("parse does not support screenshot format");
|
|
383
|
+
}
|
|
384
|
+
if (fmt === "changeTracking") {
|
|
385
|
+
throw new Error("parse does not support changeTracking format");
|
|
386
|
+
}
|
|
387
|
+
if (fmt === "branding") {
|
|
388
|
+
throw new Error("parse does not support branding format");
|
|
389
|
+
}
|
|
390
|
+
continue;
|
|
391
|
+
}
|
|
392
|
+
const type = fmt.type;
|
|
393
|
+
if (type === "changeTracking") {
|
|
394
|
+
throw new Error("parse does not support changeTracking format");
|
|
395
|
+
}
|
|
396
|
+
if (type === "screenshot") {
|
|
397
|
+
throw new Error("parse does not support screenshot format");
|
|
398
|
+
}
|
|
399
|
+
if (type === "branding") {
|
|
400
|
+
throw new Error("parse does not support branding format");
|
|
401
|
+
}
|
|
402
|
+
if (fmt.type === "json") {
|
|
403
|
+
const j = fmt;
|
|
404
|
+
if (!j.prompt && !j.schema) {
|
|
405
|
+
throw new Error("json format requires either 'prompt' or 'schema' (or both)");
|
|
406
|
+
}
|
|
407
|
+
const maybeSchema = j.schema;
|
|
408
|
+
if (isZodSchema(maybeSchema)) {
|
|
409
|
+
j.schema = zodSchemaToJsonSchema(maybeSchema);
|
|
410
|
+
} else if (looksLikeZodShape(maybeSchema)) {
|
|
411
|
+
throw new Error(
|
|
412
|
+
"json format schema appears to be a Zod schema's .shape property. Pass the Zod schema directly (e.g., `schema: MySchema`) instead of `schema: MySchema.shape`. The SDK will automatically convert Zod schemas to JSON Schema format."
|
|
413
|
+
);
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
function ensureValidParseOptions(options) {
|
|
419
|
+
if (!options) return;
|
|
420
|
+
if (options.timeout != null && options.timeout <= 0) {
|
|
421
|
+
throw new Error("timeout must be positive");
|
|
422
|
+
}
|
|
423
|
+
const raw = options;
|
|
424
|
+
if (raw.waitFor !== void 0) {
|
|
425
|
+
throw new Error("parse does not support waitFor");
|
|
426
|
+
}
|
|
427
|
+
if (raw.actions !== void 0) {
|
|
428
|
+
throw new Error("parse does not support actions");
|
|
429
|
+
}
|
|
430
|
+
if (raw.location !== void 0) {
|
|
431
|
+
throw new Error("parse does not support location overrides");
|
|
432
|
+
}
|
|
433
|
+
if (raw.mobile !== void 0) {
|
|
434
|
+
throw new Error("parse does not support mobile rendering");
|
|
435
|
+
}
|
|
436
|
+
if (raw.maxAge !== void 0 || raw.minAge !== void 0 || raw.storeInCache !== void 0) {
|
|
437
|
+
throw new Error("parse does not support cache/index options");
|
|
438
|
+
}
|
|
439
|
+
if (raw.proxy !== void 0 && raw.proxy !== "basic" && raw.proxy !== "auto") {
|
|
440
|
+
throw new Error("parse only supports proxy values of 'basic' or 'auto'");
|
|
441
|
+
}
|
|
442
|
+
ensureValidParseFormats(options.formats);
|
|
443
|
+
}
|
|
358
444
|
|
|
359
445
|
// src/v2/utils/errorHandler.ts
|
|
360
446
|
var import_axios2 = require("axios");
|
|
@@ -455,6 +541,65 @@ async function stopInteraction(http, jobId) {
|
|
|
455
541
|
}
|
|
456
542
|
}
|
|
457
543
|
|
|
544
|
+
// src/v2/methods/parse.ts
|
|
545
|
+
function toUploadBlob(input, contentType) {
|
|
546
|
+
if (typeof Blob !== "undefined" && input instanceof Blob) {
|
|
547
|
+
if (contentType && input.type !== contentType) {
|
|
548
|
+
return new Blob([input], { type: contentType });
|
|
549
|
+
}
|
|
550
|
+
return input;
|
|
551
|
+
}
|
|
552
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(input)) {
|
|
553
|
+
return new Blob([input], { type: contentType });
|
|
554
|
+
}
|
|
555
|
+
if (input instanceof ArrayBuffer) {
|
|
556
|
+
return new Blob([input], { type: contentType });
|
|
557
|
+
}
|
|
558
|
+
if (ArrayBuffer.isView(input)) {
|
|
559
|
+
return new Blob([input], { type: contentType });
|
|
560
|
+
}
|
|
561
|
+
if (typeof input === "string") {
|
|
562
|
+
return new Blob([input], { type: contentType ?? "text/plain; charset=utf-8" });
|
|
563
|
+
}
|
|
564
|
+
throw new Error("Unsupported parse file data type");
|
|
565
|
+
}
|
|
566
|
+
async function parse(http, file, options) {
|
|
567
|
+
if (!file || !file.filename || !file.filename.trim()) {
|
|
568
|
+
throw new Error("filename cannot be empty");
|
|
569
|
+
}
|
|
570
|
+
if (file.data == null) {
|
|
571
|
+
throw new Error("file data cannot be empty");
|
|
572
|
+
}
|
|
573
|
+
const blob = toUploadBlob(file.data, file.contentType);
|
|
574
|
+
if (blob.size === 0) {
|
|
575
|
+
throw new Error("file data cannot be empty");
|
|
576
|
+
}
|
|
577
|
+
if (options) ensureValidParseOptions(options);
|
|
578
|
+
const version = getVersion();
|
|
579
|
+
const normalizedOptions = {
|
|
580
|
+
...options ?? {},
|
|
581
|
+
origin: typeof options?.origin === "string" && options.origin.includes("mcp") ? options.origin : options?.origin ?? `js-sdk@${version}`
|
|
582
|
+
};
|
|
583
|
+
const formData = new FormData();
|
|
584
|
+
formData.append("options", JSON.stringify(normalizedOptions));
|
|
585
|
+
formData.append(
|
|
586
|
+
"file",
|
|
587
|
+
toUploadBlob(file.data, file.contentType),
|
|
588
|
+
file.filename.trim()
|
|
589
|
+
);
|
|
590
|
+
const requestTimeoutMs = typeof normalizedOptions.timeout === "number" ? normalizedOptions.timeout + 5e3 : void 0;
|
|
591
|
+
try {
|
|
592
|
+
const res = await http.postMultipart("/v2/parse", formData, void 0, requestTimeoutMs);
|
|
593
|
+
if (res.status !== 200 || !res.data?.success) {
|
|
594
|
+
throwForBadResponse(res, "parse");
|
|
595
|
+
}
|
|
596
|
+
return res.data.data || {};
|
|
597
|
+
} catch (err) {
|
|
598
|
+
if (err?.isAxiosError) return normalizeAxiosError(err, "parse");
|
|
599
|
+
throw err;
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
|
|
458
603
|
// src/v2/methods/search.ts
|
|
459
604
|
function prepareSearchPayload(req) {
|
|
460
605
|
if (!req.query || !req.query.trim()) throw new Error("Query cannot be empty");
|
|
@@ -589,6 +734,7 @@ function prepareCrawlPayload(request) {
|
|
|
589
734
|
if (request.includePaths) data.includePaths = request.includePaths;
|
|
590
735
|
if (request.maxDiscoveryDepth != null) data.maxDiscoveryDepth = request.maxDiscoveryDepth;
|
|
591
736
|
if (request.sitemap != null) data.sitemap = request.sitemap;
|
|
737
|
+
if (request.robotsUserAgent != null) data.robotsUserAgent = request.robotsUserAgent;
|
|
592
738
|
if (request.ignoreQueryParameters != null) data.ignoreQueryParameters = request.ignoreQueryParameters;
|
|
593
739
|
if (request.deduplicateSimilarURLs != null) data.deduplicateSimilarURLs = request.deduplicateSimilarURLs;
|
|
594
740
|
if (request.limit != null) data.limit = request.limit;
|
|
@@ -1402,6 +1548,9 @@ var FirecrawlClient = class {
|
|
|
1402
1548
|
async deleteScrapeBrowser(jobId) {
|
|
1403
1549
|
return this.stopInteraction(jobId);
|
|
1404
1550
|
}
|
|
1551
|
+
async parse(file, options) {
|
|
1552
|
+
return parse(this.http, file, options);
|
|
1553
|
+
}
|
|
1405
1554
|
// Search
|
|
1406
1555
|
/**
|
|
1407
1556
|
* Search the web and optionally scrape each result.
|
package/dist/index.d.cts
CHANGED
|
@@ -45,6 +45,11 @@ interface QueryFormat {
|
|
|
45
45
|
prompt: string;
|
|
46
46
|
}
|
|
47
47
|
type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat | QueryFormat;
|
|
48
|
+
type ParseFormatString = Exclude<FormatString, 'screenshot' | 'changeTracking' | 'branding'>;
|
|
49
|
+
interface ParseFormat {
|
|
50
|
+
type: ParseFormatString;
|
|
51
|
+
}
|
|
52
|
+
type ParseFormatOption = ParseFormatString | ParseFormat | JsonFormat | AttributesFormat | QueryFormat;
|
|
48
53
|
interface LocationConfig$1 {
|
|
49
54
|
country?: string;
|
|
50
55
|
languages?: string[];
|
|
@@ -126,6 +131,16 @@ interface ScrapeOptions {
|
|
|
126
131
|
integration?: string;
|
|
127
132
|
origin?: string;
|
|
128
133
|
}
|
|
134
|
+
type ParseFileData = Blob | File | Buffer | Uint8Array | ArrayBuffer | string;
|
|
135
|
+
interface ParseFile {
|
|
136
|
+
data: ParseFileData;
|
|
137
|
+
filename: string;
|
|
138
|
+
contentType?: string;
|
|
139
|
+
}
|
|
140
|
+
type ParseOptions = Omit<ScrapeOptions, 'formats' | 'waitFor' | 'mobile' | 'actions' | 'location' | 'maxAge' | 'minAge' | 'storeInCache' | 'proxy'> & {
|
|
141
|
+
formats?: ParseFormatOption[];
|
|
142
|
+
proxy?: 'basic' | 'auto';
|
|
143
|
+
};
|
|
129
144
|
interface WebhookConfig {
|
|
130
145
|
url: string;
|
|
131
146
|
headers?: Record<string, string>;
|
|
@@ -645,6 +660,7 @@ declare class HttpClient {
|
|
|
645
660
|
private request;
|
|
646
661
|
private sleep;
|
|
647
662
|
post<T = any>(endpoint: string, body: Record<string, unknown>, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
663
|
+
postMultipart<T = any>(endpoint: string, formData: FormData, headers?: Record<string, string>, timeoutMs?: number): Promise<AxiosResponse<T, any, {}>>;
|
|
648
664
|
get<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
649
665
|
delete<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
650
666
|
prepareHeaders(idempotencyKey?: string): Record<string, string>;
|
|
@@ -796,6 +812,18 @@ declare class FirecrawlClient {
|
|
|
796
812
|
* @deprecated Use stopInteraction().
|
|
797
813
|
*/
|
|
798
814
|
deleteScrapeBrowser(jobId: string): Promise<ScrapeBrowserDeleteResponse>;
|
|
815
|
+
/**
|
|
816
|
+
* Parse an uploaded file via the v2 parse endpoint.
|
|
817
|
+
* @param file File payload (data, filename, optional contentType).
|
|
818
|
+
* @param options Optional parse options (formats, parsers, etc.).
|
|
819
|
+
* Note: parse does not support changeTracking, screenshot, branding,
|
|
820
|
+
* actions, waitFor, location, or mobile options.
|
|
821
|
+
* @returns Parsed document with requested formats.
|
|
822
|
+
*/
|
|
823
|
+
parse<Opts extends ParseOptions>(file: ParseFile, options: Opts): Promise<Omit<Document, "json"> & {
|
|
824
|
+
json?: InferredJsonFromOptions<Opts>;
|
|
825
|
+
}>;
|
|
826
|
+
parse(file: ParseFile, options?: ParseOptions): Promise<Document>;
|
|
799
827
|
/**
|
|
800
828
|
* Search the web and optionally scrape each result.
|
|
801
829
|
* @param query Search query string.
|
|
@@ -1892,4 +1920,4 @@ declare class Firecrawl extends FirecrawlClient {
|
|
|
1892
1920
|
get v1(): FirecrawlApp;
|
|
1893
1921
|
}
|
|
1894
1922
|
|
|
1895
|
-
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, JobTimeoutError, type JsonFormat, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type QueryFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ScrapeAction, type ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
|
1923
|
+
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, JobTimeoutError, type JsonFormat, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type ParseFile, type ParseFileData, type ParseFormat, type ParseFormatOption, type ParseFormatString, type ParseOptions, type PressAction, type QueryFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ScrapeAction, type ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
package/dist/index.d.ts
CHANGED
|
@@ -45,6 +45,11 @@ interface QueryFormat {
|
|
|
45
45
|
prompt: string;
|
|
46
46
|
}
|
|
47
47
|
type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat | QueryFormat;
|
|
48
|
+
type ParseFormatString = Exclude<FormatString, 'screenshot' | 'changeTracking' | 'branding'>;
|
|
49
|
+
interface ParseFormat {
|
|
50
|
+
type: ParseFormatString;
|
|
51
|
+
}
|
|
52
|
+
type ParseFormatOption = ParseFormatString | ParseFormat | JsonFormat | AttributesFormat | QueryFormat;
|
|
48
53
|
interface LocationConfig$1 {
|
|
49
54
|
country?: string;
|
|
50
55
|
languages?: string[];
|
|
@@ -126,6 +131,16 @@ interface ScrapeOptions {
|
|
|
126
131
|
integration?: string;
|
|
127
132
|
origin?: string;
|
|
128
133
|
}
|
|
134
|
+
type ParseFileData = Blob | File | Buffer | Uint8Array | ArrayBuffer | string;
|
|
135
|
+
interface ParseFile {
|
|
136
|
+
data: ParseFileData;
|
|
137
|
+
filename: string;
|
|
138
|
+
contentType?: string;
|
|
139
|
+
}
|
|
140
|
+
type ParseOptions = Omit<ScrapeOptions, 'formats' | 'waitFor' | 'mobile' | 'actions' | 'location' | 'maxAge' | 'minAge' | 'storeInCache' | 'proxy'> & {
|
|
141
|
+
formats?: ParseFormatOption[];
|
|
142
|
+
proxy?: 'basic' | 'auto';
|
|
143
|
+
};
|
|
129
144
|
interface WebhookConfig {
|
|
130
145
|
url: string;
|
|
131
146
|
headers?: Record<string, string>;
|
|
@@ -645,6 +660,7 @@ declare class HttpClient {
|
|
|
645
660
|
private request;
|
|
646
661
|
private sleep;
|
|
647
662
|
post<T = any>(endpoint: string, body: Record<string, unknown>, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
663
|
+
postMultipart<T = any>(endpoint: string, formData: FormData, headers?: Record<string, string>, timeoutMs?: number): Promise<AxiosResponse<T, any, {}>>;
|
|
648
664
|
get<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
649
665
|
delete<T = any>(endpoint: string, headers?: Record<string, string>): Promise<AxiosResponse<T, any, {}>>;
|
|
650
666
|
prepareHeaders(idempotencyKey?: string): Record<string, string>;
|
|
@@ -796,6 +812,18 @@ declare class FirecrawlClient {
|
|
|
796
812
|
* @deprecated Use stopInteraction().
|
|
797
813
|
*/
|
|
798
814
|
deleteScrapeBrowser(jobId: string): Promise<ScrapeBrowserDeleteResponse>;
|
|
815
|
+
/**
|
|
816
|
+
* Parse an uploaded file via the v2 parse endpoint.
|
|
817
|
+
* @param file File payload (data, filename, optional contentType).
|
|
818
|
+
* @param options Optional parse options (formats, parsers, etc.).
|
|
819
|
+
* Note: parse does not support changeTracking, screenshot, branding,
|
|
820
|
+
* actions, waitFor, location, or mobile options.
|
|
821
|
+
* @returns Parsed document with requested formats.
|
|
822
|
+
*/
|
|
823
|
+
parse<Opts extends ParseOptions>(file: ParseFile, options: Opts): Promise<Omit<Document, "json"> & {
|
|
824
|
+
json?: InferredJsonFromOptions<Opts>;
|
|
825
|
+
}>;
|
|
826
|
+
parse(file: ParseFile, options?: ParseOptions): Promise<Document>;
|
|
799
827
|
/**
|
|
800
828
|
* Search the web and optionally scrape each result.
|
|
801
829
|
* @param query Search query string.
|
|
@@ -1892,4 +1920,4 @@ declare class Firecrawl extends FirecrawlClient {
|
|
|
1892
1920
|
get v1(): FirecrawlApp;
|
|
1893
1921
|
}
|
|
1894
1922
|
|
|
1895
|
-
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, JobTimeoutError, type JsonFormat, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type QueryFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ScrapeAction, type ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
|
1923
|
+
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, JobTimeoutError, type JsonFormat, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type ParseFile, type ParseFileData, type ParseFormat, type ParseFormatOption, type ParseFormatString, type ParseOptions, type PressAction, type QueryFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ScrapeAction, type ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
require_package
|
|
3
|
-
} from "./chunk-XC6YCBFX.js";
|
|
3
|
+
} from "./chunk-SEIHZPTI.js";
|
|
4
4
|
|
|
5
5
|
// src/v2/utils/httpClient.ts
|
|
6
6
|
import axios from "axios";
|
|
@@ -34,7 +34,6 @@ var HttpClient = class {
|
|
|
34
34
|
baseURL: this.apiUrl,
|
|
35
35
|
timeout: options.timeoutMs ?? 3e5,
|
|
36
36
|
headers: {
|
|
37
|
-
"Content-Type": "application/json",
|
|
38
37
|
Authorization: `Bearer ${this.apiKey}`
|
|
39
38
|
},
|
|
40
39
|
transitional: { clarifyTimeoutError: true }
|
|
@@ -55,13 +54,20 @@ var HttpClient = class {
|
|
|
55
54
|
for (let attempt = 0; attempt < this.maxRetries; attempt++) {
|
|
56
55
|
try {
|
|
57
56
|
const cfg = { ...config };
|
|
58
|
-
|
|
57
|
+
const isFormDataBody = typeof FormData !== "undefined" && cfg.data instanceof FormData;
|
|
58
|
+
const isPlainObjectBody = !isFormDataBody && cfg.data != null && typeof cfg.data === "object" && !Array.isArray(cfg.data);
|
|
59
|
+
if (isPlainObjectBody && cfg.method && ["post", "put", "patch"].includes(cfg.method.toLowerCase())) {
|
|
59
60
|
const data = cfg.data ?? {};
|
|
60
61
|
cfg.data = { ...data, origin: typeof data.origin === "string" && data.origin.includes("mcp") ? data.origin : `js-sdk@${version}` };
|
|
61
62
|
if (typeof data.timeout === "number") {
|
|
62
63
|
cfg.timeout = data.timeout + 5e3;
|
|
63
64
|
}
|
|
64
65
|
}
|
|
66
|
+
if (isFormDataBody) {
|
|
67
|
+
cfg.headers = { ...cfg.headers || {} };
|
|
68
|
+
delete cfg.headers["Content-Type"];
|
|
69
|
+
delete cfg.headers["content-type"];
|
|
70
|
+
}
|
|
65
71
|
const res = await this.instance.request(cfg);
|
|
66
72
|
if (res.status === 502 && attempt < this.maxRetries - 1) {
|
|
67
73
|
await this.sleep(this.backoffFactor * Math.pow(2, attempt));
|
|
@@ -86,6 +92,15 @@ var HttpClient = class {
|
|
|
86
92
|
post(endpoint, body, headers) {
|
|
87
93
|
return this.request({ method: "post", url: endpoint, data: body, headers });
|
|
88
94
|
}
|
|
95
|
+
postMultipart(endpoint, formData, headers, timeoutMs) {
|
|
96
|
+
return this.request({
|
|
97
|
+
method: "post",
|
|
98
|
+
url: endpoint,
|
|
99
|
+
data: formData,
|
|
100
|
+
headers,
|
|
101
|
+
timeout: timeoutMs
|
|
102
|
+
});
|
|
103
|
+
}
|
|
89
104
|
get(endpoint, headers) {
|
|
90
105
|
return this.request({ method: "get", url: endpoint, headers });
|
|
91
106
|
}
|
|
@@ -232,6 +247,76 @@ function ensureValidScrapeOptions(options) {
|
|
|
232
247
|
}
|
|
233
248
|
ensureValidFormats(options.formats);
|
|
234
249
|
}
|
|
250
|
+
function ensureValidParseFormats(formats) {
|
|
251
|
+
if (!formats) return;
|
|
252
|
+
for (const fmt of formats) {
|
|
253
|
+
if (typeof fmt === "string") {
|
|
254
|
+
if (fmt === "json") {
|
|
255
|
+
throw new Error("json format must be an object with { type: 'json', prompt, schema }");
|
|
256
|
+
}
|
|
257
|
+
if (fmt === "screenshot") {
|
|
258
|
+
throw new Error("parse does not support screenshot format");
|
|
259
|
+
}
|
|
260
|
+
if (fmt === "changeTracking") {
|
|
261
|
+
throw new Error("parse does not support changeTracking format");
|
|
262
|
+
}
|
|
263
|
+
if (fmt === "branding") {
|
|
264
|
+
throw new Error("parse does not support branding format");
|
|
265
|
+
}
|
|
266
|
+
continue;
|
|
267
|
+
}
|
|
268
|
+
const type = fmt.type;
|
|
269
|
+
if (type === "changeTracking") {
|
|
270
|
+
throw new Error("parse does not support changeTracking format");
|
|
271
|
+
}
|
|
272
|
+
if (type === "screenshot") {
|
|
273
|
+
throw new Error("parse does not support screenshot format");
|
|
274
|
+
}
|
|
275
|
+
if (type === "branding") {
|
|
276
|
+
throw new Error("parse does not support branding format");
|
|
277
|
+
}
|
|
278
|
+
if (fmt.type === "json") {
|
|
279
|
+
const j = fmt;
|
|
280
|
+
if (!j.prompt && !j.schema) {
|
|
281
|
+
throw new Error("json format requires either 'prompt' or 'schema' (or both)");
|
|
282
|
+
}
|
|
283
|
+
const maybeSchema = j.schema;
|
|
284
|
+
if (isZodSchema(maybeSchema)) {
|
|
285
|
+
j.schema = zodSchemaToJsonSchema(maybeSchema);
|
|
286
|
+
} else if (looksLikeZodShape(maybeSchema)) {
|
|
287
|
+
throw new Error(
|
|
288
|
+
"json format schema appears to be a Zod schema's .shape property. Pass the Zod schema directly (e.g., `schema: MySchema`) instead of `schema: MySchema.shape`. The SDK will automatically convert Zod schemas to JSON Schema format."
|
|
289
|
+
);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
/**
 * Validates parse options before the multipart request is built.
 *
 * Parse shares its option shape with scrape but rejects every browser-only
 * knob (waitFor, actions, location, mobile), all cache/index controls, and
 * any proxy mode other than 'basic'/'auto'. Formats get their own
 * parse-specific validation last.
 *
 * @param {object|undefined} options - Parse options to validate; may be undefined.
 * @throws {Error} On the first unsupported or invalid option encountered.
 */
function ensureValidParseOptions(options) {
  if (!options) return;
  if (options.timeout != null && options.timeout <= 0) {
    throw new Error("timeout must be positive");
  }
  const raw = options;
  // Check order matches the original: waitFor, actions, location, mobile.
  const rejected = [
    ["waitFor", "parse does not support waitFor"],
    ["actions", "parse does not support actions"],
    ["location", "parse does not support location overrides"],
    ["mobile", "parse does not support mobile rendering"]
  ];
  for (const [key, message] of rejected) {
    if (raw[key] !== void 0) {
      throw new Error(message);
    }
  }
  const hasCacheOption = [raw.maxAge, raw.minAge, raw.storeInCache].some(
    (value) => value !== void 0
  );
  if (hasCacheOption) {
    throw new Error("parse does not support cache/index options");
  }
  if (raw.proxy !== void 0 && raw.proxy !== "basic" && raw.proxy !== "auto") {
    throw new Error("parse only supports proxy values of 'basic' or 'auto'");
  }
  ensureValidParseFormats(options.formats);
}
|
|
235
320
|
|
|
236
321
|
// src/v2/utils/errorHandler.ts
|
|
237
322
|
import "axios";
|
|
@@ -332,6 +417,65 @@ async function stopInteraction(http, jobId) {
|
|
|
332
417
|
}
|
|
333
418
|
}
|
|
334
419
|
|
|
420
|
+
// src/v2/methods/parse.ts
|
|
421
|
+
/**
 * Normalizes any supported parse payload into a Blob for multipart upload.
 *
 * Accepts Blob/File, Node Buffer, ArrayBuffer, any TypedArray/DataView, or a
 * plain string. Strings default to a UTF-8 text/plain content type when none
 * is supplied; an existing Blob is re-wrapped only when its MIME type
 * disagrees with the requested one.
 *
 * @param {*} input - Raw file data in one of the supported shapes.
 * @param {string} [contentType] - Desired MIME type of the resulting Blob.
 * @returns {Blob} Upload-ready blob.
 * @throws {Error} When the input is none of the supported types.
 */
function toUploadBlob(input, contentType) {
  if (typeof Blob !== "undefined" && input instanceof Blob) {
    // Re-wrap only when the caller asks for a different MIME type.
    return contentType && input.type !== contentType
      ? new Blob([input], { type: contentType })
      : input;
  }
  const isNodeBuffer = typeof Buffer !== "undefined" && Buffer.isBuffer(input);
  // Buffer, ArrayBuffer, and every ArrayBuffer view wrap identically.
  if (isNodeBuffer || input instanceof ArrayBuffer || ArrayBuffer.isView(input)) {
    return new Blob([input], { type: contentType });
  }
  if (typeof input === "string") {
    return new Blob([input], { type: contentType ?? "text/plain; charset=utf-8" });
  }
  throw new Error("Unsupported parse file data type");
}
|
|
442
|
+
/**
 * Uploads a file to the v2 parse endpoint as multipart form data.
 *
 * Validates the file payload and options locally, normalizes the request
 * origin, then POSTs the options JSON plus the file blob. Fix: the file data
 * was converted to a Blob twice (once for the empty-check, once for the
 * FormData append); the first blob is now reused for the upload.
 *
 * @param {object} http - HTTP client exposing postMultipart().
 * @param {object} file - Payload with data, filename, and optional contentType.
 * @param {object} [options] - Parse options (formats, parsers, timeout, ...).
 * @returns {Promise<object>} The parsed document on success.
 * @throws {Error} On invalid input or a non-success API response.
 */
async function parse(http, file, options) {
  if (!file || !file.filename || !file.filename.trim()) {
    throw new Error("filename cannot be empty");
  }
  if (file.data == null) {
    throw new Error("file data cannot be empty");
  }
  // Convert once and reuse: the same blob is validated and then uploaded.
  const blob = toUploadBlob(file.data, file.contentType);
  if (blob.size === 0) {
    throw new Error("file data cannot be empty");
  }
  if (options) ensureValidParseOptions(options);
  const version = getVersion();
  const normalizedOptions = {
    ...options ?? {},
    // Preserve an explicit mcp origin; otherwise tag requests with the SDK version.
    origin: typeof options?.origin === "string" && options.origin.includes("mcp") ? options.origin : options?.origin ?? `js-sdk@${version}`
  };
  const formData = new FormData();
  formData.append("options", JSON.stringify(normalizedOptions));
  formData.append("file", blob, file.filename.trim());
  // Give the server 5s of headroom past the user-requested parse timeout.
  const requestTimeoutMs = typeof normalizedOptions.timeout === "number" ? normalizedOptions.timeout + 5e3 : void 0;
  try {
    const res = await http.postMultipart("/v2/parse", formData, void 0, requestTimeoutMs);
    if (res.status !== 200 || !res.data?.success) {
      throwForBadResponse(res, "parse");
    }
    return res.data.data || {};
  } catch (err) {
    if (err?.isAxiosError) return normalizeAxiosError(err, "parse");
    throw err;
  }
}
|
|
478
|
+
|
|
335
479
|
// src/v2/methods/search.ts
|
|
336
480
|
function prepareSearchPayload(req) {
|
|
337
481
|
if (!req.query || !req.query.trim()) throw new Error("Query cannot be empty");
|
|
@@ -466,6 +610,7 @@ function prepareCrawlPayload(request) {
|
|
|
466
610
|
if (request.includePaths) data.includePaths = request.includePaths;
|
|
467
611
|
if (request.maxDiscoveryDepth != null) data.maxDiscoveryDepth = request.maxDiscoveryDepth;
|
|
468
612
|
if (request.sitemap != null) data.sitemap = request.sitemap;
|
|
613
|
+
if (request.robotsUserAgent != null) data.robotsUserAgent = request.robotsUserAgent;
|
|
469
614
|
if (request.ignoreQueryParameters != null) data.ignoreQueryParameters = request.ignoreQueryParameters;
|
|
470
615
|
if (request.deduplicateSimilarURLs != null) data.deduplicateSimilarURLs = request.deduplicateSimilarURLs;
|
|
471
616
|
if (request.limit != null) data.limit = request.limit;
|
|
@@ -1279,6 +1424,9 @@ var FirecrawlClient = class {
|
|
|
1279
1424
|
async deleteScrapeBrowser(jobId) {
|
|
1280
1425
|
return this.stopInteraction(jobId);
|
|
1281
1426
|
}
|
|
1427
|
+
async parse(file, options) {
|
|
1428
|
+
return parse(this.http, file, options);
|
|
1429
|
+
}
|
|
1282
1430
|
// Search
|
|
1283
1431
|
/**
|
|
1284
1432
|
* Search the web and optionally scrape each result.
|
|
@@ -1559,7 +1707,7 @@ var FirecrawlApp = class {
|
|
|
1559
1707
|
if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
|
|
1560
1708
|
return process.env.npm_package_version;
|
|
1561
1709
|
}
|
|
1562
|
-
const packageJson = await import("./package-
|
|
1710
|
+
const packageJson = await import("./package-ASKBBK6V.js");
|
|
1563
1711
|
return packageJson.default.version;
|
|
1564
1712
|
} catch (error) {
|
|
1565
1713
|
const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
|
package/package.json
CHANGED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import Firecrawl from "../../../index";
|
|
2
|
+
import { config } from "dotenv";
|
|
3
|
+
import { getIdentity, getApiUrl } from "./utils/idmux";
|
|
4
|
+
import { describe, test, expect, beforeAll } from "@jest/globals";
|
|
5
|
+
|
|
6
|
+
config();
|
|
7
|
+
|
|
8
|
+
const API_URL = getApiUrl();
|
|
9
|
+
let client: Firecrawl;
|
|
10
|
+
|
|
11
|
+
beforeAll(async () => {
|
|
12
|
+
const { apiKey } = await getIdentity({ name: "js-e2e-parse" });
|
|
13
|
+
client = new Firecrawl({ apiKey, apiUrl: API_URL });
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
describe("v2.parse e2e", () => {
|
|
17
|
+
test(
|
|
18
|
+
"parses uploaded HTML files",
|
|
19
|
+
async () => {
|
|
20
|
+
if (!client) throw new Error();
|
|
21
|
+
|
|
22
|
+
const doc = await client.parse(
|
|
23
|
+
{
|
|
24
|
+
data: `
|
|
25
|
+
<!DOCTYPE html>
|
|
26
|
+
<html>
|
|
27
|
+
<body>
|
|
28
|
+
<h1>JS SDK Parse E2E</h1>
|
|
29
|
+
<p>multipart upload body</p>
|
|
30
|
+
</body>
|
|
31
|
+
</html>
|
|
32
|
+
`,
|
|
33
|
+
filename: "parse-e2e.html",
|
|
34
|
+
contentType: "text/html",
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
formats: ["markdown"],
|
|
38
|
+
},
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
expect(doc.markdown).toContain("JS SDK Parse E2E");
|
|
42
|
+
expect(doc.metadata?.creditsUsed).toBe(1);
|
|
43
|
+
},
|
|
44
|
+
60_000,
|
|
45
|
+
);
|
|
46
|
+
|
|
47
|
+
test(
|
|
48
|
+
"returns errors for unsupported file types",
|
|
49
|
+
async () => {
|
|
50
|
+
if (!client) throw new Error();
|
|
51
|
+
|
|
52
|
+
await expect(
|
|
53
|
+
client.parse(
|
|
54
|
+
{
|
|
55
|
+
data: Buffer.from("image-data"),
|
|
56
|
+
filename: "parse-e2e.png",
|
|
57
|
+
contentType: "image/png",
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
formats: ["markdown"],
|
|
61
|
+
},
|
|
62
|
+
),
|
|
63
|
+
).rejects.toThrow();
|
|
64
|
+
},
|
|
65
|
+
60_000,
|
|
66
|
+
);
|
|
67
|
+
});
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { describe, test, expect } from "@jest/globals";
|
|
2
|
+
import { FirecrawlClient } from "../../../v2/client";
|
|
3
|
+
|
|
4
|
+
describe("v2.parse unit", () => {
|
|
5
|
+
test("rejects empty filenames before making requests", async () => {
|
|
6
|
+
const client = new FirecrawlClient({
|
|
7
|
+
apiKey: "test-key",
|
|
8
|
+
apiUrl: "https://localhost:3002",
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
await expect(
|
|
12
|
+
client.parse(
|
|
13
|
+
{
|
|
14
|
+
data: "<html><body>test</body></html>",
|
|
15
|
+
filename: " ",
|
|
16
|
+
contentType: "text/html",
|
|
17
|
+
},
|
|
18
|
+
{ formats: ["markdown"] },
|
|
19
|
+
),
|
|
20
|
+
).rejects.toThrow("filename cannot be empty");
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
test("rejects changeTracking format before making requests", async () => {
|
|
24
|
+
const client = new FirecrawlClient({
|
|
25
|
+
apiKey: "test-key",
|
|
26
|
+
apiUrl: "https://localhost:3002",
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
await expect(
|
|
30
|
+
client.parse(
|
|
31
|
+
{
|
|
32
|
+
data: "<html><body>test</body></html>",
|
|
33
|
+
filename: "upload.html",
|
|
34
|
+
contentType: "text/html",
|
|
35
|
+
},
|
|
36
|
+
{ formats: ["markdown", { type: "changeTracking" } as any] },
|
|
37
|
+
),
|
|
38
|
+
).rejects.toThrow("parse does not support changeTracking format");
|
|
39
|
+
});
|
|
40
|
+
});
|
package/src/v2/client.ts
CHANGED
|
@@ -4,6 +4,7 @@ import {
|
|
|
4
4
|
interact as interactMethod,
|
|
5
5
|
stopInteraction as stopInteractionMethod,
|
|
6
6
|
} from "./methods/scrape";
|
|
7
|
+
import { parse as parseMethod } from "./methods/parse";
|
|
7
8
|
import { search } from "./methods/search";
|
|
8
9
|
import { map as mapMethod } from "./methods/map";
|
|
9
10
|
import {
|
|
@@ -33,6 +34,8 @@ import {
|
|
|
33
34
|
import { getConcurrency, getCreditUsage, getQueueStatus, getTokenUsage, getCreditUsageHistorical, getTokenUsageHistorical } from "./methods/usage";
|
|
34
35
|
import type {
|
|
35
36
|
Document,
|
|
37
|
+
ParseFile,
|
|
38
|
+
ParseOptions,
|
|
36
39
|
ScrapeOptions,
|
|
37
40
|
SearchData,
|
|
38
41
|
SearchRequest,
|
|
@@ -177,6 +180,24 @@ export class FirecrawlClient {
|
|
|
177
180
|
return this.stopInteraction(jobId);
|
|
178
181
|
}
|
|
179
182
|
|
|
183
|
+
// Parse
|
|
184
|
+
/**
|
|
185
|
+
* Parse an uploaded file via the v2 parse endpoint.
|
|
186
|
+
* @param file File payload (data, filename, optional contentType).
|
|
187
|
+
* @param options Optional parse options (formats, parsers, etc.).
|
|
188
|
+
* Note: parse does not support changeTracking, screenshot, branding,
|
|
189
|
+
* actions, waitFor, location, or mobile options.
|
|
190
|
+
* @returns Parsed document with requested formats.
|
|
191
|
+
*/
|
|
192
|
+
async parse<Opts extends ParseOptions>(
|
|
193
|
+
file: ParseFile,
|
|
194
|
+
options: Opts
|
|
195
|
+
): Promise<Omit<Document, "json"> & { json?: InferredJsonFromOptions<Opts> }>;
|
|
196
|
+
async parse(file: ParseFile, options?: ParseOptions): Promise<Document>;
|
|
197
|
+
async parse(file: ParseFile, options?: ParseOptions): Promise<Document> {
|
|
198
|
+
return parseMethod(this.http, file, options);
|
|
199
|
+
}
|
|
200
|
+
|
|
180
201
|
// Search
|
|
181
202
|
/**
|
|
182
203
|
* Search the web and optionally scrape each result.
|
package/src/v2/methods/crawl.ts
CHANGED
|
@@ -27,6 +27,7 @@ function prepareCrawlPayload(request: CrawlRequest): Record<string, unknown> {
|
|
|
27
27
|
if (request.includePaths) data.includePaths = request.includePaths;
|
|
28
28
|
if (request.maxDiscoveryDepth != null) data.maxDiscoveryDepth = request.maxDiscoveryDepth;
|
|
29
29
|
if (request.sitemap != null) data.sitemap = request.sitemap;
|
|
30
|
+
if (request.robotsUserAgent != null) data.robotsUserAgent = request.robotsUserAgent;
|
|
30
31
|
if (request.ignoreQueryParameters != null) data.ignoreQueryParameters = request.ignoreQueryParameters;
|
|
31
32
|
if (request.deduplicateSimilarURLs != null) data.deduplicateSimilarURLs = request.deduplicateSimilarURLs;
|
|
32
33
|
if (request.limit != null) data.limit = request.limit;
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { type Document, type ParseFile, type ParseOptions } from "../types";
|
|
2
|
+
import { HttpClient } from "../utils/httpClient";
|
|
3
|
+
import { ensureValidParseOptions } from "../utils/validation";
|
|
4
|
+
import { throwForBadResponse, normalizeAxiosError } from "../utils/errorHandler";
|
|
5
|
+
import { getVersion } from "../utils/getVersion";
|
|
6
|
+
|
|
7
|
+
function toUploadBlob(input: ParseFile["data"], contentType?: string): Blob {
|
|
8
|
+
if (typeof Blob !== "undefined" && input instanceof Blob) {
|
|
9
|
+
if (contentType && input.type !== contentType) {
|
|
10
|
+
return new Blob([input], { type: contentType });
|
|
11
|
+
}
|
|
12
|
+
return input;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(input)) {
|
|
16
|
+
return new Blob([input], { type: contentType });
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
if (input instanceof ArrayBuffer) {
|
|
20
|
+
return new Blob([input], { type: contentType });
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (ArrayBuffer.isView(input)) {
|
|
24
|
+
return new Blob([input], { type: contentType });
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
if (typeof input === "string") {
|
|
28
|
+
return new Blob([input], { type: contentType ?? "text/plain; charset=utf-8" });
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
throw new Error("Unsupported parse file data type");
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
export async function parse(
|
|
35
|
+
http: HttpClient,
|
|
36
|
+
file: ParseFile,
|
|
37
|
+
options?: ParseOptions,
|
|
38
|
+
): Promise<Document> {
|
|
39
|
+
if (!file || !file.filename || !file.filename.trim()) {
|
|
40
|
+
throw new Error("filename cannot be empty");
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if (file.data == null) {
|
|
44
|
+
throw new Error("file data cannot be empty");
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const blob = toUploadBlob(file.data, file.contentType);
|
|
48
|
+
if (blob.size === 0) {
|
|
49
|
+
throw new Error("file data cannot be empty");
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if (options) ensureValidParseOptions(options);
|
|
53
|
+
|
|
54
|
+
const version = getVersion();
|
|
55
|
+
const normalizedOptions: ParseOptions = {
|
|
56
|
+
...(options ?? {}),
|
|
57
|
+
origin:
|
|
58
|
+
typeof options?.origin === "string" && options.origin.includes("mcp")
|
|
59
|
+
? options.origin
|
|
60
|
+
: options?.origin ?? `js-sdk@${version}`,
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
const formData = new FormData();
|
|
64
|
+
formData.append("options", JSON.stringify(normalizedOptions));
|
|
65
|
+
formData.append(
|
|
66
|
+
"file",
|
|
67
|
+
toUploadBlob(file.data, file.contentType),
|
|
68
|
+
file.filename.trim(),
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
const requestTimeoutMs =
|
|
72
|
+
typeof normalizedOptions.timeout === "number"
|
|
73
|
+
? normalizedOptions.timeout + 5000
|
|
74
|
+
: undefined;
|
|
75
|
+
|
|
76
|
+
try {
|
|
77
|
+
const res = await http.postMultipart<{
|
|
78
|
+
success: boolean;
|
|
79
|
+
data?: Document;
|
|
80
|
+
error?: string;
|
|
81
|
+
}>("/v2/parse", formData, undefined, requestTimeoutMs);
|
|
82
|
+
if (res.status !== 200 || !res.data?.success) {
|
|
83
|
+
throwForBadResponse(res, "parse");
|
|
84
|
+
}
|
|
85
|
+
return (res.data.data || {}) as Document;
|
|
86
|
+
} catch (err: any) {
|
|
87
|
+
if (err?.isAxiosError) return normalizeAxiosError(err, "parse");
|
|
88
|
+
throw err;
|
|
89
|
+
}
|
|
90
|
+
}
|
package/src/v2/types.ts
CHANGED
|
@@ -66,6 +66,22 @@ export type FormatOption =
|
|
|
66
66
|
| AttributesFormat
|
|
67
67
|
| QueryFormat;
|
|
68
68
|
|
|
69
|
+
export type ParseFormatString = Exclude<
|
|
70
|
+
FormatString,
|
|
71
|
+
'screenshot' | 'changeTracking' | 'branding'
|
|
72
|
+
>;
|
|
73
|
+
|
|
74
|
+
export interface ParseFormat {
|
|
75
|
+
type: ParseFormatString;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
export type ParseFormatOption =
|
|
79
|
+
| ParseFormatString
|
|
80
|
+
| ParseFormat
|
|
81
|
+
| JsonFormat
|
|
82
|
+
| AttributesFormat
|
|
83
|
+
| QueryFormat;
|
|
84
|
+
|
|
69
85
|
export interface LocationConfig {
|
|
70
86
|
country?: string;
|
|
71
87
|
languages?: string[];
|
|
@@ -172,6 +188,36 @@ export interface ScrapeOptions {
|
|
|
172
188
|
origin?: string;
|
|
173
189
|
}
|
|
174
190
|
|
|
191
|
+
export type ParseFileData =
|
|
192
|
+
| Blob
|
|
193
|
+
| File
|
|
194
|
+
| Buffer
|
|
195
|
+
| Uint8Array
|
|
196
|
+
| ArrayBuffer
|
|
197
|
+
| string;
|
|
198
|
+
|
|
199
|
+
export interface ParseFile {
|
|
200
|
+
data: ParseFileData;
|
|
201
|
+
filename: string;
|
|
202
|
+
contentType?: string;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
export type ParseOptions = Omit<
|
|
206
|
+
ScrapeOptions,
|
|
207
|
+
| 'formats'
|
|
208
|
+
| 'waitFor'
|
|
209
|
+
| 'mobile'
|
|
210
|
+
| 'actions'
|
|
211
|
+
| 'location'
|
|
212
|
+
| 'maxAge'
|
|
213
|
+
| 'minAge'
|
|
214
|
+
| 'storeInCache'
|
|
215
|
+
| 'proxy'
|
|
216
|
+
> & {
|
|
217
|
+
formats?: ParseFormatOption[];
|
|
218
|
+
proxy?: 'basic' | 'auto';
|
|
219
|
+
};
|
|
220
|
+
|
|
175
221
|
export interface WebhookConfig {
|
|
176
222
|
url: string;
|
|
177
223
|
headers?: Record<string, string>;
|
|
@@ -25,7 +25,6 @@ export class HttpClient {
|
|
|
25
25
|
baseURL: this.apiUrl,
|
|
26
26
|
timeout: options.timeoutMs ?? 300000,
|
|
27
27
|
headers: {
|
|
28
|
-
"Content-Type": "application/json",
|
|
29
28
|
Authorization: `Bearer ${this.apiKey}`,
|
|
30
29
|
},
|
|
31
30
|
transitional: { clarifyTimeoutError: true },
|
|
@@ -50,16 +49,35 @@ export class HttpClient {
|
|
|
50
49
|
for (let attempt = 0; attempt < this.maxRetries; attempt++) {
|
|
51
50
|
try {
|
|
52
51
|
const cfg: AxiosRequestConfig = { ...config };
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
const isFormDataBody =
|
|
53
|
+
typeof FormData !== "undefined" && cfg.data instanceof FormData;
|
|
54
|
+
const isPlainObjectBody =
|
|
55
|
+
!isFormDataBody &&
|
|
56
|
+
cfg.data != null &&
|
|
57
|
+
typeof cfg.data === "object" &&
|
|
58
|
+
!Array.isArray(cfg.data);
|
|
59
|
+
|
|
60
|
+
// For JSON POST/PUT/PATCH, ensure origin is present in body
|
|
61
|
+
if (
|
|
62
|
+
isPlainObjectBody &&
|
|
63
|
+
cfg.method &&
|
|
64
|
+
["post", "put", "patch"].includes(cfg.method.toLowerCase())
|
|
65
|
+
) {
|
|
55
66
|
const data = (cfg.data ?? {}) as Record<string, unknown>;
|
|
56
67
|
cfg.data = { ...data, origin: typeof data.origin === "string" && data.origin.includes("mcp") ? data.origin : `js-sdk@${version}` };
|
|
57
|
-
|
|
68
|
+
|
|
58
69
|
// If timeout is specified in the body, use it to override the request timeout
|
|
59
70
|
if (typeof data.timeout === "number") {
|
|
60
71
|
cfg.timeout = data.timeout + 5000;
|
|
61
72
|
}
|
|
62
73
|
}
|
|
74
|
+
|
|
75
|
+
if (isFormDataBody) {
|
|
76
|
+
cfg.headers = { ...(cfg.headers || {}) };
|
|
77
|
+
delete (cfg.headers as Record<string, unknown>)["Content-Type"];
|
|
78
|
+
delete (cfg.headers as Record<string, unknown>)["content-type"];
|
|
79
|
+
}
|
|
80
|
+
|
|
63
81
|
const res = await this.instance.request<T>(cfg);
|
|
64
82
|
if (res.status === 502 && attempt < this.maxRetries - 1) {
|
|
65
83
|
await this.sleep(this.backoffFactor * Math.pow(2, attempt));
|
|
@@ -87,6 +105,21 @@ export class HttpClient {
|
|
|
87
105
|
return this.request<T>({ method: "post", url: endpoint, data: body, headers });
|
|
88
106
|
}
|
|
89
107
|
|
|
108
|
+
/**
 * POSTs multipart form data (e.g. parse file uploads) to the API.
 *
 * Delegates to request(), which strips any Content-Type header for FormData
 * bodies so the runtime can set the multipart boundary itself.
 *
 * @param endpoint - API path, e.g. "/v2/parse".
 * @param formData - Prepared multipart body.
 * @param headers - Optional extra request headers.
 * @param timeoutMs - Optional per-request timeout override in milliseconds.
 */
postMultipart<T = any>(
  endpoint: string,
  formData: FormData,
  headers?: Record<string, string>,
  timeoutMs?: number,
) {
  const cfg = {
    method: "post" as const,
    url: endpoint,
    data: formData,
    headers,
    timeout: timeoutMs,
  };
  return this.request<T>(cfg);
}
|
|
122
|
+
|
|
90
123
|
get<T = any>(endpoint: string, headers?: Record<string, string>) {
|
|
91
124
|
return this.request<T>({ method: "get", url: endpoint, headers });
|
|
92
125
|
}
|
|
@@ -1,4 +1,12 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
type ChangeTrackingFormat,
|
|
3
|
+
type FormatOption,
|
|
4
|
+
type JsonFormat,
|
|
5
|
+
type ParseFormatOption,
|
|
6
|
+
type ParseOptions,
|
|
7
|
+
type ScrapeOptions,
|
|
8
|
+
type ScreenshotFormat,
|
|
9
|
+
} from "../types";
|
|
2
10
|
import { isZodSchema, zodSchemaToJsonSchema, looksLikeZodShape } from "../../utils/zodSchemaToJson";
|
|
3
11
|
|
|
4
12
|
export function ensureValidFormats(formats?: FormatOption[]): void {
|
|
@@ -62,3 +70,82 @@ export function ensureValidScrapeOptions(options?: ScrapeOptions): void {
|
|
|
62
70
|
ensureValidFormats(options.formats);
|
|
63
71
|
}
|
|
64
72
|
|
|
73
|
+
export function ensureValidParseFormats(formats?: ParseFormatOption[]): void {
|
|
74
|
+
if (!formats) return;
|
|
75
|
+
|
|
76
|
+
for (const fmt of formats) {
|
|
77
|
+
if (typeof fmt === "string") {
|
|
78
|
+
if (fmt === "json") {
|
|
79
|
+
throw new Error("json format must be an object with { type: 'json', prompt, schema }");
|
|
80
|
+
}
|
|
81
|
+
if (fmt === "screenshot") {
|
|
82
|
+
throw new Error("parse does not support screenshot format");
|
|
83
|
+
}
|
|
84
|
+
if (fmt === "changeTracking") {
|
|
85
|
+
throw new Error("parse does not support changeTracking format");
|
|
86
|
+
}
|
|
87
|
+
if (fmt === "branding") {
|
|
88
|
+
throw new Error("parse does not support branding format");
|
|
89
|
+
}
|
|
90
|
+
continue;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
const type = (fmt as any).type;
|
|
94
|
+
if (type === "changeTracking") {
|
|
95
|
+
throw new Error("parse does not support changeTracking format");
|
|
96
|
+
}
|
|
97
|
+
if (type === "screenshot") {
|
|
98
|
+
throw new Error("parse does not support screenshot format");
|
|
99
|
+
}
|
|
100
|
+
if (type === "branding") {
|
|
101
|
+
throw new Error("parse does not support branding format");
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if ((fmt as JsonFormat).type === "json") {
|
|
105
|
+
const j = fmt as JsonFormat;
|
|
106
|
+
if (!j.prompt && !j.schema) {
|
|
107
|
+
throw new Error("json format requires either 'prompt' or 'schema' (or both)");
|
|
108
|
+
}
|
|
109
|
+
const maybeSchema = j.schema;
|
|
110
|
+
if (isZodSchema(maybeSchema)) {
|
|
111
|
+
(j as any).schema = zodSchemaToJsonSchema(maybeSchema);
|
|
112
|
+
} else if (looksLikeZodShape(maybeSchema)) {
|
|
113
|
+
throw new Error(
|
|
114
|
+
"json format schema appears to be a Zod schema's .shape property. " +
|
|
115
|
+
"Pass the Zod schema directly (e.g., `schema: MySchema`) instead of `schema: MySchema.shape`. " +
|
|
116
|
+
"The SDK will automatically convert Zod schemas to JSON Schema format."
|
|
117
|
+
);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
export function ensureValidParseOptions(options?: ParseOptions): void {
|
|
124
|
+
if (!options) return;
|
|
125
|
+
if (options.timeout != null && options.timeout <= 0) {
|
|
126
|
+
throw new Error("timeout must be positive");
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
const raw = options as Record<string, unknown>;
|
|
130
|
+
if (raw.waitFor !== undefined) {
|
|
131
|
+
throw new Error("parse does not support waitFor");
|
|
132
|
+
}
|
|
133
|
+
if (raw.actions !== undefined) {
|
|
134
|
+
throw new Error("parse does not support actions");
|
|
135
|
+
}
|
|
136
|
+
if (raw.location !== undefined) {
|
|
137
|
+
throw new Error("parse does not support location overrides");
|
|
138
|
+
}
|
|
139
|
+
if (raw.mobile !== undefined) {
|
|
140
|
+
throw new Error("parse does not support mobile rendering");
|
|
141
|
+
}
|
|
142
|
+
if (raw.maxAge !== undefined || raw.minAge !== undefined || raw.storeInCache !== undefined) {
|
|
143
|
+
throw new Error("parse does not support cache/index options");
|
|
144
|
+
}
|
|
145
|
+
if (raw.proxy !== undefined && raw.proxy !== "basic" && raw.proxy !== "auto") {
|
|
146
|
+
throw new Error("parse only supports proxy values of 'basic' or 'auto'");
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
ensureValidParseFormats(options.formats);
|
|
150
|
+
}
|
|
151
|
+
|