firecrawl-mcp 3.10.3 → 3.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +0 -0
- package/README.md +21 -17
- package/dist/index.js +454 -14
- package/package.json +19 -20
- package/dist/index-v1.js +0 -1313
- package/dist/index.test.js +0 -255
- package/dist/jest.setup.js +0 -58
- package/dist/server-v1.js +0 -1154
- package/dist/server-v2.js +0 -1067
- package/dist/src/index.js +0 -1053
- package/dist/src/index.test.js +0 -225
- package/dist/versioned-server.js +0 -203
package/dist/index.js
CHANGED
|
@@ -3,6 +3,8 @@ import dotenv from 'dotenv';
|
|
|
3
3
|
import { FastMCP } from 'firecrawl-fastmcp';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import FirecrawlApp from '@mendable/firecrawl-js';
|
|
6
|
+
import { readFile } from 'node:fs/promises';
|
|
7
|
+
import path from 'node:path';
|
|
6
8
|
dotenv.config({ debug: false, quiet: true });
|
|
7
9
|
function extractApiKey(headers) {
|
|
8
10
|
const headerAuth = headers['authorization'];
|
|
@@ -35,6 +37,24 @@ function removeEmptyTopLevel(obj) {
|
|
|
35
37
|
}
|
|
36
38
|
return out;
|
|
37
39
|
}
|
|
40
|
+
/**
 * Zod schema for one entry of a search domain filter.
 *
 * The value is trimmed and lower-cased, then checked against a hostname
 * pattern: 1-253 characters overall, dot-separated labels made of letters,
 * digits and hyphens that neither begin nor end with a hyphen, and a final
 * label of at least two characters. Anything carrying a protocol, path or
 * port contains characters outside that alphabet and is rejected.
 */
const searchDomainSchema = z.string().trim().toLowerCase().regex(
    /^(?=.{1,253}$)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]$/,
    'Domain must be a valid hostname without protocol or path'
);
|
|
45
|
+
/**
 * Appends domain filters to a search query using `site:` operators.
 *
 * @param {string} query - The base search query.
 * @param {string[]} [includeDomains] - Domains to restrict results to; emitted
 *   as a parenthesised `site:a OR site:b` group.
 * @param {string[]} [excludeDomains] - Domains to remove from results; emitted
 *   as space-separated `-site:a` terms.
 * @returns {string} The query with filters appended, or the query unchanged
 *   when neither list has entries.
 */
function buildSearchQueryWithDomains(query, includeDomains, excludeDomains) {
    // A non-empty include list takes precedence; the exclude list is not consulted then.
    const hasIncludes = Boolean(includeDomains?.length);
    if (hasIncludes) {
        const siteClauses = includeDomains.map((domain) => `site:${domain}`);
        return `${query} (${siteClauses.join(' OR ')})`;
    }
    if (!excludeDomains?.length) {
        return query;
    }
    const negatedClauses = excludeDomains.map((domain) => `-site:${domain}`);
    return `${query} ${negatedClauses.join(' ')}`;
}
|
|
38
58
|
class ConsoleLogger {
|
|
39
59
|
shouldLog = process.env.CLOUD_SERVICE === 'true' ||
|
|
40
60
|
process.env.SSE_LOCAL === 'true' ||
|
|
@@ -152,6 +172,10 @@ function buildFormatsArray(args) {
|
|
|
152
172
|
const jsonOpts = args.jsonOptions;
|
|
153
173
|
result.push({ type: 'json', ...jsonOpts });
|
|
154
174
|
}
|
|
175
|
+
else if (fmt === 'query') {
|
|
176
|
+
const queryOpts = args.queryOptions;
|
|
177
|
+
result.push({ type: 'query', ...queryOpts });
|
|
178
|
+
}
|
|
155
179
|
else if (fmt === 'screenshot' && args.screenshotOptions) {
|
|
156
180
|
const ssOpts = args.screenshotOptions;
|
|
157
181
|
result.push({ type: 'screenshot', ...ssOpts });
|
|
@@ -197,6 +221,7 @@ function transformScrapeParams(args) {
|
|
|
197
221
|
if (parsers)
|
|
198
222
|
out.parsers = parsers;
|
|
199
223
|
delete out.jsonOptions;
|
|
224
|
+
delete out.queryOptions;
|
|
200
225
|
delete out.screenshotOptions;
|
|
201
226
|
delete out.pdfOptions;
|
|
202
227
|
return out;
|
|
@@ -214,6 +239,8 @@ const scrapeParamsSchema = z.object({
|
|
|
214
239
|
'changeTracking',
|
|
215
240
|
'branding',
|
|
216
241
|
'json',
|
|
242
|
+
'query',
|
|
243
|
+
'audio',
|
|
217
244
|
]))
|
|
218
245
|
.optional(),
|
|
219
246
|
jsonOptions: z
|
|
@@ -222,6 +249,11 @@ const scrapeParamsSchema = z.object({
|
|
|
222
249
|
schema: z.record(z.string(), z.any()).optional(),
|
|
223
250
|
})
|
|
224
251
|
.optional(),
|
|
252
|
+
queryOptions: z
|
|
253
|
+
.object({
|
|
254
|
+
prompt: z.string().max(10000),
|
|
255
|
+
})
|
|
256
|
+
.optional(),
|
|
225
257
|
screenshotOptions: z
|
|
226
258
|
.object({
|
|
227
259
|
fullPage: z.boolean().optional(),
|
|
@@ -269,10 +301,22 @@ const scrapeParamsSchema = z.object({
|
|
|
269
301
|
storeInCache: z.boolean().optional(),
|
|
270
302
|
zeroDataRetention: z.boolean().optional(),
|
|
271
303
|
maxAge: z.number().optional(),
|
|
304
|
+
lockdown: z.boolean().optional(),
|
|
272
305
|
proxy: z.enum(['basic', 'stealth', 'enhanced', 'auto']).optional(),
|
|
306
|
+
profile: z
|
|
307
|
+
.object({
|
|
308
|
+
name: z.string(),
|
|
309
|
+
saveChanges: z.boolean().optional(),
|
|
310
|
+
})
|
|
311
|
+
.optional(),
|
|
273
312
|
});
|
|
274
313
|
server.addTool({
|
|
275
314
|
name: 'firecrawl_scrape',
|
|
315
|
+
annotations: {
|
|
316
|
+
title: 'Scrape a URL',
|
|
317
|
+
readOnlyHint: SAFE_MODE,
|
|
318
|
+
openWorldHint: true,
|
|
319
|
+
},
|
|
276
320
|
description: `
|
|
277
321
|
Scrape content from a single URL with advanced options.
|
|
278
322
|
This is the most powerful, fastest and most reliable scraper tool, if available you should always default to using this tool for any web scraping needs.
|
|
@@ -335,7 +379,18 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
|
|
|
335
379
|
}
|
|
336
380
|
}
|
|
337
381
|
\`\`\`
|
|
338
|
-
|
|
382
|
+
|
|
383
|
+
**Prefer markdown format by default.** You can read and reason over the full page content directly — no need for an intermediate query step. Use markdown for questions about page content, factual lookups, and any task where you need to understand the page.
|
|
384
|
+
|
|
385
|
+
**Use JSON format when user needs:**
|
|
386
|
+
- Structured data with specific fields (extract all products with name, price, description)
|
|
387
|
+
- Data in a specific schema for downstream processing
|
|
388
|
+
|
|
389
|
+
**Use query format only when:**
|
|
390
|
+
- The page is extremely long and you need a single targeted answer without processing the full content
|
|
391
|
+
- You want a quick factual answer and don't need to retain the page content
|
|
392
|
+
|
|
393
|
+
**Usage Example (markdown format - default for most tasks):**
|
|
339
394
|
\`\`\`json
|
|
340
395
|
{
|
|
341
396
|
"name": "firecrawl_scrape",
|
|
@@ -358,6 +413,7 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
|
|
|
358
413
|
\`\`\`
|
|
359
414
|
**Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
|
|
360
415
|
**Performance:** Add maxAge parameter for 500% faster scrapes using cached data.
|
|
416
|
+
**Lockdown mode:** Set \`lockdown: true\` to serve the request only from the existing index/cache without any outbound network request. For air-gapped or compliance-constrained use where the request URL itself is considered sensitive. Errors on cache miss. Billed at 5 credits.
|
|
361
417
|
**Returns:** JSON structured data, markdown, branding profile, or other formats as specified.
|
|
362
418
|
${SAFE_MODE
|
|
363
419
|
? '**Safe Mode:** Read-only content extraction. Interactive actions (click, write, executeJavascript) are disabled for security.'
|
|
@@ -369,7 +425,12 @@ ${SAFE_MODE
|
|
|
369
425
|
const client = getClient(session);
|
|
370
426
|
const transformed = transformScrapeParams(options);
|
|
371
427
|
const cleaned = removeEmptyTopLevel(transformed);
|
|
372
|
-
|
|
428
|
+
if (cleaned.lockdown) {
|
|
429
|
+
log.info('Scraping URL (lockdown)');
|
|
430
|
+
}
|
|
431
|
+
else {
|
|
432
|
+
log.info('Scraping URL', { url: String(url) });
|
|
433
|
+
}
|
|
373
434
|
const res = await client.scrape(String(url), {
|
|
374
435
|
...cleaned,
|
|
375
436
|
origin: ORIGIN,
|
|
@@ -379,6 +440,11 @@ ${SAFE_MODE
|
|
|
379
440
|
});
|
|
380
441
|
server.addTool({
|
|
381
442
|
name: 'firecrawl_map',
|
|
443
|
+
annotations: {
|
|
444
|
+
title: 'Map a website',
|
|
445
|
+
readOnlyHint: true,
|
|
446
|
+
openWorldHint: true,
|
|
447
|
+
},
|
|
382
448
|
description: `
|
|
383
449
|
Map a website to discover all indexed URLs on the site.
|
|
384
450
|
|
|
@@ -432,6 +498,11 @@ Map a website to discover all indexed URLs on the site.
|
|
|
432
498
|
});
|
|
433
499
|
server.addTool({
|
|
434
500
|
name: 'firecrawl_search',
|
|
501
|
+
annotations: {
|
|
502
|
+
title: 'Search the web',
|
|
503
|
+
readOnlyHint: true,
|
|
504
|
+
openWorldHint: true,
|
|
505
|
+
},
|
|
435
506
|
description: `
|
|
436
507
|
Search the web and optionally extract content from search results. This is the most powerful web search tool available, and if available you should always default to using this tool for any web search needs.
|
|
437
508
|
|
|
@@ -454,6 +525,7 @@ The query also supports search operators, that you can use if needed to refine t
|
|
|
454
525
|
**Common mistakes:** Using crawl or map for open-ended questions (use search instead).
|
|
455
526
|
**Prompt Example:** "Find the latest research papers on AI published in 2023."
|
|
456
527
|
**Sources:** web, images, news, default to web unless needed images or news.
|
|
528
|
+
**Domain filters:** Use includeDomains to restrict results to specific domains, or excludeDomains to remove domains. Do not use both in the same request. Domains must be hostnames only, without protocol or path.
|
|
457
529
|
**Scrape Options:** Only use scrapeOptions when you think it is absolutely necessary. When you do so default to a lower limit to avoid timeouts, 5 or lower.
|
|
458
530
|
**Optimal Workflow:** Search first using firecrawl_search without formats, then after fetching the results, use the scrape tool to get the content of the relevantpage(s) that you want to scrape
|
|
459
531
|
|
|
@@ -464,6 +536,7 @@ The query also supports search operators, that you can use if needed to refine t
|
|
|
464
536
|
"arguments": {
|
|
465
537
|
"query": "top AI companies",
|
|
466
538
|
"limit": 5,
|
|
539
|
+
"includeDomains": ["example.com"],
|
|
467
540
|
"sources": [
|
|
468
541
|
{ "type": "web" }
|
|
469
542
|
]
|
|
@@ -493,28 +566,40 @@ The query also supports search operators, that you can use if needed to refine t
|
|
|
493
566
|
\`\`\`
|
|
494
567
|
**Returns:** Array of search results (with optional scraped content).
|
|
495
568
|
`,
|
|
496
|
-
parameters: z
|
|
569
|
+
parameters: z
|
|
570
|
+
.object({
|
|
497
571
|
query: z.string().min(1),
|
|
498
572
|
limit: z.number().optional(),
|
|
499
573
|
tbs: z.string().optional(),
|
|
500
574
|
filter: z.string().optional(),
|
|
501
575
|
location: z.string().optional(),
|
|
576
|
+
includeDomains: z.array(searchDomainSchema).optional(),
|
|
577
|
+
excludeDomains: z.array(searchDomainSchema).optional(),
|
|
502
578
|
sources: z
|
|
503
579
|
.array(z.object({ type: z.enum(['web', 'images', 'news']) }))
|
|
504
580
|
.optional(),
|
|
505
|
-
scrapeOptions: scrapeParamsSchema
|
|
581
|
+
scrapeOptions: scrapeParamsSchema
|
|
582
|
+
.omit({ url: true })
|
|
583
|
+
.partial()
|
|
584
|
+
.optional(),
|
|
506
585
|
enterprise: z.array(z.enum(['default', 'anon', 'zdr'])).optional(),
|
|
507
|
-
})
|
|
586
|
+
})
|
|
587
|
+
.refine((args) => !(args.includeDomains?.length && args.excludeDomains?.length), 'includeDomains and excludeDomains cannot both be specified'),
|
|
508
588
|
execute: async (args, { session, log }) => {
|
|
509
589
|
const client = getClient(session);
|
|
510
590
|
const { query, ...opts } = args;
|
|
511
591
|
const searchOpts = { ...opts };
|
|
592
|
+
const includeDomains = searchOpts.includeDomains;
|
|
593
|
+
const excludeDomains = searchOpts.excludeDomains;
|
|
594
|
+
delete searchOpts.includeDomains;
|
|
595
|
+
delete searchOpts.excludeDomains;
|
|
512
596
|
if (searchOpts.scrapeOptions) {
|
|
513
597
|
searchOpts.scrapeOptions = transformScrapeParams(searchOpts.scrapeOptions);
|
|
514
598
|
}
|
|
515
599
|
const cleaned = removeEmptyTopLevel(searchOpts);
|
|
516
|
-
|
|
517
|
-
|
|
600
|
+
const searchQuery = buildSearchQueryWithDomains(query, includeDomains, excludeDomains);
|
|
601
|
+
log.info('Searching', { query: searchQuery });
|
|
602
|
+
const res = await client.search(searchQuery, {
|
|
518
603
|
...cleaned,
|
|
519
604
|
origin: ORIGIN,
|
|
520
605
|
});
|
|
@@ -523,6 +608,12 @@ The query also supports search operators, that you can use if needed to refine t
|
|
|
523
608
|
});
|
|
524
609
|
server.addTool({
|
|
525
610
|
name: 'firecrawl_crawl',
|
|
611
|
+
annotations: {
|
|
612
|
+
title: 'Start a site crawl',
|
|
613
|
+
readOnlyHint: false,
|
|
614
|
+
openWorldHint: true,
|
|
615
|
+
destructiveHint: false,
|
|
616
|
+
},
|
|
526
617
|
description: `
|
|
527
618
|
Starts a crawl job on a website and extracts content from all pages.
|
|
528
619
|
|
|
@@ -595,6 +686,11 @@ server.addTool({
|
|
|
595
686
|
});
|
|
596
687
|
server.addTool({
|
|
597
688
|
name: 'firecrawl_check_crawl_status',
|
|
689
|
+
annotations: {
|
|
690
|
+
title: 'Get crawl status',
|
|
691
|
+
readOnlyHint: true,
|
|
692
|
+
openWorldHint: false,
|
|
693
|
+
},
|
|
598
694
|
description: `
|
|
599
695
|
Check the status of a crawl job.
|
|
600
696
|
|
|
@@ -618,6 +714,11 @@ Check the status of a crawl job.
|
|
|
618
714
|
});
|
|
619
715
|
server.addTool({
|
|
620
716
|
name: 'firecrawl_extract',
|
|
717
|
+
annotations: {
|
|
718
|
+
title: 'Extract structured data',
|
|
719
|
+
readOnlyHint: true,
|
|
720
|
+
openWorldHint: true,
|
|
721
|
+
},
|
|
621
722
|
description: `
|
|
622
723
|
Extract structured information from web pages using LLM capabilities. Supports both cloud AI and self-hosted LLM extraction.
|
|
623
724
|
|
|
@@ -684,6 +785,12 @@ Extract structured information from web pages using LLM capabilities. Supports b
|
|
|
684
785
|
});
|
|
685
786
|
server.addTool({
|
|
686
787
|
name: 'firecrawl_agent',
|
|
788
|
+
annotations: {
|
|
789
|
+
title: 'Start a research agent',
|
|
790
|
+
readOnlyHint: false,
|
|
791
|
+
openWorldHint: true,
|
|
792
|
+
destructiveHint: false,
|
|
793
|
+
},
|
|
687
794
|
description: `
|
|
688
795
|
Autonomous web research agent. This is a separate AI agent layer that independently browses the internet, searches for information, navigates through pages, and extracts structured data based on your query. You describe what you need, and the agent figures out where to find it.
|
|
689
796
|
|
|
@@ -702,7 +809,11 @@ Autonomous web research agent. This is a separate AI agent layer that independen
|
|
|
702
809
|
- Deep research tasks: 5+ minutes
|
|
703
810
|
|
|
704
811
|
**Best for:** Complex research tasks where you don't know the exact URLs; multi-source data gathering; finding information scattered across the web; extracting data from JavaScript-heavy SPAs that fail with regular scrape.
|
|
705
|
-
**Not recommended for:**
|
|
812
|
+
**Not recommended for:**
|
|
813
|
+
- Single-page extraction when you have a URL (use firecrawl_scrape, faster and cheaper)
|
|
814
|
+
- Web search (use firecrawl_search first)
|
|
815
|
+
- Interactive page tasks like clicking, filling forms, login, or navigating JS-heavy SPAs (use firecrawl_scrape + firecrawl_interact)
|
|
816
|
+
- Extracting specific data from a known page (use firecrawl_scrape with JSON format)
|
|
706
817
|
|
|
707
818
|
**Arguments:**
|
|
708
819
|
- prompt: Natural language description of the data you want (required, max 10,000 characters)
|
|
@@ -775,6 +886,11 @@ Then poll with \`firecrawl_agent_status\` every 15-30 seconds for at least 2-3 m
|
|
|
775
886
|
});
|
|
776
887
|
server.addTool({
|
|
777
888
|
name: 'firecrawl_agent_status',
|
|
889
|
+
annotations: {
|
|
890
|
+
title: 'Get agent job status',
|
|
891
|
+
readOnlyHint: true,
|
|
892
|
+
openWorldHint: false,
|
|
893
|
+
},
|
|
778
894
|
description: `
|
|
779
895
|
Check the status of an agent job and retrieve results when complete. Use this to poll for results after starting an agent with \`firecrawl_agent\`.
|
|
780
896
|
|
|
@@ -809,14 +925,19 @@ Check the status of an agent job and retrieve results when complete. Use this to
|
|
|
809
925
|
return asText(res);
|
|
810
926
|
},
|
|
811
927
|
});
|
|
812
|
-
// Browser session tools
|
|
928
|
+
// Browser session tools (deprecated — prefer firecrawl_scrape + firecrawl_interact)
|
|
813
929
|
server.addTool({
|
|
814
930
|
name: 'firecrawl_browser_create',
|
|
931
|
+
annotations: {
|
|
932
|
+
title: 'Create browser session',
|
|
933
|
+
readOnlyHint: false,
|
|
934
|
+
openWorldHint: false,
|
|
935
|
+
destructiveHint: false,
|
|
936
|
+
},
|
|
815
937
|
description: `
|
|
816
|
-
|
|
938
|
+
**DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.** Interact lets you scrape a page and then click, fill forms, and navigate without managing sessions manually.
|
|
817
939
|
|
|
818
|
-
|
|
819
|
-
**Not recommended for:** Simple page scraping (use firecrawl_scrape instead).
|
|
940
|
+
Create a browser session for code execution via CDP (Chrome DevTools Protocol).
|
|
820
941
|
|
|
821
942
|
**Arguments:**
|
|
822
943
|
- ttl: Total session lifetime in seconds (30-3600, optional)
|
|
@@ -858,10 +979,16 @@ Create a browser session for code execution via CDP (Chrome DevTools Protocol).
|
|
|
858
979
|
if (!SAFE_MODE) {
|
|
859
980
|
server.addTool({
|
|
860
981
|
name: 'firecrawl_browser_execute',
|
|
982
|
+
annotations: {
|
|
983
|
+
title: 'Run code in browser session',
|
|
984
|
+
readOnlyHint: false,
|
|
985
|
+
openWorldHint: false,
|
|
986
|
+
destructiveHint: true,
|
|
987
|
+
},
|
|
861
988
|
description: `
|
|
862
|
-
|
|
989
|
+
**DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.** Interact lets you scrape a page and then click, fill forms, and navigate without managing sessions manually.
|
|
863
990
|
|
|
864
|
-
|
|
991
|
+
Execute code in a browser session. Supports agent-browser commands (bash), Python, or JavaScript.
|
|
865
992
|
**Requires:** An active browser session (create one with firecrawl_browser_create first).
|
|
866
993
|
|
|
867
994
|
**Arguments:**
|
|
@@ -927,7 +1054,15 @@ Execute code in a browser session. Supports agent-browser commands (bash), Pytho
|
|
|
927
1054
|
}
|
|
928
1055
|
server.addTool({
|
|
929
1056
|
name: 'firecrawl_browser_delete',
|
|
1057
|
+
annotations: {
|
|
1058
|
+
title: 'Delete browser session',
|
|
1059
|
+
readOnlyHint: false,
|
|
1060
|
+
openWorldHint: false,
|
|
1061
|
+
destructiveHint: true,
|
|
1062
|
+
},
|
|
930
1063
|
description: `
|
|
1064
|
+
**DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.**
|
|
1065
|
+
|
|
931
1066
|
Destroy a browser session.
|
|
932
1067
|
|
|
933
1068
|
**Usage Example:**
|
|
@@ -954,7 +1089,14 @@ Destroy a browser session.
|
|
|
954
1089
|
});
|
|
955
1090
|
server.addTool({
|
|
956
1091
|
name: 'firecrawl_browser_list',
|
|
1092
|
+
annotations: {
|
|
1093
|
+
title: 'List browser sessions',
|
|
1094
|
+
readOnlyHint: true,
|
|
1095
|
+
openWorldHint: false,
|
|
1096
|
+
},
|
|
957
1097
|
description: `
|
|
1098
|
+
**DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.**
|
|
1099
|
+
|
|
958
1100
|
List browser sessions, optionally filtered by status.
|
|
959
1101
|
|
|
960
1102
|
**Usage Example:**
|
|
@@ -979,6 +1121,304 @@ List browser sessions, optionally filtered by status.
|
|
|
979
1121
|
return asText(res);
|
|
980
1122
|
},
|
|
981
1123
|
});
|
|
1124
|
+
// Interact tools (scrape-bound browser sessions)
|
|
1125
|
+
server.addTool({
|
|
1126
|
+
name: 'firecrawl_interact',
|
|
1127
|
+
annotations: {
|
|
1128
|
+
title: 'Interact with a scraped page',
|
|
1129
|
+
readOnlyHint: false,
|
|
1130
|
+
openWorldHint: true,
|
|
1131
|
+
destructiveHint: false,
|
|
1132
|
+
},
|
|
1133
|
+
description: `
|
|
1134
|
+
Interact with a previously scraped page in a live browser session. Scrape a page first with firecrawl_scrape, then use the returned scrapeId to click buttons, fill forms, extract dynamic content, or navigate deeper.
|
|
1135
|
+
|
|
1136
|
+
**Best for:** Multi-step workflows on a single page — searching a site, clicking through results, filling forms, extracting data that requires interaction.
|
|
1137
|
+
**Requires:** A scrapeId from a previous firecrawl_scrape call (found in the metadata of the scrape response).
|
|
1138
|
+
|
|
1139
|
+
**Arguments:**
|
|
1140
|
+
- scrapeId: The scrape job ID from a previous scrape (required)
|
|
1141
|
+
- prompt: Natural language instruction describing the action to take (use this OR code)
|
|
1142
|
+
- code: Code to execute in the browser session (use this OR prompt)
|
|
1143
|
+
- language: "bash", "python", or "node" (optional, defaults to "node", only used with code)
|
|
1144
|
+
- timeout: Execution timeout in seconds, 1-300 (optional, defaults to 30)
|
|
1145
|
+
|
|
1146
|
+
**Usage Example (prompt):**
|
|
1147
|
+
\`\`\`json
|
|
1148
|
+
{
|
|
1149
|
+
"name": "firecrawl_interact",
|
|
1150
|
+
"arguments": {
|
|
1151
|
+
"scrapeId": "scrape-id-from-previous-scrape",
|
|
1152
|
+
"prompt": "Click on the first product and tell me its price"
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
\`\`\`
|
|
1156
|
+
|
|
1157
|
+
**Usage Example (code):**
|
|
1158
|
+
\`\`\`json
|
|
1159
|
+
{
|
|
1160
|
+
"name": "firecrawl_interact",
|
|
1161
|
+
"arguments": {
|
|
1162
|
+
"scrapeId": "scrape-id-from-previous-scrape",
|
|
1163
|
+
"code": "agent-browser click @e5",
|
|
1164
|
+
"language": "bash"
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
\`\`\`
|
|
1168
|
+
**Returns:** Execution result including output, stdout, stderr, exit code, and live view URLs.
|
|
1169
|
+
`,
|
|
1170
|
+
parameters: z.object({
|
|
1171
|
+
scrapeId: z.string(),
|
|
1172
|
+
prompt: z.string().optional(),
|
|
1173
|
+
code: z.string().optional(),
|
|
1174
|
+
language: z.enum(['bash', 'python', 'node']).optional(),
|
|
1175
|
+
timeout: z.number().min(1).max(300).optional(),
|
|
1176
|
+
}).refine(data => data.code || data.prompt, {
|
|
1177
|
+
message: "Either 'code' or 'prompt' must be provided.",
|
|
1178
|
+
}),
|
|
1179
|
+
execute: async (args, { session, log }) => {
|
|
1180
|
+
const client = getClient(session);
|
|
1181
|
+
const { scrapeId, prompt, code, language, timeout } = args;
|
|
1182
|
+
log.info('Interacting with scraped page', { scrapeId });
|
|
1183
|
+
const interactArgs = { origin: ORIGIN };
|
|
1184
|
+
if (prompt)
|
|
1185
|
+
interactArgs.prompt = prompt;
|
|
1186
|
+
if (code)
|
|
1187
|
+
interactArgs.code = code;
|
|
1188
|
+
if (language)
|
|
1189
|
+
interactArgs.language = language;
|
|
1190
|
+
if (timeout != null)
|
|
1191
|
+
interactArgs.timeout = timeout;
|
|
1192
|
+
const res = await client.interact(scrapeId, interactArgs);
|
|
1193
|
+
return asText(res);
|
|
1194
|
+
},
|
|
1195
|
+
});
|
|
1196
|
+
server.addTool({
|
|
1197
|
+
name: 'firecrawl_interact_stop',
|
|
1198
|
+
annotations: {
|
|
1199
|
+
title: 'Stop interact session',
|
|
1200
|
+
readOnlyHint: false,
|
|
1201
|
+
openWorldHint: false,
|
|
1202
|
+
destructiveHint: true,
|
|
1203
|
+
},
|
|
1204
|
+
description: `
|
|
1205
|
+
Stop an interact session for a scraped page. Call this when you are done interacting to free resources.
|
|
1206
|
+
|
|
1207
|
+
**Usage Example:**
|
|
1208
|
+
\`\`\`json
|
|
1209
|
+
{
|
|
1210
|
+
"name": "firecrawl_interact_stop",
|
|
1211
|
+
"arguments": {
|
|
1212
|
+
"scrapeId": "scrape-id-here"
|
|
1213
|
+
}
|
|
1214
|
+
}
|
|
1215
|
+
\`\`\`
|
|
1216
|
+
**Returns:** Success confirmation.
|
|
1217
|
+
`,
|
|
1218
|
+
parameters: z.object({
|
|
1219
|
+
scrapeId: z.string(),
|
|
1220
|
+
}),
|
|
1221
|
+
execute: async (args, { session, log }) => {
|
|
1222
|
+
const client = getClient(session);
|
|
1223
|
+
const { scrapeId } = args;
|
|
1224
|
+
log.info('Stopping interact session', { scrapeId });
|
|
1225
|
+
const res = await client.stopInteraction(scrapeId);
|
|
1226
|
+
return asText(res);
|
|
1227
|
+
},
|
|
1228
|
+
});
|
|
1229
|
+
// Local-only: parse a local file via the self-hosted Firecrawl /v2/parse endpoint.
|
|
1230
|
+
// The parse endpoint is only exposed on self-hosted/local Firecrawl API deployments,
|
|
1231
|
+
// so this tool is registered only when the MCP is NOT running in cloud mode.
|
|
1232
|
+
if (process.env.CLOUD_SERVICE !== 'true') {
|
|
1233
|
+
// Input schema for the firecrawl_parse tool: the local file to upload, an
// optional MIME override, and the options forwarded with the upload.
const parseParamsSchema = z.object({
    // File selection.
    filePath: z.string().min(1).describe('Absolute or relative path to a local file to parse. Supported: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls'),
    contentType: z.string().optional().describe('Optional MIME type override. If omitted, the server infers the file kind from the extension.'),
    // Requested output formats and their per-format options.
    formats: z
        .array(z.enum(['markdown', 'html', 'rawHtml', 'links', 'summary', 'json', 'query']))
        .optional(),
    jsonOptions: z
        .object({ prompt: z.string().optional(), schema: z.record(z.string(), z.any()).optional() })
        .optional(),
    queryOptions: z.object({ prompt: z.string().max(10000) }).optional(),
    // PDF handling.
    parsers: z.array(z.enum(['pdf'])).optional(),
    pdfOptions: z
        .object({ maxPages: z.number().int().min(1).max(10000).optional() })
        .optional(),
    // Content filtering and request behaviour.
    onlyMainContent: z.boolean().optional(),
    includeTags: z.array(z.string()).optional(),
    excludeTags: z.array(z.string()).optional(),
    removeBase64Images: z.boolean().optional(),
    skipTlsVerification: z.boolean().optional(),
    storeInCache: z.boolean().optional(),
    zeroDataRetention: z.boolean().optional(),
    maxAge: z.number().optional(),
    proxy: z.enum(['basic', 'auto']).optional(),
});
|
|
1280
|
+
// Lower-case file extension (including the leading dot) -> MIME type sent with the upload.
// NOTE(review): the firecrawl_parse description lists .xhtml as a supported type, but
// there is no entry for it here, so such files are sent as application/octet-stream —
// confirm whether that is intended.
const EXTENSION_CONTENT_TYPES = {
    // Web pages
    '.html': 'text/html',
    '.htm': 'text/html',
    // Portable documents
    '.pdf': 'application/pdf',
    // Word-processing documents
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.doc': 'application/msword',
    '.odt': 'application/vnd.oasis.opendocument.text',
    '.rtf': 'application/rtf',
    // Spreadsheets
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.xls': 'application/vnd.ms-excel',
};
/**
 * Picks a MIME type for a file based on its extension.
 *
 * @param {string} filename - File name or path; only its extension is examined,
 *   case-insensitively.
 * @returns {string} The mapped MIME type, or 'application/octet-stream' when the
 *   extension is missing or not in the table.
 */
function inferContentType(filename) {
    const extension = path.extname(filename).toLowerCase();
    const knownType = EXTENSION_CONTENT_TYPES[extension];
    if (knownType === undefined || knownType === null) {
        return 'application/octet-stream';
    }
    return knownType;
}
|
|
1295
|
+
server.addTool({
|
|
1296
|
+
name: 'firecrawl_parse',
|
|
1297
|
+
annotations: {
|
|
1298
|
+
title: 'Parse a local file',
|
|
1299
|
+
readOnlyHint: true,
|
|
1300
|
+
openWorldHint: false,
|
|
1301
|
+
},
|
|
1302
|
+
description: `
|
|
1303
|
+
Parse a file from the local filesystem using a self-hosted Firecrawl API's /v2/parse endpoint.
|
|
1304
|
+
This is the fastest and most reliable way to extract content from a document on disk — if the file lives locally and the MCP is pointed at a self-hosted Firecrawl instance, you should always prefer this tool over uploading the file elsewhere and then scraping it.
|
|
1305
|
+
|
|
1306
|
+
**Best for:** Extracting content from a local document (PDF, Word, Excel, HTML, etc.) when you don't want to host it on the public web first; pulling structured data out of a file with JSON format; converting binary documents into markdown for downstream reasoning.
|
|
1307
|
+
**Not recommended for:** Remote URLs (use firecrawl_scrape); multiple files at once (call parse multiple times); documents that require interactive actions, screenshots, or change tracking — those aren't supported by the parse endpoint.
|
|
1308
|
+
**Common mistakes:** Passing a URL instead of a local file path; requesting an unsupported format (screenshot, branding, changeTracking); setting waitFor, location, mobile, or a non-basic/auto proxy — parse uploads reject all of those.
|
|
1309
|
+
|
|
1310
|
+
**Supported file types:** .html, .htm, .xhtml, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
|
|
1311
|
+
**Unsupported options:** actions, screenshot/branding/changeTracking formats, waitFor > 0, location, mobile, proxy values other than "auto" or "basic".
|
|
1312
|
+
|
|
1313
|
+
**CRITICAL - Format Selection (same rules as firecrawl_scrape):**
|
|
1314
|
+
When the user asks for SPECIFIC data points from a document, you MUST use JSON format with a schema. Only use markdown when the user needs the ENTIRE document content.
|
|
1315
|
+
|
|
1316
|
+
**Use JSON format when the user asks for:**
|
|
1317
|
+
- Specific fields, parameters, or values from a form / PDF / spreadsheet
|
|
1318
|
+
- Prices, numbers, or other structured data
|
|
1319
|
+
- Lists of items or properties
|
|
1320
|
+
|
|
1321
|
+
**Use markdown format when:**
|
|
1322
|
+
- User wants to read, summarize, or analyze the full document
|
|
1323
|
+
- User explicitly asks for the complete content
|
|
1324
|
+
|
|
1325
|
+
**Handling PDFs:**
|
|
1326
|
+
Add \`"parsers": ["pdf"]\` (optionally with \`pdfOptions.maxPages\`) when parsing a PDF so the PDF engine is invoked explicitly. For very long documents, cap \`maxPages\` to keep the response within token limits.
|
|
1327
|
+
|
|
1328
|
+
**Usage Example (markdown from a local PDF):**
|
|
1329
|
+
\`\`\`json
|
|
1330
|
+
{
|
|
1331
|
+
"name": "firecrawl_parse",
|
|
1332
|
+
"arguments": {
|
|
1333
|
+
"filePath": "/absolute/path/to/document.pdf",
|
|
1334
|
+
"formats": ["markdown"],
|
|
1335
|
+
"parsers": ["pdf"],
|
|
1336
|
+
"onlyMainContent": true
|
|
1337
|
+
}
|
|
1338
|
+
}
|
|
1339
|
+
\`\`\`
|
|
1340
|
+
|
|
1341
|
+
**Usage Example (structured JSON extraction from a local HTML file):**
|
|
1342
|
+
\`\`\`json
|
|
1343
|
+
{
|
|
1344
|
+
"name": "firecrawl_parse",
|
|
1345
|
+
"arguments": {
|
|
1346
|
+
"filePath": "./invoice.html",
|
|
1347
|
+
"formats": ["json"],
|
|
1348
|
+
"jsonOptions": {
|
|
1349
|
+
"prompt": "Extract the invoice number, total, and line items",
|
|
1350
|
+
"schema": {
|
|
1351
|
+
"type": "object",
|
|
1352
|
+
"properties": {
|
|
1353
|
+
"invoiceNumber": { "type": "string" },
|
|
1354
|
+
"total": { "type": "number" },
|
|
1355
|
+
"lineItems": {
|
|
1356
|
+
"type": "array",
|
|
1357
|
+
"items": {
|
|
1358
|
+
"type": "object",
|
|
1359
|
+
"properties": {
|
|
1360
|
+
"description": { "type": "string" },
|
|
1361
|
+
"amount": { "type": "number" }
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
}
|
|
1370
|
+
\`\`\`
|
|
1371
|
+
**Returns:** A parsed document with markdown, html, links, summary, json, or query results depending on the requested formats.
|
|
1372
|
+
`,
|
|
1373
|
+
parameters: parseParamsSchema,
|
|
1374
|
+
execute: async (args, { session, log }) => {
|
|
1375
|
+
const apiUrl = process.env.FIRECRAWL_API_URL;
|
|
1376
|
+
if (!apiUrl) {
|
|
1377
|
+
throw new Error('firecrawl_parse requires FIRECRAWL_API_URL to be set to a self-hosted Firecrawl API instance.');
|
|
1378
|
+
}
|
|
1379
|
+
const { filePath, contentType: overrideContentType, ...options } = args;
|
|
1380
|
+
const absPath = path.resolve(filePath);
|
|
1381
|
+
const buffer = await readFile(absPath);
|
|
1382
|
+
const filename = path.basename(absPath);
|
|
1383
|
+
const fileContentType = overrideContentType && overrideContentType.length > 0
|
|
1384
|
+
? overrideContentType
|
|
1385
|
+
: inferContentType(filename);
|
|
1386
|
+
const transformed = transformScrapeParams(options);
|
|
1387
|
+
const cleaned = removeEmptyTopLevel(transformed);
|
|
1388
|
+
const optionsPayload = { origin: ORIGIN, ...cleaned };
|
|
1389
|
+
const form = new FormData();
|
|
1390
|
+
const blob = new Blob([new Uint8Array(buffer)], { type: fileContentType });
|
|
1391
|
+
form.append('file', blob, filename);
|
|
1392
|
+
form.append('options', JSON.stringify(optionsPayload));
|
|
1393
|
+
const headers = {};
|
|
1394
|
+
const apiKey = session?.firecrawlApiKey;
|
|
1395
|
+
if (apiKey) {
|
|
1396
|
+
headers['Authorization'] = `Bearer ${apiKey}`;
|
|
1397
|
+
}
|
|
1398
|
+
const endpoint = `${apiUrl.replace(/\/$/, '')}/v2/parse`;
|
|
1399
|
+
log.info('Parsing local file', {
|
|
1400
|
+
endpoint,
|
|
1401
|
+
filename,
|
|
1402
|
+
size: buffer.length,
|
|
1403
|
+
});
|
|
1404
|
+
const response = await fetch(endpoint, {
|
|
1405
|
+
method: 'POST',
|
|
1406
|
+
headers,
|
|
1407
|
+
body: form,
|
|
1408
|
+
});
|
|
1409
|
+
const responseText = await response.text();
|
|
1410
|
+
if (!response.ok) {
|
|
1411
|
+
throw new Error(`Parse request failed with status ${response.status}: ${responseText}`);
|
|
1412
|
+
}
|
|
1413
|
+
try {
|
|
1414
|
+
return asText(JSON.parse(responseText));
|
|
1415
|
+
}
|
|
1416
|
+
catch {
|
|
1417
|
+
return responseText;
|
|
1418
|
+
}
|
|
1419
|
+
},
|
|
1420
|
+
});
|
|
1421
|
+
}
|
|
982
1422
|
const PORT = Number(process.env.PORT || 3000);
|
|
983
1423
|
const HOST = process.env.CLOUD_SERVICE === 'true'
|
|
984
1424
|
? '0.0.0.0'
|