firecrawl-mcp 3.10.3 → 3.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3,6 +3,8 @@ import dotenv from 'dotenv';
  import { FastMCP } from 'firecrawl-fastmcp';
  import { z } from 'zod';
  import FirecrawlApp from '@mendable/firecrawl-js';
+ import { readFile } from 'node:fs/promises';
+ import path from 'node:path';
  dotenv.config({ debug: false, quiet: true });
  function extractApiKey(headers) {
      const headerAuth = headers['authorization'];
@@ -35,6 +37,24 @@ function removeEmptyTopLevel(obj) {
      }
      return out;
  }
+ const searchDomainSchema = z
+     .string()
+     .trim()
+     .toLowerCase()
+     .regex(/^(?=.{1,253}$)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]$/, 'Domain must be a valid hostname without protocol or path');
+ function buildSearchQueryWithDomains(query, includeDomains, excludeDomains) {
+     if (includeDomains?.length) {
+         return `${query} (${includeDomains
+             .map((domain) => `site:${domain}`)
+             .join(' OR ')})`;
+     }
+     if (excludeDomains?.length) {
+         return `${query} ${excludeDomains
+             .map((domain) => `-site:${domain}`)
+             .join(' ')}`;
+     }
+     return query;
+ }
  class ConsoleLogger {
      shouldLog = process.env.CLOUD_SERVICE === 'true' ||
          process.env.SSE_LOCAL === 'true' ||
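
To make the new helper concrete, here is how it rewrites queries with `site:` operators. A small sketch; the query and domain values are illustrative, not from the package:

```js
// Illustrative calls to buildSearchQueryWithDomains (defined in the hunk above).
buildSearchQueryWithDomains('top AI companies', ['example.com', 'example.org'], undefined);
// => "top AI companies (site:example.com OR site:example.org)"

buildSearchQueryWithDomains('top AI companies', undefined, ['example.net']);
// => "top AI companies -site:example.net"

buildSearchQueryWithDomains('top AI companies', [], []);
// => "top AI companies" (empty arrays fall through to the unmodified query)
```

Note that `includeDomains` takes precedence if both lists are somehow present; the schema-level `.refine` added further down prevents that case at validation time.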
@@ -152,6 +172,10 @@ function buildFormatsArray(args) {
          const jsonOpts = args.jsonOptions;
          result.push({ type: 'json', ...jsonOpts });
      }
+     else if (fmt === 'query') {
+         const queryOpts = args.queryOptions;
+         result.push({ type: 'query', ...queryOpts });
+     }
      else if (fmt === 'screenshot' && args.screenshotOptions) {
          const ssOpts = args.screenshotOptions;
          result.push({ type: 'screenshot', ...ssOpts });
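
For clarity, the new branch expands `queryOptions` into a typed format entry. A minimal sketch, assuming args shaped like the scrape tool's input:

```js
// Sketch only: what the 'query' branch contributes to the formats array.
const args = { formats: ['query'], queryOptions: { prompt: 'What is the refund window?' } };
const entry = { type: 'query', ...args.queryOptions };
// entry => { type: 'query', prompt: 'What is the refund window?' }
```

Unlike the screenshot branch, this one does not guard on `args.queryOptions` being present, so a bare `'query'` format yields `{ type: 'query' }` with no prompt.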
@@ -197,6 +221,7 @@ function transformScrapeParams(args) {
      if (parsers)
          out.parsers = parsers;
      delete out.jsonOptions;
+     delete out.queryOptions;
      delete out.screenshotOptions;
      delete out.pdfOptions;
      return out;
@@ -214,6 +239,8 @@ const scrapeParamsSchema = z.object({
          'changeTracking',
          'branding',
          'json',
+         'query',
+         'audio',
      ]))
          .optional(),
      jsonOptions: z
@@ -222,6 +249,11 @@ const scrapeParamsSchema = z.object({
              schema: z.record(z.string(), z.any()).optional(),
          })
          .optional(),
+     queryOptions: z
+         .object({
+             prompt: z.string().max(10000),
+         })
+         .optional(),
      screenshotOptions: z
          .object({
              fullPage: z.boolean().optional(),
@@ -269,10 +301,22 @@ const scrapeParamsSchema = z.object({
      storeInCache: z.boolean().optional(),
      zeroDataRetention: z.boolean().optional(),
      maxAge: z.number().optional(),
+     lockdown: z.boolean().optional(),
      proxy: z.enum(['basic', 'stealth', 'enhanced', 'auto']).optional(),
+     profile: z
+         .object({
+             name: z.string(),
+             saveChanges: z.boolean().optional(),
+         })
+         .optional(),
  });
  server.addTool({
      name: 'firecrawl_scrape',
+     annotations: {
+         title: 'Scrape a URL',
+         readOnlyHint: SAFE_MODE,
+         openWorldHint: true,
+     },
      description: `
  Scrape content from a single URL with advanced options.
  This is the most powerful, fastest, and most reliable scraper tool; if it is available, you should default to it for any web scraping needs.
@@ -335,7 +379,18 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
  }
  }
  \`\`\`
- **Usage Example (markdown format - ONLY when full content genuinely needed):**
+
+ **Prefer markdown format by default.** You can read and reason over the full page content directly — no need for an intermediate query step. Use markdown for questions about page content, factual lookups, and any task where you need to understand the page.
+
+ **Use JSON format when the user needs:**
+ - Structured data with specific fields (extract all products with name, price, description)
+ - Data in a specific schema for downstream processing
+
+ **Use query format only when:**
+ - The page is extremely long and you need a single targeted answer without processing the full content
+ - You want a quick factual answer and don't need to retain the page content
+
+ **Usage Example (markdown format - default for most tasks):**
  \`\`\`json
  {
    "name": "firecrawl_scrape",
@@ -358,6 +413,7 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
  \`\`\`
  **Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
  **Performance:** Add maxAge parameter for 500% faster scrapes using cached data.
+ **Lockdown mode:** Set \`lockdown: true\` to serve the request only from the existing index/cache without any outbound network request. Useful for air-gapped or compliance-constrained environments where the request URL itself is considered sensitive. Errors on cache miss. Billed at 5 credits.
  **Returns:** JSON structured data, markdown, branding profile, or other formats as specified.
  ${SAFE_MODE
      ? '**Safe Mode:** Read-only content extraction. Interactive actions (click, write, executeJavascript) are disabled for security.'
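
For reference, a hedged sketch of scrape arguments that exercise the new `lockdown` and `profile` options; the URL, profile name, and `maxAge` value are placeholders, and the profile semantics are assumed from the schema shape:

```js
// Illustrative firecrawl_scrape arguments using the options added in this release.
const lockdownArgs = {
    url: 'https://example.com/policy', // placeholder URL
    formats: ['markdown'],
    lockdown: true,    // serve from the existing index/cache only; errors on cache miss
    maxAge: 86400000,  // accept cached data up to a day old
};
const profileArgs = {
    url: 'https://example.com/account', // placeholder URL
    profile: { name: 'my-profile', saveChanges: true }, // assumed: reuse named browser state
};
```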
@@ -369,7 +425,12 @@ ${SAFE_MODE
          const client = getClient(session);
          const transformed = transformScrapeParams(options);
          const cleaned = removeEmptyTopLevel(transformed);
-         log.info('Scraping URL', { url: String(url) });
+         if (cleaned.lockdown) {
+             log.info('Scraping URL (lockdown)');
+         }
+         else {
+             log.info('Scraping URL', { url: String(url) });
+         }
          const res = await client.scrape(String(url), {
              ...cleaned,
              origin: ORIGIN,
@@ -379,6 +440,11 @@ ${SAFE_MODE
  });
  server.addTool({
      name: 'firecrawl_map',
+     annotations: {
+         title: 'Map a website',
+         readOnlyHint: true,
+         openWorldHint: true,
+     },
      description: `
  Map a website to discover all indexed URLs on the site.

@@ -432,6 +498,11 @@ Map a website to discover all indexed URLs on the site.
  });
  server.addTool({
      name: 'firecrawl_search',
+     annotations: {
+         title: 'Search the web',
+         readOnlyHint: true,
+         openWorldHint: true,
+     },
      description: `
  Search the web and optionally extract content from search results. This is the most powerful web search tool available; if it is available, you should default to it for any web search needs.

@@ -454,6 +525,7 @@ The query also supports search operators, that you can use if needed to refine t
  **Common mistakes:** Using crawl or map for open-ended questions (use search instead).
  **Prompt Example:** "Find the latest research papers on AI published in 2023."
  **Sources:** web, images, news. Default to web unless images or news are needed.
+ **Domain filters:** Use includeDomains to restrict results to specific domains, or excludeDomains to remove domains. Do not use both in the same request. Domains must be hostnames only, without protocol or path.
  **Scrape Options:** Only use scrapeOptions when absolutely necessary. When you do, default to a low limit (5 or lower) to avoid timeouts.
  **Optimal Workflow:** Search first using firecrawl_search without formats, then after fetching the results, use the scrape tool to get the content of the relevant page(s) that you want to scrape.

@@ -464,6 +536,7 @@ The query also supports search operators, that you can use if needed to refine t
    "arguments": {
      "query": "top AI companies",
      "limit": 5,
+     "includeDomains": ["example.com"],
      "sources": [
        { "type": "web" }
      ]
@@ -493,28 +566,40 @@ The query also supports search operators, that you can use if needed to refine t
  \`\`\`
  **Returns:** Array of search results (with optional scraped content).
  `,
-     parameters: z.object({
+     parameters: z
+         .object({
          query: z.string().min(1),
          limit: z.number().optional(),
          tbs: z.string().optional(),
          filter: z.string().optional(),
          location: z.string().optional(),
+         includeDomains: z.array(searchDomainSchema).optional(),
+         excludeDomains: z.array(searchDomainSchema).optional(),
          sources: z
              .array(z.object({ type: z.enum(['web', 'images', 'news']) }))
              .optional(),
-         scrapeOptions: scrapeParamsSchema.omit({ url: true }).partial().optional(),
+         scrapeOptions: scrapeParamsSchema
+             .omit({ url: true })
+             .partial()
+             .optional(),
          enterprise: z.array(z.enum(['default', 'anon', 'zdr'])).optional(),
-     }),
+     })
+         .refine((args) => !(args.includeDomains?.length && args.excludeDomains?.length), 'includeDomains and excludeDomains cannot both be specified'),
      execute: async (args, { session, log }) => {
          const client = getClient(session);
          const { query, ...opts } = args;
          const searchOpts = { ...opts };
+         const includeDomains = searchOpts.includeDomains;
+         const excludeDomains = searchOpts.excludeDomains;
+         delete searchOpts.includeDomains;
+         delete searchOpts.excludeDomains;
          if (searchOpts.scrapeOptions) {
              searchOpts.scrapeOptions = transformScrapeParams(searchOpts.scrapeOptions);
          }
          const cleaned = removeEmptyTopLevel(searchOpts);
-         log.info('Searching', { query: String(query) });
-         const res = await client.search(query, {
+         const searchQuery = buildSearchQueryWithDomains(query, includeDomains, excludeDomains);
+         log.info('Searching', { query: searchQuery });
+         const res = await client.search(searchQuery, {
              ...cleaned,
              origin: ORIGIN,
          });
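
Because the include/exclude constraint lives in the `.refine` above rather than on either field, it only fires when both lists are non-empty. A sketch; `searchToolParams` is a hypothetical standalone name for the inline parameters schema, trimmed to the relevant fields:

```js
// Hypothetical: name the inline schema so the refine can be exercised directly.
const searchToolParams = z
    .object({
        query: z.string().min(1),
        includeDomains: z.array(searchDomainSchema).optional(),
        excludeDomains: z.array(searchDomainSchema).optional(),
    })
    .refine((args) => !(args.includeDomains?.length && args.excludeDomains?.length), 'includeDomains and excludeDomains cannot both be specified');

searchToolParams.safeParse({ query: 'ai news', includeDomains: ['example.com'] }).success;
// => true
searchToolParams.safeParse({
    query: 'ai news',
    includeDomains: ['example.com'],
    excludeDomains: ['example.net'],
}).success;
// => false ('includeDomains and excludeDomains cannot both be specified')
```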
@@ -523,6 +608,12 @@ The query also supports search operators, that you can use if needed to refine t
  });
  server.addTool({
      name: 'firecrawl_crawl',
+     annotations: {
+         title: 'Start a site crawl',
+         readOnlyHint: false,
+         openWorldHint: true,
+         destructiveHint: false,
+     },
      description: `
  Starts a crawl job on a website and extracts content from all pages.

@@ -595,6 +686,11 @@ server.addTool({
  });
  server.addTool({
      name: 'firecrawl_check_crawl_status',
+     annotations: {
+         title: 'Get crawl status',
+         readOnlyHint: true,
+         openWorldHint: false,
+     },
      description: `
  Check the status of a crawl job.

@@ -618,6 +714,11 @@ Check the status of a crawl job.
  });
  server.addTool({
      name: 'firecrawl_extract',
+     annotations: {
+         title: 'Extract structured data',
+         readOnlyHint: true,
+         openWorldHint: true,
+     },
      description: `
  Extract structured information from web pages using LLM capabilities. Supports both cloud AI and self-hosted LLM extraction.

@@ -684,6 +785,12 @@ Extract structured information from web pages using LLM capabilities. Supports b
  });
  server.addTool({
      name: 'firecrawl_agent',
+     annotations: {
+         title: 'Start a research agent',
+         readOnlyHint: false,
+         openWorldHint: true,
+         destructiveHint: false,
+     },
      description: `
  Autonomous web research agent. This is a separate AI agent layer that independently browses the internet, searches for information, navigates through pages, and extracts structured data based on your query. You describe what you need, and the agent figures out where to find it.

@@ -702,7 +809,11 @@ Autonomous web research agent. This is a separate AI agent layer that independen
  - Deep research tasks: 5+ minutes

  **Best for:** Complex research tasks where you don't know the exact URLs; multi-source data gathering; finding information scattered across the web; extracting data from JavaScript-heavy SPAs that fail with regular scrape.
- **Not recommended for:** Simple single-page scraping where you know the URL (use scrape with JSON format instead - faster and cheaper).
+ **Not recommended for:**
+ - Single-page extraction when you have a URL (use firecrawl_scrape, faster and cheaper)
+ - Web search (use firecrawl_search first)
+ - Interactive page tasks like clicking, filling forms, logging in, or navigating JS-heavy SPAs (use firecrawl_scrape + firecrawl_interact)
+ - Extracting specific data from a known page (use firecrawl_scrape with JSON format)

  **Arguments:**
  - prompt: Natural language description of the data you want (required, max 10,000 characters)
@@ -775,6 +886,11 @@ Then poll with \`firecrawl_agent_status\` every 15-30 seconds for at least 2-3 m
  });
  server.addTool({
      name: 'firecrawl_agent_status',
+     annotations: {
+         title: 'Get agent job status',
+         readOnlyHint: true,
+         openWorldHint: false,
+     },
      description: `
  Check the status of an agent job and retrieve results when complete. Use this to poll for results after starting an agent with \`firecrawl_agent\`.

@@ -809,14 +925,19 @@ Check the status of an agent job and retrieve results when complete. Use this to
          return asText(res);
      },
  });
- // Browser session tools
+ // Browser session tools (deprecated — prefer firecrawl_scrape + firecrawl_interact)
  server.addTool({
      name: 'firecrawl_browser_create',
+     annotations: {
+         title: 'Create browser session',
+         readOnlyHint: false,
+         openWorldHint: false,
+         destructiveHint: false,
+     },
      description: `
- Create a browser session for code execution via CDP (Chrome DevTools Protocol).
+ **DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.** Interact lets you scrape a page and then click, fill forms, and navigate without managing sessions manually.

- **Best for:** Running code (Python/JS) that interacts with a live browser page, multi-step browser automation, sessions with profiles that survive across multiple tool calls.
- **Not recommended for:** Simple page scraping (use firecrawl_scrape instead).
+ Create a browser session for code execution via CDP (Chrome DevTools Protocol).

  **Arguments:**
  - ttl: Total session lifetime in seconds (30-3600, optional)
@@ -858,10 +979,16 @@ Create a browser session for code execution via CDP (Chrome DevTools Protocol).
  if (!SAFE_MODE) {
      server.addTool({
          name: 'firecrawl_browser_execute',
+         annotations: {
+             title: 'Run code in browser session',
+             readOnlyHint: false,
+             openWorldHint: false,
+             destructiveHint: true,
+         },
          description: `
- Execute code in a browser session. Supports agent-browser commands (bash), Python, or JavaScript.
+ **DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.** Interact lets you scrape a page and then click, fill forms, and navigate without managing sessions manually.

- **Best for:** Browser automation, navigating pages, clicking elements, extracting data, multi-step browser workflows.
+ Execute code in a browser session. Supports agent-browser commands (bash), Python, or JavaScript.
  **Requires:** An active browser session (create one with firecrawl_browser_create first).

  **Arguments:**
@@ -927,7 +1054,15 @@ Execute code in a browser session. Supports agent-browser commands (bash), Pytho
  }
  server.addTool({
      name: 'firecrawl_browser_delete',
+     annotations: {
+         title: 'Delete browser session',
+         readOnlyHint: false,
+         openWorldHint: false,
+         destructiveHint: true,
+     },
      description: `
+ **DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.**
+
  Destroy a browser session.

  **Usage Example:**
@@ -954,7 +1089,14 @@ Destroy a browser session.
  });
  server.addTool({
      name: 'firecrawl_browser_list',
+     annotations: {
+         title: 'List browser sessions',
+         readOnlyHint: true,
+         openWorldHint: false,
+     },
      description: `
+ **DEPRECATED — prefer firecrawl_scrape + firecrawl_interact instead.**
+
  List browser sessions, optionally filtered by status.

  **Usage Example:**
@@ -979,6 +1121,304 @@ List browser sessions, optionally filtered by status.
          return asText(res);
      },
  });
+ // Interact tools (scrape-bound browser sessions)
+ server.addTool({
+     name: 'firecrawl_interact',
+     annotations: {
+         title: 'Interact with a scraped page',
+         readOnlyHint: false,
+         openWorldHint: true,
+         destructiveHint: false,
+     },
+     description: `
+ Interact with a previously scraped page in a live browser session. Scrape a page first with firecrawl_scrape, then use the returned scrapeId to click buttons, fill forms, extract dynamic content, or navigate deeper.
+
+ **Best for:** Multi-step workflows on a single page — searching a site, clicking through results, filling forms, extracting data that requires interaction.
+ **Requires:** A scrapeId from a previous firecrawl_scrape call (found in the metadata of the scrape response).
+
+ **Arguments:**
+ - scrapeId: The scrape job ID from a previous scrape (required)
+ - prompt: Natural language instruction describing the action to take (use this OR code)
+ - code: Code to execute in the browser session (use this OR prompt)
+ - language: "bash", "python", or "node" (optional, defaults to "node", only used with code)
+ - timeout: Execution timeout in seconds, 1-300 (optional, defaults to 30)
+
+ **Usage Example (prompt):**
+ \`\`\`json
+ {
+   "name": "firecrawl_interact",
+   "arguments": {
+     "scrapeId": "scrape-id-from-previous-scrape",
+     "prompt": "Click on the first product and tell me its price"
+   }
+ }
+ \`\`\`
+
+ **Usage Example (code):**
+ \`\`\`json
+ {
+   "name": "firecrawl_interact",
+   "arguments": {
+     "scrapeId": "scrape-id-from-previous-scrape",
+     "code": "agent-browser click @e5",
+     "language": "bash"
+   }
+ }
+ \`\`\`
+ **Returns:** Execution result including output, stdout, stderr, exit code, and live view URLs.
+ `,
+     parameters: z.object({
+         scrapeId: z.string(),
+         prompt: z.string().optional(),
+         code: z.string().optional(),
+         language: z.enum(['bash', 'python', 'node']).optional(),
+         timeout: z.number().min(1).max(300).optional(),
+     }).refine(data => data.code || data.prompt, {
+         message: "Either 'code' or 'prompt' must be provided.",
+     }),
+     execute: async (args, { session, log }) => {
+         const client = getClient(session);
+         const { scrapeId, prompt, code, language, timeout } = args;
+         log.info('Interacting with scraped page', { scrapeId });
+         const interactArgs = { origin: ORIGIN };
+         if (prompt)
+             interactArgs.prompt = prompt;
+         if (code)
+             interactArgs.code = code;
+         if (language)
+             interactArgs.language = language;
+         if (timeout != null)
+             interactArgs.timeout = timeout;
+         const res = await client.interact(scrapeId, interactArgs);
+         return asText(res);
+     },
+ });
+ server.addTool({
+     name: 'firecrawl_interact_stop',
+     annotations: {
+         title: 'Stop interact session',
+         readOnlyHint: false,
+         openWorldHint: false,
+         destructiveHint: true,
+     },
+     description: `
+ Stop an interact session for a scraped page. Call this when you are done interacting to free resources.
+
+ **Usage Example:**
+ \`\`\`json
+ {
+   "name": "firecrawl_interact_stop",
+   "arguments": {
+     "scrapeId": "scrape-id-here"
+   }
+ }
+ \`\`\`
+ **Returns:** Success confirmation.
+ `,
+     parameters: z.object({
+         scrapeId: z.string(),
+     }),
+     execute: async (args, { session, log }) => {
+         const client = getClient(session);
+         const { scrapeId } = args;
+         log.info('Stopping interact session', { scrapeId });
+         const res = await client.stopInteraction(scrapeId);
+         return asText(res);
+     },
+ });
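
End to end, the intended lifecycle is scrape, then interact, then stop. A sketch against the firecrawl-js client methods used above; the URL and prompt are placeholders, and the exact field holding the scrape ID is assumed from the tool description ("found in the metadata of the scrape response"):

```js
// Sketch of the scrape -> interact -> stop flow (illustrative values).
const doc = await client.scrape('https://example.com/shop', { formats: ['markdown'] });
const scrapeId = doc.metadata?.scrapeId; // assumed location, per the description above
const result = await client.interact(scrapeId, {
    prompt: 'Click on the first product and tell me its price',
});
await client.stopInteraction(scrapeId); // free the session once done
```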
+ // Local-only: parse a local file via the self-hosted Firecrawl /v2/parse endpoint.
+ // The parse endpoint is only exposed on self-hosted/local Firecrawl API deployments,
+ // so this tool is registered only when the MCP is NOT running in cloud mode.
+ if (process.env.CLOUD_SERVICE !== 'true') {
+     const parseParamsSchema = z.object({
+         filePath: z
+             .string()
+             .min(1)
+             .describe('Absolute or relative path to a local file to parse. Supported: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls'),
+         contentType: z
+             .string()
+             .optional()
+             .describe('Optional MIME type override. If omitted, the server infers the file kind from the extension.'),
+         formats: z
+             .array(z.enum([
+                 'markdown',
+                 'html',
+                 'rawHtml',
+                 'links',
+                 'summary',
+                 'json',
+                 'query',
+             ]))
+             .optional(),
+         jsonOptions: z
+             .object({
+                 prompt: z.string().optional(),
+                 schema: z.record(z.string(), z.any()).optional(),
+             })
+             .optional(),
+         queryOptions: z
+             .object({
+                 prompt: z.string().max(10000),
+             })
+             .optional(),
+         parsers: z.array(z.enum(['pdf'])).optional(),
+         pdfOptions: z
+             .object({
+                 maxPages: z.number().int().min(1).max(10000).optional(),
+             })
+             .optional(),
+         onlyMainContent: z.boolean().optional(),
+         includeTags: z.array(z.string()).optional(),
+         excludeTags: z.array(z.string()).optional(),
+         removeBase64Images: z.boolean().optional(),
+         skipTlsVerification: z.boolean().optional(),
+         storeInCache: z.boolean().optional(),
+         zeroDataRetention: z.boolean().optional(),
+         maxAge: z.number().optional(),
+         proxy: z.enum(['basic', 'auto']).optional(),
+     });
+     const EXTENSION_CONTENT_TYPES = {
+         '.html': 'text/html',
+         '.htm': 'text/html',
+         '.pdf': 'application/pdf',
+         '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+         '.doc': 'application/msword',
+         '.odt': 'application/vnd.oasis.opendocument.text',
+         '.rtf': 'application/rtf',
+         '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+         '.xls': 'application/vnd.ms-excel',
+     };
+     function inferContentType(filename) {
+         const ext = path.extname(filename).toLowerCase();
+         return EXTENSION_CONTENT_TYPES[ext] ?? 'application/octet-stream';
+     }
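
As a quick illustration of the inference and its fallback (filenames here are made up):

```js
inferContentType('report.PDF'); // 'application/pdf' (the extension is lowercased first)
inferContentType('notes.txt');  // 'application/octet-stream' (not in the extension map)
```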
+     server.addTool({
+         name: 'firecrawl_parse',
+         annotations: {
+             title: 'Parse a local file',
+             readOnlyHint: true,
+             openWorldHint: false,
+         },
+         description: `
+ Parse a file from the local filesystem using a self-hosted Firecrawl API's /v2/parse endpoint.
+ This is the fastest and most reliable way to extract content from a document on disk — if the file lives locally and the MCP is pointed at a self-hosted Firecrawl instance, you should always prefer this tool over uploading the file elsewhere and then scraping it.
+
+ **Best for:** Extracting content from a local document (PDF, Word, Excel, HTML, etc.) when you don't want to host it on the public web first; pulling structured data out of a file with JSON format; converting binary documents into markdown for downstream reasoning.
+ **Not recommended for:** Remote URLs (use firecrawl_scrape); multiple files at once (call parse multiple times); documents that require interactive actions, screenshots, or change tracking — those aren't supported by the parse endpoint.
+ **Common mistakes:** Passing a URL instead of a local file path; requesting an unsupported format (screenshot, branding, changeTracking); setting waitFor, location, mobile, or a non-basic/auto proxy — parse uploads reject all of those.
+
+ **Supported file types:** .html, .htm, .xhtml, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
+ **Unsupported options:** actions, screenshot/branding/changeTracking formats, waitFor > 0, location, mobile, proxy values other than "auto" or "basic".
+
+ **CRITICAL - Format Selection (same rules as firecrawl_scrape):**
+ When the user asks for SPECIFIC data points from a document, you MUST use JSON format with a schema. Only use markdown when the user needs the ENTIRE document content.
+
+ **Use JSON format when the user asks for:**
+ - Specific fields, parameters, or values from a form / PDF / spreadsheet
+ - Prices, numbers, or other structured data
+ - Lists of items or properties
+
+ **Use markdown format when:**
+ - User wants to read, summarize, or analyze the full document
+ - User explicitly asks for the complete content
+
+ **Handling PDFs:**
+ Add \`"parsers": ["pdf"]\` (optionally with \`pdfOptions.maxPages\`) when parsing a PDF so the PDF engine is invoked explicitly. For very long documents, cap \`maxPages\` to keep the response within token limits.
+
+ **Usage Example (markdown from a local PDF):**
+ \`\`\`json
+ {
+   "name": "firecrawl_parse",
+   "arguments": {
+     "filePath": "/absolute/path/to/document.pdf",
+     "formats": ["markdown"],
+     "parsers": ["pdf"],
+     "onlyMainContent": true
+   }
+ }
+ \`\`\`
+
+ **Usage Example (structured JSON extraction from a local HTML file):**
+ \`\`\`json
+ {
+   "name": "firecrawl_parse",
+   "arguments": {
+     "filePath": "./invoice.html",
+     "formats": ["json"],
+     "jsonOptions": {
+       "prompt": "Extract the invoice number, total, and line items",
+       "schema": {
+         "type": "object",
+         "properties": {
+           "invoiceNumber": { "type": "string" },
+           "total": { "type": "number" },
+           "lineItems": {
+             "type": "array",
+             "items": {
+               "type": "object",
+               "properties": {
+                 "description": { "type": "string" },
+                 "amount": { "type": "number" }
+               }
+             }
+           }
+         }
+       }
+     }
+   }
+ }
+ \`\`\`
+ **Returns:** A parsed document with markdown, html, links, summary, json, or query results depending on the requested formats.
+ `,
+         parameters: parseParamsSchema,
+         execute: async (args, { session, log }) => {
+             const apiUrl = process.env.FIRECRAWL_API_URL;
+             if (!apiUrl) {
+                 throw new Error('firecrawl_parse requires FIRECRAWL_API_URL to be set to a self-hosted Firecrawl API instance.');
+             }
+             const { filePath, contentType: overrideContentType, ...options } = args;
+             const absPath = path.resolve(filePath);
+             const buffer = await readFile(absPath);
+             const filename = path.basename(absPath);
+             const fileContentType = overrideContentType && overrideContentType.length > 0
+                 ? overrideContentType
+                 : inferContentType(filename);
+             const transformed = transformScrapeParams(options);
+             const cleaned = removeEmptyTopLevel(transformed);
+             const optionsPayload = { origin: ORIGIN, ...cleaned };
+             const form = new FormData();
+             const blob = new Blob([new Uint8Array(buffer)], { type: fileContentType });
+             form.append('file', blob, filename);
+             form.append('options', JSON.stringify(optionsPayload));
+             const headers = {};
+             const apiKey = session?.firecrawlApiKey;
+             if (apiKey) {
+                 headers['Authorization'] = `Bearer ${apiKey}`;
+             }
+             const endpoint = `${apiUrl.replace(/\/$/, '')}/v2/parse`;
+             log.info('Parsing local file', {
+                 endpoint,
+                 filename,
+                 size: buffer.length,
+             });
+             const response = await fetch(endpoint, {
+                 method: 'POST',
+                 headers,
+                 body: form,
+             });
+             const responseText = await response.text();
+             if (!response.ok) {
+                 throw new Error(`Parse request failed with status ${response.status}: ${responseText}`);
+             }
+             try {
+                 return asText(JSON.parse(responseText));
+             }
+             catch {
+                 return responseText;
+             }
+         },
+     });
+ }
  const PORT = Number(process.env.PORT || 3000);
  const HOST = process.env.CLOUD_SERVICE === 'true'
      ? '0.0.0.0'