npm - firecrawl-mcp - Versions diffs - 3.20.1 → 3.20.3 - Mend

firecrawl-mcp 3.20.1 → 3.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -428,6 +428,7 @@ Scrape content from a single URL with advanced options.
 ```
 **Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
+**Privacy:** Set `redactPII: true` to return content with personally identifiable information redacted.
 **Returns:**
@@ -565,7 +566,8 @@ Search the web and optionally extract content from search results.
     "country": "us",
     "scrapeOptions": {
       "formats": ["markdown"],
-      "onlyMainContent": true
+      "onlyMainContent": true,
+      "redactPII": true
     }
   }
 }

package/dist/index.js CHANGED

@@ -6,7 +6,30 @@ import { readFile } from 'node:fs/promises';
 import path from 'node:path';
 import { z } from 'zod';
 import { registerMonitorTools } from './monitor.js';
+import { registerResearchTools } from './research.js';
 dotenv.config({ debug: false, quiet: true });
+/**
+ * Decide whether the research tools should be visible for a session.
+ * Local/stdio/self-hosted: gated by `FIRECRAWL_RESEARCH=true`.
+ * Remote (HTTP): additionally enabled by a `?research=true` query param on the
+ * incoming MCP request URL.
+ *
+ * Both switches are compared against the exact string `'true'`; any other
+ * value (for example `'1'` or `'TRUE'`) leaves research disabled.
+ *
+ * @param {{ url?: string } | undefined} request - Incoming request. May be
+ *   undefined, in which case only the environment variable is consulted.
+ * @returns {boolean} `true` when either switch enables research, else `false`.
+ */
+function isResearchEnabled(request) {
+    if (process.env.FIRECRAWL_RESEARCH === 'true')
+        return true;
+    const url = request?.url; // optional chaining tolerates a missing request object
+    if (url) {
+        try { // the base argument lets a relative URL (path + query) parse; only the query string is read
+            const research = new URL(url, 'http://localhost').searchParams.get('research');
+            if (research === 'true')
+                return true;
+        }
+        catch {
+            // malformed URL — fall through to disabled
+        }
+    }
+    return false;
+}
 function normalizeHeader(value) {
     if (value == null)
         return undefined;
@@ -187,6 +210,7 @@ const server = new FastMCP({
         protectedResourceMetadataUrl: getOAuthProtectedResourceMetadataUrl(),
     },
     authenticate: async (request) => {
+        const research = isResearchEnabled(request);
         // FastMCP invokes `authenticate(undefined)` for the stdio transport
         // because there is no HTTP request context. Without this null guard,
         // accessing `request.headers` throws a TypeError, FastMCP silently
@@ -199,9 +223,9 @@ const server = new FastMCP({
         const envCred = resolveCredentialFromEnv();
         if (process.env.CLOUD_SERVICE === 'true') {
             if (!headerCred) {
-                throw new Error('Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_…) or API key (x-firecrawl-api-key)');
+                throw new Error('Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_...) or API key (x-firecrawl-api-key)');
             }
-            return { firecrawlApiKey: headerCred };
+            return { firecrawlApiKey: headerCred, research };
         }
         const credential = headerCred ?? envCred;
         // Self-hosted / stdio / HTTP streamable — headers supply MCP OAuth token when present
@@ -213,10 +237,10 @@ const server = new FastMCP({
             process.exit(1);
         }
         if (httpStreaming && !credential && !process.env.FIRECRAWL_API_URL) {
-            console.error('HTTP MCP transport requires FIRECRAWL_API_URL and/or credentials (OAuth: Authorization Bearer fco_…, or FIRECRAWL_API_KEY / FIRECRAWL_OAUTH_TOKEN)');
+            console.error('HTTP MCP transport requires FIRECRAWL_API_URL and/or credentials (OAuth: Authorization Bearer fco_..., or FIRECRAWL_API_KEY / FIRECRAWL_OAUTH_TOKEN)');
             process.exit(1);
         }
-        return { firecrawlApiKey: credential };
+        return { firecrawlApiKey: credential, research };
     },
     // Lightweight health endpoint for LB checks
     health: {
@@ -380,6 +404,7 @@ const scrapeParamsSchema = z.object({
     })
         .optional(),
     onlyMainContent: z.boolean().optional(),
+    redactPII: z.boolean().optional(),
     includeTags: z.array(z.string()).optional(),
     excludeTags: z.array(z.string()).optional(),
     waitFor: z.number().optional(),
@@ -525,6 +550,7 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
 **Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
 **Performance:** Add maxAge parameter for 500% faster scrapes using cached data.
 **Lockdown mode:** Set \`lockdown: true\` to serve the request only from the existing index/cache without any outbound network request. For air-gapped or compliance-constrained use where the request URL itself is considered sensitive. Errors on cache miss. Billed at 5 credits.
+**Privacy:** Set \`redactPII: true\` to return content with personally identifiable information redacted.
 **Returns:** JSON structured data, markdown, branding profile, or other formats as specified.
 ${SAFE_MODE
         ? '**Safe Mode:** Read-only content extraction. Interactive actions (click, write, executeJavascript) are disabled for security.'
@@ -1361,6 +1387,7 @@ if (process.env.CLOUD_SERVICE !== 'true') {
         })
             .optional(),
         onlyMainContent: z.boolean().optional(),
+        redactPII: z.boolean().optional(),
         includeTags: z.array(z.string()).optional(),
         excludeTags: z.array(z.string()).optional(),
         removeBase64Images: z.boolean().optional(),
@@ -1402,6 +1429,7 @@ This is the fastest and most reliable way to extract content from a document on
 **Supported file types:** .html, .htm, .xhtml, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
 **Unsupported options:** actions, screenshot/branding/changeTracking formats, waitFor > 0, location, mobile, proxy values other than "auto" or "basic".
+**Privacy:** Set \`redactPII: true\` to return content with personally identifiable information redacted.
 **CRITICAL - Format Selection (same rules as firecrawl_scrape):**
 When the user asks for SPECIFIC data points from a document, you MUST use JSON format with a schema. Only use markdown when the user needs the ENTIRE document content.
@@ -1538,4 +1566,18 @@ else {
     };
 }
 registerMonitorTools(server);
+// Research tools gating. FastMCP's `canAccess` is only honored on the HTTP
+// transport (the stdio path exposes every registered tool regardless), so we
+// split the two cases:
+//   - HTTP (cloud / SSE_LOCAL / HTTP_STREAMABLE_SERVER): always register; each
+//     tool's `canAccess` hides it unless the session has research enabled
+//     (`FIRECRAWL_RESEARCH=true` env or `?research=true` on the request).
+//   - stdio (local): register only when `FIRECRAWL_RESEARCH=true`, since
+//     `canAccess` cannot hide them there.
+const isHttpTransport = process.env.CLOUD_SERVICE === 'true' || // each flag must be the exact string 'true'
+    process.env.SSE_LOCAL === 'true' ||
+    process.env.HTTP_STREAMABLE_SERVER === 'true';
+if (isHttpTransport || process.env.FIRECRAWL_RESEARCH === 'true') { // registration happens once, before the server starts
+    registerResearchTools(server, getClient);
+}
 await server.start(args);

package/dist/research.js ADDED

@@ -0,0 +1,193 @@
+/**
+ * Firecrawl Research tools (experimental).
+ *
+ * Thin MCP wrappers over the `/v2/research/*` endpoints (arXiv papers + GitHub
+ * history/readmes). These tools are hidden unless research is enabled for the
+ * session — locally via `FIRECRAWL_RESEARCH=true`, or remotely via the
+ * `?research=true` query param on the MCP endpoint (see `isResearchEnabled` in
+ * index.ts, which sets `session.research`).
+ *
+ * The installed `@mendable/firecrawl-js` predates the SDK's `research` client,
+ * so we call the endpoints directly through the SDK's HTTP layer (auth +
+ * retries) via `client.http.get(...)`, mirroring how the search tool reaches
+ * `/v2/search`.
+ */
+import { z } from 'zod';
+const BASE = '/v2/research';
+function asText(data) { // serialise a response payload into the string each tool returns
+    return JSON.stringify(data, null, 2); // pretty-printed with 2-space indentation
+}
+/**
+ * Append a value (or repeated array values) to a URLSearchParams instance.
+ *
+ * A `null`/`undefined` value is skipped entirely. Array elements are appended
+ * one by one under the same key, dropping elements that are nullish or that
+ * stringify to an empty string. Any other value is appended via
+ * `String(value)` unconditionally, so an empty string, `false` or `0` is sent.
+ *
+ * @param {URLSearchParams} params - Query-string builder; mutated in place.
+ * @param {string} key - Parameter name to append under.
+ * @param {*} value - Scalar, array of scalars, or nullish.
+ * @returns {void}
+ */
+function appendParam(params, key, value) {
+    if (value == null)
+        return;
+    if (Array.isArray(value)) {
+        for (const v of value) {
+            if (v != null && String(v).length > 0) // skip nullish and empty-string elements
+                params.append(key, String(v));
+        }
+    }
+    else {
+        params.append(key, String(value)); // scalars are not filtered for emptiness
+    }
+}
+function withQuery(path, params) { // join `path` with the serialised `params` (a URLSearchParams) into one request path
+    const qs = params.toString();
+    return qs ? `${path}?${qs}` : path; // no trailing `?` when there are no parameters
+}
+/**
+ * Only present these tools when the session has research enabled.
+ *
+ * Strict check: the session's `research` property must be exactly `true`. A
+ * missing session, a missing property, or any other value yields `false`.
+ *
+ * @param {{ research?: boolean } | undefined} session - Session object handed
+ *   in by the server; only its `research` property is read here.
+ * @returns {boolean} Whether the tool should be exposed for this session.
+ */
+const canAccess = (session) => session?.research === true;
+export function registerResearchTools(server, getClient) { // adds four research tools to `server`; `getClient(session)` supplies the client used for each request
+    // --- search_papers: GET /v2/research/papers ---
+    server.addTool({
+        name: 'firecrawl_research_search_papers',
+        canAccess,
+        annotations: {
+            title: 'Search arXiv papers',
+            readOnlyHint: true,
+            openWorldHint: true,
+        },
+        description: 'Primary entry point for finding arXiv papers by topic. Semantic (HyDE) search over arXiv ' +
+            'abstracts; returns ranked papers with arXiv id, title, and abstract. The query should be a ' +
+            'natural-language description of what you want. Run SEVERAL distinct framings of the question ' +
+            '(sibling domains, rival methods, dataset/benchmark names) rather than one query — recall ' +
+            'improves markedly with diverse framings. Returns up to `k` results (default 40).',
+        parameters: z.object({
+            query: z.string().min(1),
+            k: z.number().int().min(1).max(500).optional(),
+            authors: z
+                .array(z.string())
+                .optional()
+                .describe('Author substring filter(s); ALL must match (case-insensitive).'),
+            categories: z
+                .array(z.string())
+                .optional()
+                .describe('arXiv category filter(s) (e.g. `cs.LG`); ALL must match.'),
+            from: z
+                .string()
+                .optional()
+                .describe('Inclusive lower bound on created/updated date (`YYYY-MM-DD`).'),
+            to: z
+                .string()
+                .optional()
+                .describe('Inclusive upper bound on created/updated date (`YYYY-MM-DD`).'),
+        }),
+        execute: async (args, { session }) => {
+            const { query, k, authors, categories, from, to } = args;
+            const params = new URLSearchParams();
+            appendParam(params, 'query', query);
+            appendParam(params, 'k', k); // omitted from the query string when not provided
+            appendParam(params, 'authors', authors); // one repeated `authors` entry per array element
+            appendParam(params, 'categories', categories); // one repeated `categories` entry per array element
+            appendParam(params, 'from', from); // NOTE(review): the schema accepts any string; the date format is not validated here
+            appendParam(params, 'to', to);
+            const client = getClient(session);
+            const res = await client.http.get(withQuery(`${BASE}/papers`, params));
+            return asText(res.data); // the response's `data` is returned as pretty-printed JSON text
+        },
+    });
+    // --- related_papers: GET /v2/research/papers/{primary seed}/similar ---
+    server.addTool({
+        name: 'firecrawl_research_related_papers',
+        canAccess,
+        annotations: {
+            title: 'Find related arXiv papers',
+            readOnlyHint: true,
+            openWorldHint: true,
+        },
+        description: 'Expand from anchor papers you have already found, via the citation graph, ranked and filtered ' +
+            'to a natural-language `intent`. Pass arXiv ids of your strongest hits as `seed_ids`. Modes: ' +
+            '`similar` (cocitation/coupling — papers in the same niche; the default), `citers` (papers ' +
+            'that cite the anchors), `references` (papers the anchors cite). This reaches relevant papers ' +
+            'that plain search misses, so use it on your best hits before finishing. A `similar` call ' +
+            'already runs a DEEP multi-round expansion internally (re-seeding from each round’s best ' +
+            'finds), so one call reaches the wider neighborhood — no need to chain many. Returns the ' +
+            'candidates plus the pool size.',
+        parameters: z.object({
+            seed_ids: z.array(z.string()).min(1).max(10),
+            intent: z.string().min(1),
+            mode: z.enum(['similar', 'citers', 'references']).optional(),
+            k: z.number().int().min(1).max(500).optional(),
+            rerank: z
+                .boolean()
+                .optional()
+                .describe('Apply an additional rerank over the fused candidates.'),
+        }),
+        execute: async (args, { session }) => {
+            const { seed_ids, intent, mode, k, rerank } = args;
+            // The endpoint takes a single primary seed in the path; any additional
+            // seeds ride along as repeated `anchor` params.
+            const [primary, ...anchors] = seed_ids; // the schema requires at least one seed, so `primary` is defined for validated input
+            const params = new URLSearchParams();
+            appendParam(params, 'intent', intent);
+            appendParam(params, 'mode', mode); // NOTE(review): `mode` travels as a query param while the path below always ends in `/similar` — confirm the endpoint dispatches on it
+            appendParam(params, 'k', k);
+            if (rerank != null) // guard is redundant: appendParam already skips nullish values
+                appendParam(params, 'rerank', rerank); // a boolean is sent as the string `true` or `false`
+            appendParam(params, 'anchor', anchors); // an empty `anchors` array adds nothing
+            const client = getClient(session);
+            const res = await client.http.get(withQuery(`${BASE}/papers/${encodeURIComponent(primary)}/similar`, params));
+            return asText(res.data);
+        },
+    });
+    // --- read_paper: GET /v2/research/papers/{arxiv_id} ---
+    server.addTool({
+        name: 'firecrawl_research_read_paper',
+        canAccess,
+        annotations: {
+            title: 'Read an arXiv paper',
+            readOnlyHint: true,
+            openWorldHint: true,
+        },
+        description: 'Read the most relevant in-body (full-text) passages of ONE specific paper for a question. Use ' +
+            'this to VERIFY whether a candidate actually satisfies a constraint before you include or ' +
+            "reject it (e.g. 'does this paper actually use technique X / report a score on benchmark Y'). " +
+            "Returns the best-matching passages, or a notice if the paper's full text is unavailable.",
+        parameters: z.object({
+            arxiv_id: z.string().min(1),
+            question: z.string().min(1),
+            k: z
+                .number()
+                .int()
+                .min(1)
+                .max(50)
+                .optional()
+                .describe('Number of passages to return (default 4).'),
+        }),
+        execute: async (args, { session }) => {
+            const { arxiv_id, question, k } = args;
+            const params = new URLSearchParams();
+            appendParam(params, 'query', question); // the tool's `question` argument is sent as the `query` parameter
+            appendParam(params, 'k', k);
+            const client = getClient(session);
+            const res = await client.http.get(withQuery(`${BASE}/papers/${encodeURIComponent(arxiv_id)}`, params)); // the id is percent-encoded, so a `/` inside it becomes `%2F`
+            return asText(res.data);
+        },
+    });
+    // --- search_github: GET /v2/research/github ---
+    // TODO: description pending — the user is writing this one.
+    server.addTool({
+        name: 'firecrawl_research_search_github',
+        canAccess,
+        annotations: {
+            title: 'Search GitHub history',
+            readOnlyHint: true,
+            openWorldHint: true,
+        },
+        description: 'Search GitHub issue/PR history and repository readmes. Returns ranked matches with repo, ' +
+            'url, a short snippet, and (when available) the full matched content in markdown.',
+        parameters: z.object({
+            query: z.string().min(1),
+            k: z.number().int().min(1).max(100).optional(),
+        }),
+        execute: async (args, { session }) => {
+            const { query, k } = args;
+            const params = new URLSearchParams();
+            appendParam(params, 'query', query);
+            appendParam(params, 'k', k);
+            const client = getClient(session);
+            const res = await client.http.get(withQuery(`${BASE}/github`, params));
+            return asText(res.data);
+        },
+    });
+}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "firecrawl-mcp",
-  "version": "3.20.1",
+  "version": "3.20.3",
   "description": "MCP server for Firecrawl — search, scrape, and interact with the web. Supports both cloud and self-hosted instances. Features include web search, scraping, page interaction, batch processing, and LLM-powered content analysis.",
   "type": "module",
   "mcpName": "io.github.firecrawl/firecrawl-mcp-server",
@@ -15,7 +15,7 @@
   },
   "license": "MIT",
   "dependencies": {
-    "@mendable/firecrawl-js": "4.24.0",
+    "@mendable/firecrawl-js": "4.25.2",
     "dotenv": "^17.2.2",
     "firecrawl-fastmcp": "^1.0.5",
     "typescript": "^5.9.2",