firecrawl-mcp 3.20.1 → 3.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/dist/index.js +46 -4
- package/dist/research.js +193 -0
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -428,6 +428,7 @@ Scrape content from a single URL with advanced options.
|
|
|
428
428
|
```
|
|
429
429
|
|
|
430
430
|
**Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
|
|
431
|
+
**Privacy:** Set `redactPII: true` to return content with personally identifiable information redacted.
|
|
431
432
|
|
|
432
433
|
**Returns:**
|
|
433
434
|
|
|
@@ -565,7 +566,8 @@ Search the web and optionally extract content from search results.
|
|
|
565
566
|
"country": "us",
|
|
566
567
|
"scrapeOptions": {
|
|
567
568
|
"formats": ["markdown"],
|
|
568
|
-
"onlyMainContent": true
|
|
569
|
+
"onlyMainContent": true,
|
|
570
|
+
"redactPII": true
|
|
569
571
|
}
|
|
570
572
|
}
|
|
571
573
|
}
|
package/dist/index.js
CHANGED
|
@@ -6,7 +6,30 @@ import { readFile } from 'node:fs/promises';
|
|
|
6
6
|
import path from 'node:path';
|
|
7
7
|
import { z } from 'zod';
|
|
8
8
|
import { registerMonitorTools } from './monitor.js';
|
|
9
|
+
import { registerResearchTools } from './research.js';
|
|
9
10
|
dotenv.config({ debug: false, quiet: true });
|
|
11
|
+
/**
 * Report whether the research tools should be visible for a session.
 *
 * Local / stdio / self-hosted deployments opt in with the
 * `FIRECRAWL_RESEARCH=true` environment variable. Remote (HTTP) sessions can
 * additionally opt in per request with a `?research=true` query parameter on
 * the incoming MCP request URL.
 *
 * @param {{ url?: string } | null | undefined} request - Incoming request, if
 *   there is one; only its `url` is read.
 * @returns {boolean} `true` when research is enabled, otherwise `false`.
 */
function isResearchEnabled(request) {
    const enabledByEnv = process.env.FIRECRAWL_RESEARCH === 'true';
    if (enabledByEnv) {
        return true;
    }
    const requestUrl = request?.url;
    if (!requestUrl) {
        return false;
    }
    let flag;
    try {
        // The base is only there so that a relative URL (path + query) parses.
        flag = new URL(requestUrl, 'http://localhost').searchParams.get('research');
    }
    catch {
        // Malformed URL: treat research as disabled.
        return false;
    }
    return flag === 'true';
}
|
|
10
33
|
function normalizeHeader(value) {
|
|
11
34
|
if (value == null)
|
|
12
35
|
return undefined;
|
|
@@ -187,6 +210,7 @@ const server = new FastMCP({
|
|
|
187
210
|
protectedResourceMetadataUrl: getOAuthProtectedResourceMetadataUrl(),
|
|
188
211
|
},
|
|
189
212
|
authenticate: async (request) => {
|
|
213
|
+
const research = isResearchEnabled(request);
|
|
190
214
|
// FastMCP invokes `authenticate(undefined)` for the stdio transport
|
|
191
215
|
// because there is no HTTP request context. Without this null guard,
|
|
192
216
|
// accessing `request.headers` throws a TypeError, FastMCP silently
|
|
@@ -199,9 +223,9 @@ const server = new FastMCP({
|
|
|
199
223
|
const envCred = resolveCredentialFromEnv();
|
|
200
224
|
if (process.env.CLOUD_SERVICE === 'true') {
|
|
201
225
|
if (!headerCred) {
|
|
202
|
-
throw new Error('Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_
|
|
226
|
+
throw new Error('Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_...) or API key (x-firecrawl-api-key)');
|
|
203
227
|
}
|
|
204
|
-
return { firecrawlApiKey: headerCred };
|
|
228
|
+
return { firecrawlApiKey: headerCred, research };
|
|
205
229
|
}
|
|
206
230
|
const credential = headerCred ?? envCred;
|
|
207
231
|
// Self-hosted / stdio / HTTP streamable — headers supply MCP OAuth token when present
|
|
@@ -213,10 +237,10 @@ const server = new FastMCP({
|
|
|
213
237
|
process.exit(1);
|
|
214
238
|
}
|
|
215
239
|
if (httpStreaming && !credential && !process.env.FIRECRAWL_API_URL) {
|
|
216
|
-
console.error('HTTP MCP transport requires FIRECRAWL_API_URL and/or credentials (OAuth: Authorization Bearer fco_
|
|
240
|
+
console.error('HTTP MCP transport requires FIRECRAWL_API_URL and/or credentials (OAuth: Authorization Bearer fco_..., or FIRECRAWL_API_KEY / FIRECRAWL_OAUTH_TOKEN)');
|
|
217
241
|
process.exit(1);
|
|
218
242
|
}
|
|
219
|
-
return { firecrawlApiKey: credential };
|
|
243
|
+
return { firecrawlApiKey: credential, research };
|
|
220
244
|
},
|
|
221
245
|
// Lightweight health endpoint for LB checks
|
|
222
246
|
health: {
|
|
@@ -380,6 +404,7 @@ const scrapeParamsSchema = z.object({
|
|
|
380
404
|
})
|
|
381
405
|
.optional(),
|
|
382
406
|
onlyMainContent: z.boolean().optional(),
|
|
407
|
+
redactPII: z.boolean().optional(),
|
|
383
408
|
includeTags: z.array(z.string()).optional(),
|
|
384
409
|
excludeTags: z.array(z.string()).optional(),
|
|
385
410
|
waitFor: z.number().optional(),
|
|
@@ -525,6 +550,7 @@ If JSON extraction returns empty, minimal, or just navigation content, the page
|
|
|
525
550
|
**Branding format:** Extracts comprehensive brand identity (colors, fonts, typography, spacing, logo, UI components) for design analysis or style replication.
|
|
526
551
|
**Performance:** Add maxAge parameter for 500% faster scrapes using cached data.
|
|
527
552
|
**Lockdown mode:** Set \`lockdown: true\` to serve the request only from the existing index/cache without any outbound network request. For air-gapped or compliance-constrained use where the request URL itself is considered sensitive. Errors on cache miss. Billed at 5 credits.
|
|
553
|
+
**Privacy:** Set \`redactPII: true\` to return content with personally identifiable information redacted.
|
|
528
554
|
**Returns:** JSON structured data, markdown, branding profile, or other formats as specified.
|
|
529
555
|
${SAFE_MODE
|
|
530
556
|
? '**Safe Mode:** Read-only content extraction. Interactive actions (click, write, executeJavascript) are disabled for security.'
|
|
@@ -1361,6 +1387,7 @@ if (process.env.CLOUD_SERVICE !== 'true') {
|
|
|
1361
1387
|
})
|
|
1362
1388
|
.optional(),
|
|
1363
1389
|
onlyMainContent: z.boolean().optional(),
|
|
1390
|
+
redactPII: z.boolean().optional(),
|
|
1364
1391
|
includeTags: z.array(z.string()).optional(),
|
|
1365
1392
|
excludeTags: z.array(z.string()).optional(),
|
|
1366
1393
|
removeBase64Images: z.boolean().optional(),
|
|
@@ -1402,6 +1429,7 @@ This is the fastest and most reliable way to extract content from a document on
|
|
|
1402
1429
|
|
|
1403
1430
|
**Supported file types:** .html, .htm, .xhtml, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
|
|
1404
1431
|
**Unsupported options:** actions, screenshot/branding/changeTracking formats, waitFor > 0, location, mobile, proxy values other than "auto" or "basic".
|
|
1432
|
+
**Privacy:** Set \`redactPII: true\` to return content with personally identifiable information redacted.
|
|
1405
1433
|
|
|
1406
1434
|
**CRITICAL - Format Selection (same rules as firecrawl_scrape):**
|
|
1407
1435
|
When the user asks for SPECIFIC data points from a document, you MUST use JSON format with a schema. Only use markdown when the user needs the ENTIRE document content.
|
|
@@ -1538,4 +1566,18 @@ else {
|
|
|
1538
1566
|
};
|
|
1539
1567
|
}
|
|
1540
1568
|
registerMonitorTools(server);
|
|
1569
|
+
// Research tools gating. FastMCP honors a tool's `canAccess` only on the HTTP
// transport; the stdio path exposes every registered tool regardless. The two
// cases are therefore handled differently:
//   - HTTP (cloud / SSE_LOCAL / HTTP_STREAMABLE_SERVER): always register, and
//     let each tool's `canAccess` hide it unless the session has research
//     enabled (`FIRECRAWL_RESEARCH=true` env or `?research=true` on the request).
//   - stdio (local): register only when `FIRECRAWL_RESEARCH=true`, because
//     `canAccess` cannot hide the tools there.
const isHttpTransport = ['CLOUD_SERVICE', 'SSE_LOCAL', 'HTTP_STREAMABLE_SERVER']
    .some((envName) => process.env[envName] === 'true');
if (process.env.FIRECRAWL_RESEARCH === 'true' || isHttpTransport) {
    registerResearchTools(server, getClient);
}
|
|
1541
1583
|
await server.start(args);
|
package/dist/research.js
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Firecrawl Research tools (experimental).
|
|
3
|
+
*
|
|
4
|
+
* Thin MCP wrappers over the `/v2/research/*` endpoints (arXiv papers + GitHub
|
|
5
|
+
* history/readmes). These tools are hidden unless research is enabled for the
|
|
6
|
+
* session — locally via `FIRECRAWL_RESEARCH=true`, or remotely via the
|
|
7
|
+
* `?research=true` query param on the MCP endpoint (see `isResearchEnabled` in
|
|
8
|
+
* index.ts, which sets `session.research`).
|
|
9
|
+
*
|
|
10
|
+
* The installed `@mendable/firecrawl-js` predates the SDK's `research` client,
|
|
11
|
+
* so we call the endpoints directly through the SDK's HTTP layer (auth +
|
|
12
|
+
* retries) via `client.http.get(...)`, mirroring how the search tool reaches
|
|
13
|
+
* `/v2/search`.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from 'zod';
|
|
16
|
+
const BASE = '/v2/research';
|
|
17
|
+
/**
 * Render a response payload as pretty-printed JSON with a two-space indent.
 *
 * @param {unknown} data - Value to serialize.
 * @returns {string} Whatever `JSON.stringify(data, null, 2)` produces.
 */
function asText(data) {
    const indentWidth = 2;
    const noReplacer = null;
    return JSON.stringify(data, noReplacer, indentWidth);
}
|
|
20
|
+
/**
 * Append a query parameter (or one entry per array element) to a
 * URLSearchParams instance.
 *
 * - `null` / `undefined` values add nothing.
 * - Array values add one `key=value` entry per element, skipping elements that
 *   are nullish or stringify to the empty string.
 * - Scalar values are stringified with `String()`. A scalar that stringifies
 *   to the empty string is skipped as well, so an optional string argument
 *   supplied as `""` does not turn into a bare `key=` in the query, matching
 *   how empty array elements are treated.
 *
 * @param {URLSearchParams} params - Instance to append to (mutated in place).
 * @param {string} key - Query parameter name.
 * @param {unknown} value - Scalar, array of scalars, or nullish.
 * @returns {void}
 */
function appendParam(params, key, value) {
    const candidates = Array.isArray(value) ? value : [value];
    for (const candidate of candidates) {
        if (candidate == null) {
            continue;
        }
        const text = String(candidate);
        if (text.length > 0) {
            params.append(key, text);
        }
    }
}
|
|
34
|
+
/**
 * Attach a serialized query string to a path.
 *
 * @param {string} path - Endpoint path without a query string.
 * @param {URLSearchParams} params - Parameters to serialize with `toString()`.
 * @returns {string} `path?query` when the parameters serialize to something
 *   non-empty, otherwise `path` unchanged.
 */
function withQuery(path, params) {
    const serialized = params.toString();
    if (!serialized) {
        return path;
    }
    return `${path}?${serialized}`;
}
|
|
38
|
+
/**
 * Visibility predicate shared by the research tools: a tool is presented only
 * when the session's `research` property is strictly `true`.
 *
 * @param {{ research?: unknown } | null | undefined} session - Session object, if any.
 * @returns {boolean} `true` only when `session.research === true`.
 */
const canAccess = (session) => {
    const researchFlag = session?.research;
    return researchFlag === true;
};
|
|
40
|
+
/**
 * Register the research tools on an MCP server.
 *
 * Four tools are added, each guarded by `canAccess` and each implemented as a
 * GET against a path under `BASE`, issued through `client.http.get(...)` on
 * the client returned by `getClient(session)`:
 *
 *   - `firecrawl_research_search_papers`  -> `${BASE}/papers`
 *   - `firecrawl_research_related_papers` -> `${BASE}/papers/{id}/similar`
 *   - `firecrawl_research_read_paper`     -> `${BASE}/papers/{id}`
 *   - `firecrawl_research_search_github`  -> `${BASE}/github`
 *
 * Every tool returns the `data` of the response serialized with `asText`.
 *
 * @param {object} server - Server exposing `addTool(definition)`.
 * @param {(session: unknown) => object} getClient - Resolves the client for a
 *   session; the result must expose `http.get(path)`.
 * @returns {void}
 */
export function registerResearchTools(server, getClient) {
    /** GET `path` with `params` for the session and return the payload as text. */
    const fetchAsText = async (session, path, params) => {
        const client = getClient(session);
        const res = await client.http.get(withQuery(path, params));
        return asText(res.data);
    };
    // --- search_papers ---
    server.addTool({
        name: 'firecrawl_research_search_papers',
        canAccess,
        annotations: {
            title: 'Search arXiv papers',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Primary entry point for finding arXiv papers by topic. Semantic (HyDE) search over arXiv ' +
            'abstracts; returns ranked papers with arXiv id, title, and abstract. The query should be a ' +
            'natural-language description of what you want. Run SEVERAL distinct framings of the question ' +
            '(sibling domains, rival methods, dataset/benchmark names) rather than one query — recall ' +
            'improves markedly with diverse framings. Returns up to `k` results (default 40).',
        parameters: z.object({
            query: z.string().min(1),
            k: z.number().int().min(1).max(500).optional(),
            authors: z
                .array(z.string())
                .optional()
                .describe('Author substring filter(s); ALL must match (case-insensitive).'),
            categories: z
                .array(z.string())
                .optional()
                .describe('arXiv category filter(s) (e.g. `cs.LG`); ALL must match.'),
            from: z
                .string()
                .optional()
                .describe('Inclusive lower bound on created/updated date (`YYYY-MM-DD`).'),
            to: z
                .string()
                .optional()
                .describe('Inclusive upper bound on created/updated date (`YYYY-MM-DD`).'),
        }),
        execute: async (args, { session }) => {
            const { query, k, authors, categories, from, to } = args;
            const params = new URLSearchParams();
            appendParam(params, 'query', query);
            appendParam(params, 'k', k);
            appendParam(params, 'authors', authors);
            appendParam(params, 'categories', categories);
            appendParam(params, 'from', from);
            appendParam(params, 'to', to);
            return fetchAsText(session, `${BASE}/papers`, params);
        },
    });
    // --- related_papers ---
    server.addTool({
        name: 'firecrawl_research_related_papers',
        canAccess,
        annotations: {
            title: 'Find related arXiv papers',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Expand from anchor papers you have already found, via the citation graph, ranked and filtered ' +
            'to a natural-language `intent`. Pass arXiv ids of your strongest hits as `seed_ids`. Modes: ' +
            '`similar` (cocitation/coupling — papers in the same niche; the default), `citers` (papers ' +
            'that cite the anchors), `references` (papers the anchors cite). This reaches relevant papers ' +
            'that plain search misses, so use it on your best hits before finishing. A `similar` call ' +
            'already runs a DEEP multi-round expansion internally (re-seeding from each round’s best ' +
            'finds), so one call reaches the wider neighborhood — no need to chain many. Returns the ' +
            'candidates plus the pool size.',
        parameters: z.object({
            seed_ids: z.array(z.string()).min(1).max(10),
            intent: z.string().min(1),
            mode: z.enum(['similar', 'citers', 'references']).optional(),
            k: z.number().int().min(1).max(500).optional(),
            rerank: z
                .boolean()
                .optional()
                .describe('Apply an additional rerank over the fused candidates.'),
        }),
        execute: async (args, { session }) => {
            const { seed_ids, intent, mode, k, rerank } = args;
            // The schema permits blank strings. Drop them before choosing the
            // primary seed so a blank first entry cannot produce the malformed
            // path `${BASE}/papers//similar`.
            const seeds = seed_ids.filter((id) => id.trim().length > 0);
            if (seeds.length === 0) {
                throw new Error('seed_ids must contain at least one non-empty arXiv id');
            }
            // The endpoint takes a single primary seed in the path; any
            // additional seeds ride along as repeated `anchor` params.
            const [primary, ...anchors] = seeds;
            const params = new URLSearchParams();
            appendParam(params, 'intent', intent);
            appendParam(params, 'mode', mode);
            appendParam(params, 'k', k);
            // appendParam ignores null/undefined, so no extra guard is needed.
            appendParam(params, 'rerank', rerank);
            appendParam(params, 'anchor', anchors);
            return fetchAsText(session, `${BASE}/papers/${encodeURIComponent(primary)}/similar`, params);
        },
    });
    // --- read_paper ---
    server.addTool({
        name: 'firecrawl_research_read_paper',
        canAccess,
        annotations: {
            title: 'Read an arXiv paper',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Read the most relevant in-body (full-text) passages of ONE specific paper for a question. Use ' +
            'this to VERIFY whether a candidate actually satisfies a constraint before you include or ' +
            "reject it (e.g. 'does this paper actually use technique X / report a score on benchmark Y'). " +
            "Returns the best-matching passages, or a notice if the paper's full text is unavailable.",
        parameters: z.object({
            arxiv_id: z.string().min(1),
            question: z.string().min(1),
            k: z
                .number()
                .int()
                .min(1)
                .max(50)
                .optional()
                .describe('Number of passages to return (default 4).'),
        }),
        execute: async (args, { session }) => {
            const { arxiv_id, question, k } = args;
            const params = new URLSearchParams();
            // The tool's `question` is sent as the endpoint's `query` parameter.
            appendParam(params, 'query', question);
            appendParam(params, 'k', k);
            return fetchAsText(session, `${BASE}/papers/${encodeURIComponent(arxiv_id)}`, params);
        },
    });
    // --- search_github ---
    // TODO: description pending — the user is writing this one.
    server.addTool({
        name: 'firecrawl_research_search_github',
        canAccess,
        annotations: {
            title: 'Search GitHub history',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Search GitHub issue/PR history and repository readmes. Returns ranked matches with repo, ' +
            'url, a short snippet, and (when available) the full matched content in markdown.',
        parameters: z.object({
            query: z.string().min(1),
            k: z.number().int().min(1).max(100).optional(),
        }),
        execute: async (args, { session }) => {
            const { query, k } = args;
            const params = new URLSearchParams();
            appendParam(params, 'query', query);
            appendParam(params, 'k', k);
            return fetchAsText(session, `${BASE}/github`, params);
        },
    });
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "firecrawl-mcp",
|
|
3
|
-
"version": "3.20.
|
|
3
|
+
"version": "3.20.3",
|
|
4
4
|
"description": "MCP server for Firecrawl — search, scrape, and interact with the web. Supports both cloud and self-hosted instances. Features include web search, scraping, page interaction, batch processing, and LLM-powered content analysis.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"mcpName": "io.github.firecrawl/firecrawl-mcp-server",
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
},
|
|
16
16
|
"license": "MIT",
|
|
17
17
|
"dependencies": {
|
|
18
|
-
"@mendable/firecrawl-js": "4.
|
|
18
|
+
"@mendable/firecrawl-js": "4.25.2",
|
|
19
19
|
"dotenv": "^17.2.2",
|
|
20
20
|
"firecrawl-fastmcp": "^1.0.5",
|
|
21
21
|
"typescript": "^5.9.2",
|