firecrawl-mcp 3.20.2 → 3.20.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +40 -2
- package/dist/research.js +291 -0
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -6,7 +6,30 @@ import { readFile } from 'node:fs/promises';
|
|
|
6
6
|
import path from 'node:path';
|
|
7
7
|
import { z } from 'zod';
|
|
8
8
|
import { registerMonitorTools } from './monitor.js';
|
|
9
|
+
import { registerResearchTools } from './research.js';
|
|
9
10
|
dotenv.config({ debug: false, quiet: true });
|
|
11
|
+
/**
 * Report whether the research tools should be exposed for a session.
 *
 * Two switches can turn them on:
 *   - the `FIRECRAWL_RESEARCH=true` environment variable (local / stdio /
 *     self-hosted), which wins regardless of the request;
 *   - a `?research=true` query parameter on the incoming MCP request URL
 *     (remote HTTP).
 *
 * @param {{ url?: string } | undefined} request - Incoming request, if any.
 * @returns {boolean} `true` when either switch is set to the exact string `true`.
 */
function isResearchEnabled(request) {
    if (process.env.FIRECRAWL_RESEARCH === 'true') {
        return true;
    }
    const rawUrl = request?.url;
    if (!rawUrl) {
        return false;
    }
    try {
        // The base only matters for relative request URLs such as `/mcp?...`.
        const { searchParams } = new URL(rawUrl, 'http://localhost');
        return searchParams.get('research') === 'true';
    }
    catch {
        // malformed URL — treat as disabled
        return false;
    }
}
|
|
10
33
|
function normalizeHeader(value) {
|
|
11
34
|
if (value == null)
|
|
12
35
|
return undefined;
|
|
@@ -187,6 +210,7 @@ const server = new FastMCP({
|
|
|
187
210
|
protectedResourceMetadataUrl: getOAuthProtectedResourceMetadataUrl(),
|
|
188
211
|
},
|
|
189
212
|
authenticate: async (request) => {
|
|
213
|
+
const research = isResearchEnabled(request);
|
|
190
214
|
// FastMCP invokes `authenticate(undefined)` for the stdio transport
|
|
191
215
|
// because there is no HTTP request context. Without this null guard,
|
|
192
216
|
// accessing `request.headers` throws a TypeError, FastMCP silently
|
|
@@ -201,7 +225,7 @@ const server = new FastMCP({
|
|
|
201
225
|
if (!headerCred) {
|
|
202
226
|
throw new Error('Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_...) or API key (x-firecrawl-api-key)');
|
|
203
227
|
}
|
|
204
|
-
return { firecrawlApiKey: headerCred };
|
|
228
|
+
return { firecrawlApiKey: headerCred, research };
|
|
205
229
|
}
|
|
206
230
|
const credential = headerCred ?? envCred;
|
|
207
231
|
// Self-hosted / stdio / HTTP streamable — headers supply MCP OAuth token when present
|
|
@@ -216,7 +240,7 @@ const server = new FastMCP({
|
|
|
216
240
|
console.error('HTTP MCP transport requires FIRECRAWL_API_URL and/or credentials (OAuth: Authorization Bearer fco_..., or FIRECRAWL_API_KEY / FIRECRAWL_OAUTH_TOKEN)');
|
|
217
241
|
process.exit(1);
|
|
218
242
|
}
|
|
219
|
-
return { firecrawlApiKey: credential };
|
|
243
|
+
return { firecrawlApiKey: credential, research };
|
|
220
244
|
},
|
|
221
245
|
// Lightweight health endpoint for LB checks
|
|
222
246
|
health: {
|
|
@@ -1542,4 +1566,18 @@ else {
|
|
|
1542
1566
|
};
|
|
1543
1567
|
}
|
|
1544
1568
|
registerMonitorTools(server);
|
|
1569
|
+
// Gate registration of the research tools by transport. FastMCP only honors a
// tool's `canAccess` on the HTTP transport; on stdio every registered tool is
// exposed regardless. Hence:
//   - HTTP (cloud / SSE_LOCAL / HTTP_STREAMABLE_SERVER): register
//     unconditionally and let each tool's `canAccess` hide it unless the
//     session has research enabled (`FIRECRAWL_RESEARCH=true` env or
//     `?research=true` on the request).
//   - stdio (local): register only when `FIRECRAWL_RESEARCH=true`, because
//     `canAccess` cannot hide the tools there.
const isHttpTransport = ['CLOUD_SERVICE', 'SSE_LOCAL', 'HTTP_STREAMABLE_SERVER'].some((flag) => process.env[flag] === 'true');
if (isHttpTransport || process.env.FIRECRAWL_RESEARCH === 'true') {
    registerResearchTools(server, getClient);
}
|
|
1545
1583
|
await server.start(args);
|
package/dist/research.js
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Firecrawl Research tools (experimental).
|
|
3
|
+
*
|
|
4
|
+
* Thin MCP wrappers over the `/v2/research/*` endpoints (arXiv papers + GitHub
|
|
5
|
+
* history/readmes). These tools are hidden unless research is enabled for the
|
|
6
|
+
* session — locally via `FIRECRAWL_RESEARCH=true`, or remotely via the
|
|
7
|
+
* `?research=true` query param on the MCP endpoint (see `isResearchEnabled` in
|
|
8
|
+
* index.ts, which sets `session.research`).
|
|
9
|
+
*
|
|
10
|
+
* The installed `@mendable/firecrawl-js` predates the SDK's `research` client,
|
|
11
|
+
* so we call the endpoints directly through the SDK's HTTP layer (auth +
|
|
12
|
+
* retries) via `client.http.get(...)`, mirroring how the search tool reaches
|
|
13
|
+
* `/v2/search`.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from 'zod';
|
|
16
|
+
const BASE = '/v2/research';
|
|
17
|
+
/**
 * Add `value` to `params` under `key`.
 *
 * Nullish values are ignored. An array contributes one repeated `key` entry
 * per element, skipping elements that are nullish or stringify to an empty
 * string. Any other value is stringified and appended as-is.
 *
 * @param {URLSearchParams} params - Query parameters being built (mutated).
 * @param {string} key - Parameter name.
 * @param {unknown} value - Scalar or array of scalars to append.
 * @returns {void}
 */
function appendParam(params, key, value) {
    if (value == null) {
        return;
    }
    if (!Array.isArray(value)) {
        params.append(key, String(value));
        return;
    }
    for (const item of value) {
        if (item == null) {
            continue;
        }
        const text = String(item);
        if (text.length > 0) {
            params.append(key, text);
        }
    }
}
|
|
31
|
+
/**
 * Join a path with a serialized query string.
 *
 * @param {string} path - Endpoint path.
 * @param {URLSearchParams} params - Query parameters to serialize.
 * @returns {string} `path?query`, or just `path` when the query is empty.
 */
function withQuery(path, params) {
    const query = params.toString();
    if (!query) {
        return path;
    }
    return `${path}?${query}`;
}
|
|
35
|
+
// --- result formatting (ported from research-index-front/src/agent_eval.ts) ---
|
|
36
|
+
// Max authors to print per paper (with affiliations); the rest collapse to a
|
|
37
|
+
// "+N more" tail so a large collaboration doesn't flood the context.
|
|
38
|
+
const MAX_AUTHORS = 15;
|
|
39
|
+
// Cap each abstract so a page of hits stays within the MCP output-token limit.
|
|
40
|
+
const MAX_ABSTRACT_CHARS = 600;
|
|
41
|
+
// Per-affiliation char cap — keeps one long org string (e.g. a full multi-dept
|
|
42
|
+
// university address) from bloating the authors line.
|
|
43
|
+
const MAX_AFFIL_CHARS = 60;
|
|
44
|
+
// Hard ceiling on the whole authors line, as a final guard.
|
|
45
|
+
const MAX_AUTHORS_LINE_CHARS = 400;
|
|
46
|
+
/**
 * Pick the id to show for a paper.
 *
 * @param {{ ids?: { arxiv?: string[] }, paper_id?: string }} p - Paper record.
 * @returns {string} The first arXiv id when present, otherwise `paper_id`,
 *   otherwise the placeholder `?`.
 */
function displayId(p) {
    const arxivId = p.ids?.arxiv?.[0];
    if (arxivId != null) {
        return arxivId;
    }
    const canonicalId = p.paper_id;
    return canonicalId ?? '?';
}
|
|
50
|
+
/**
 * Format the `Authors: ...` line for a paper.
 *
 * Accepts either a comma-separated string of names or an array whose entries
 * are plain name strings or `{ name, affiliation? }` objects. Affiliations are
 * capped at `MAX_AFFIL_CHARS` characters each. At most `MAX_AUTHORS` authors
 * are listed; anything not listed is summarized as a `; +N more` tail.
 *
 * When the line would exceed `MAX_AUTHORS_LINE_CHARS`, whole authors are
 * dropped from the end (and counted in the tail) rather than cutting the text
 * mid-name, so the reader can always tell the list was shortened.
 *
 * @param {string | Array<string | { name?: string, affiliation?: string }> | null | undefined} authors
 * @returns {string | null} The formatted line, or `null` when there is no
 *   usable author information.
 */
function fmtAuthors(authors) {
    if (!authors)
        return null;
    let entries;
    if (typeof authors === 'string') {
        entries = authors
            .split(',')
            .map((s) => s.trim())
            .filter(Boolean);
    }
    else if (Array.isArray(authors)) {
        entries = authors
            .map((a) => {
            if (typeof a === 'string')
                return a.trim();
            const name = typeof a?.name === 'string' ? a.name.trim() : '';
            if (!name)
                return '';
            const aff = typeof a.affiliation === 'string' ? a.affiliation.trim() : '';
            return aff ? `${name} (${aff.slice(0, MAX_AFFIL_CHARS)})` : name;
        })
            .filter(Boolean);
    }
    else {
        // Unknown shape — nothing sensible to print.
        return null;
    }
    const total = entries.length;
    if (total === 0)
        return null;
    // Render `list` plus a tail that accounts for every author not in `list`.
    const render = (list) => {
        const hidden = total - list.length;
        const tail = hidden > 0 ? `; +${hidden} more` : '';
        return 'Authors: ' + list.join('; ') + tail;
    };
    let shown = entries.slice(0, MAX_AUTHORS);
    let line = render(shown);
    // Shed trailing authors until the line fits, always keeping at least one.
    while (line.length > MAX_AUTHORS_LINE_CHARS && shown.length > 1) {
        shown = shown.slice(0, -1);
        line = render(shown);
    }
    // Final guard for a single entry that is itself longer than the ceiling.
    return line.slice(0, MAX_AUTHORS_LINE_CHARS);
}
|
|
78
|
+
/**
 * Render ranked papers as text blocks separated by blank lines.
 *
 * Each block is: a `[id] title` header, an optional authors line (from
 * `fmtAuthors`), and the abstract. Title and abstract are collapsed to a
 * single line of whitespace-normalized text; a missing or blank title becomes
 * `(untitled)` and a missing or blank abstract becomes `(no abstract)`. The
 * abstract is capped at `MAX_ABSTRACT_CHARS` characters.
 *
 * @param {Array<object> | null | undefined} results - Paper records.
 * @returns {string} The rendered blocks, or `(no results)` when `results` is
 *   not a non-empty array.
 */
function fmtHits(results) {
    if (!Array.isArray(results) || results.length === 0)
        return '(no results)';
    // Collapse runs of whitespace (including newlines) and trim; non-strings
    // yield '' so callers can fall back to a placeholder.
    const oneLine = (value) => typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : '';
    return results
        .map((r) => {
        const title = oneLine(r.title) || '(untitled)';
        const lines = [`[${displayId(r)}] ${title}`];
        const authors = fmtAuthors(r.authors);
        if (authors)
            lines.push(authors);
        const abstract = oneLine(r.abstract) || '(no abstract)';
        lines.push(abstract.slice(0, MAX_ABSTRACT_CHARS));
        return lines.join('\n');
    })
        .join('\n\n');
}
|
|
95
|
+
// Cap GitHub matched content so a page of results stays within the MCP
// output-token limit. Higher than abstracts since issue/PR threads carry the
// signal (repro steps, stack traces) the agent actually needs to verify.
const MAX_GITHUB_CONTENT_CHARS = 1200;
/**
 * Render GitHub history/readme hits as text blocks separated by blank lines.
 *
 * Each block is:
 *   - a header: `[repo] README` for `repo_readme` results, otherwise
 *     `[repo#number] (pageType, N segments)` with the absent parts omitted;
 *   - the url (`readmeUrl` when non-blank, else `url`), if any;
 *   - the body: `contentMd` when non-blank, else `snippet`, else
 *     `(no content)`, capped at `MAX_GITHUB_CONTENT_CHARS` characters.
 *
 * The body keeps its internal newlines so markdown tables/code survive.
 *
 * @param {Array<object> | null | undefined} results - GitHub result records.
 * @returns {string} The rendered blocks, or `(no results)` when `results` is
 *   not a non-empty array.
 */
function fmtGithub(results) {
    if (!Array.isArray(results) || results.length === 0)
        return '(no results)';
    // First candidate that is a string with non-whitespace content, trimmed.
    const firstNonBlank = (...candidates) => {
        for (const candidate of candidates) {
            if (typeof candidate !== 'string')
                continue;
            const trimmed = candidate.trim();
            if (trimmed)
                return trimmed;
        }
        return '';
    };
    return results
        .map((r) => {
        const repo = r.repo ?? '?';
        const lines = [];
        if (r.resultType === 'repo_readme') {
            lines.push(`[${repo}] README`);
        }
        else {
            const ref = r.number != null ? `#${r.number}` : '';
            const meta = [
                r.pageType,
                r.segmentCount ? `${r.segmentCount} segments` : '',
            ]
                .filter(Boolean)
                .join(', ');
            lines.push(`[${repo}${ref}]${meta ? ` (${meta})` : ''}`);
        }
        // Blank values must not mask a usable fallback.
        const url = firstNonBlank(r.readmeUrl, r.url);
        if (url)
            lines.push(url);
        const body = firstNonBlank(r.contentMd, r.snippet);
        lines.push(body ? body.slice(0, MAX_GITHUB_CONTENT_CHARS) : '(no content)');
        return lines.join('\n');
    })
        .join('\n\n');
}
|
|
133
|
+
/**
 * Visibility predicate shared by every research tool.
 *
 * @param {{ research?: unknown } | null | undefined} session - Session data.
 * @returns {boolean} `true` only when `session.research` is exactly `true`.
 */
const canAccess = (session) => {
    return session != null && session.research === true;
};
|
|
135
|
+
/**
 * Register the four research tools on `server`:
 * `firecrawl_research_search_papers`, `firecrawl_research_related_papers`,
 * `firecrawl_research_read_paper` and `firecrawl_research_search_github`.
 *
 * Every tool carries the shared `canAccess` predicate and is annotated as
 * read-only / open-world. Each one builds a query string, issues a GET under
 * `BASE` through the session's client, and returns formatted text.
 *
 * @param {object} server - Server exposing `addTool(...)`.
 * @param {(session: unknown) => { http: { get(path: string): Promise<{ data?: any }> } }} getClient
 *   Resolves the client to use for a session.
 * @returns {void}
 */
export function registerResearchTools(server, getClient) {
    // GET `path?params` with the session's client and hand back the response
    // body (may be undefined, so callers read it with optional chaining).
    const fetchResearch = async (session, path, params) => {
        const client = getClient(session);
        const res = await client.http.get(withQuery(path, params));
        return res.data;
    };
    // --- search_papers ---
    server.addTool({
        name: 'firecrawl_research_search_papers',
        canAccess,
        annotations: {
            title: 'Search arXiv papers',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Primary entry point for finding arXiv papers by topic. Semantic (HyDE) search over arXiv ' +
            'abstracts; returns ranked papers with arXiv id, title, and abstract. The query should be a ' +
            'natural-language description of what you want. Run SEVERAL distinct framings of the question ' +
            '(sibling domains, rival methods, dataset/benchmark names) rather than one query — recall ' +
            'improves markedly with diverse framings. Returns up to `k` results (default 40).',
        parameters: z.object({
            query: z.string().min(1),
            k: z.number().int().min(1).max(500).optional(),
            authors: z
                .array(z.string())
                .optional()
                .describe('Author substring filter(s); ALL must match (case-insensitive).'),
            categories: z
                .array(z.string())
                .optional()
                .describe('arXiv category filter(s) (e.g. `cs.LG`); ALL must match.'),
            from: z
                .string()
                .optional()
                .describe('Inclusive lower bound on created/updated date (`YYYY-MM-DD`).'),
            to: z
                .string()
                .optional()
                .describe('Inclusive upper bound on created/updated date (`YYYY-MM-DD`).'),
        }),
        execute: async (args, { session }) => {
            const { query, k, authors, categories, from, to } = args;
            const params = new URLSearchParams();
            appendParam(params, 'query', query);
            appendParam(params, 'k', k);
            appendParam(params, 'authors', authors);
            appendParam(params, 'categories', categories);
            appendParam(params, 'from', from);
            appendParam(params, 'to', to);
            const data = await fetchResearch(session, `${BASE}/papers`, params);
            return fmtHits(data?.results);
        },
    });
    // --- related_papers ---
    server.addTool({
        name: 'firecrawl_research_related_papers',
        canAccess,
        annotations: {
            title: 'Find related arXiv papers',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Expand from anchor papers you have already found, via the citation graph, ranked and filtered ' +
            'to a natural-language `intent`. Pass arXiv ids of your strongest hits as `seed_ids`. Modes: ' +
            '`similar` (cocitation/coupling — papers in the same niche; the default), `citers` (papers ' +
            'that cite the anchors), `references` (papers the anchors cite). This reaches relevant papers ' +
            'that plain search misses, so use it on your best hits before finishing. A `similar` call ' +
            'already runs a DEEP multi-round expansion internally (re-seeding from each round’s best ' +
            'finds), so one call reaches the wider neighborhood — no need to chain many. Returns the ' +
            'candidates plus the pool size.',
        parameters: z.object({
            // Each id must be non-empty: the first one becomes a URL path
            // segment, and an empty segment would address the wrong route.
            seed_ids: z.array(z.string().min(1)).min(1).max(10),
            intent: z.string().min(1),
            mode: z.enum(['similar', 'citers', 'references']).optional(),
            k: z.number().int().min(1).max(500).optional(),
            rerank: z
                .boolean()
                .optional()
                .describe('Apply an additional rerank over the fused candidates.'),
        }),
        execute: async (args, { session }) => {
            const { seed_ids, intent, mode, k, rerank } = args;
            // The endpoint takes a single primary seed in the path; any additional
            // seeds ride along as repeated `anchor` params.
            const [primary, ...anchors] = seed_ids;
            const params = new URLSearchParams();
            appendParam(params, 'intent', intent);
            appendParam(params, 'mode', mode);
            appendParam(params, 'k', k);
            // `appendParam` already skips nullish values; `false` is still sent.
            appendParam(params, 'rerank', rerank);
            appendParam(params, 'anchor', anchors);
            const data = await fetchResearch(session, `${BASE}/papers/${encodeURIComponent(primary)}/similar`, params);
            const note = data?.note ? `\nnote: ${data.note}` : '';
            return `${fmtHits(data?.results)}\n(pool_size=${data?.pool_size ?? 0})${note}`;
        },
    });
    // --- read_paper ---
    server.addTool({
        name: 'firecrawl_research_read_paper',
        canAccess,
        annotations: {
            title: 'Read an arXiv paper',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Read the most relevant in-body (full-text) passages of ONE specific paper for a question. Use ' +
            'this to VERIFY whether a candidate actually satisfies a constraint before you include or ' +
            "reject it (e.g. 'does this paper actually use technique X / report a score on benchmark Y'). " +
            "Returns the best-matching passages, or a notice if the paper's full text is unavailable.",
        parameters: z.object({
            arxiv_id: z.string().min(1),
            question: z.string().min(1),
            k: z
                .number()
                .int()
                .min(1)
                .max(50)
                .optional()
                .describe('Number of passages to return (default 4).'),
        }),
        execute: async (args, { session }) => {
            const { arxiv_id, question, k } = args;
            const params = new URLSearchParams();
            appendParam(params, 'query', question);
            appendParam(params, 'k', k);
            const data = await fetchResearch(session, `${BASE}/papers/${encodeURIComponent(arxiv_id)}`, params);
            // Keep only passages that actually carry text, so a malformed
            // entry never surfaces as the literal string "undefined".
            const texts = (data?.passages ?? [])
                .map((p) => p?.text)
                .filter((text) => typeof text === 'string' && text.trim().length > 0);
            return texts.length
                ? texts.join('\n---\n')
                : '(no full-text passages available for this paper)';
        },
    });
    // --- search_github ---
    server.addTool({
        name: 'firecrawl_research_search_github',
        canAccess,
        annotations: {
            title: 'Search GitHub history',
            readOnlyHint: true,
            openWorldHint: true,
        },
        description: 'Search GitHub issue/PR history and repository readmes. Returns ranked matches with repo, ' +
            'url, a short snippet, and (when available) the full matched content in markdown.',
        parameters: z.object({
            query: z.string().min(1),
            k: z.number().int().min(1).max(100).optional(),
        }),
        execute: async (args, { session }) => {
            const { query, k } = args;
            const params = new URLSearchParams();
            appendParam(params, 'query', query);
            appendParam(params, 'k', k);
            const data = await fetchResearch(session, `${BASE}/github`, params);
            return fmtGithub(data?.results);
        },
    });
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "firecrawl-mcp",
|
|
3
|
-
"version": "3.20.
|
|
3
|
+
"version": "3.20.4",
|
|
4
4
|
"description": "MCP server for Firecrawl — search, scrape, and interact with the web. Supports both cloud and self-hosted instances. Features include web search, scraping, page interaction, batch processing, and LLM-powered content analysis.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"mcpName": "io.github.firecrawl/firecrawl-mcp-server",
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
},
|
|
16
16
|
"license": "MIT",
|
|
17
17
|
"dependencies": {
|
|
18
|
-
"@mendable/firecrawl-js": "4.
|
|
18
|
+
"@mendable/firecrawl-js": "4.25.2",
|
|
19
19
|
"dotenv": "^17.2.2",
|
|
20
20
|
"firecrawl-fastmcp": "^1.0.5",
|
|
21
21
|
"typescript": "^5.9.2",
|