firecrawl-mcp 1.12.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.js +371 -316
- package/package.json +10 -9
package/README.md
CHANGED

@@ -1,6 +1,6 @@
 # Firecrawl MCP Server

-A Model Context Protocol (MCP) server implementation that integrates with [Firecrawl](https://github.com/
+A Model Context Protocol (MCP) server implementation that integrates with [Firecrawl](https://github.com/firecrawl/firecrawl) for web scraping capabilities.

 > Big thanks to [@vrknetha](https://github.com/vrknetha), [@knacklabs](https://www.knacklabs.ai) for the initial implementation!

package/dist/index.js
CHANGED

@@ -4,8 +4,10 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
 import { SSEServerTransport } from '@modelcontextprotocol/sdk/server/sse.js';
 import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
 import FirecrawlApp from '@mendable/firecrawl-js';
+import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
 import express from 'express';
 import dotenv from 'dotenv';
+import { randomUUID } from 'node:crypto';
 dotenv.config();
 // Tool definitions
 const SCRAPE_TOOL = {

@@ -25,7 +27,7 @@ This is the most powerful, fastest and most reliable scraper tool, if available
   "arguments": {
     "url": "https://example.com",
     "formats": ["markdown"],
-    "maxAge":
+    "maxAge": 172800000
   }
 }
 \`\`\`

@@ -42,15 +44,39 @@ This is the most powerful, fastest and most reliable scraper tool, if available
       formats: {
         type: 'array',
         items: {
-
-
-
-
-
-
-
-
-
+          oneOf: [
+            {
+              type: 'string',
+              enum: [
+                'markdown',
+                'html',
+                'rawHtml',
+                'screenshot',
+                'links',
+                'extract',
+                'summary',
+              ],
+            },
+            {
+              type: 'object',
+              properties: {
+                type: {
+                  type: 'string',
+                  enum: ['json'],
+                },
+                prompt: {
+                  type: 'string',
+                  description: 'Prompt to guide JSON extraction',
+                },
+                schema: {
+                  type: 'object',
+                  description: 'JSON schema for structured extraction',
+                },
+              },
+              required: ['type'],
+              additionalProperties: true,
+              description: 'Advanced format option. Use { type: "json", prompt, schema } to request structured JSON extraction.',
+            },
          ],
        },
        default: ['markdown'],

@@ -58,6 +84,7 @@ This is the most powerful, fastest and most reliable scraper tool, if available
      },
      onlyMainContent: {
        type: 'boolean',
+        default: true,
        description: 'Extract only the main content, filtering out navigation, footers, etc.',
      },
      includeTags: {

@@ -74,10 +101,6 @@ This is the most powerful, fastest and most reliable scraper tool, if available
        type: 'number',
        description: 'Time in milliseconds to wait for dynamic content to load',
      },
-      timeout: {
-        type: 'number',
-        description: 'Maximum time in milliseconds to wait for the page to load',
-      },
      actions: {
        type: 'array',
        items: {

@@ -131,24 +154,6 @@ This is the most powerful, fastest and most reliable scraper tool, if available
        },
        description: 'List of actions to perform before scraping',
      },
-      extract: {
-        type: 'object',
-        properties: {
-          schema: {
-            type: 'object',
-            description: 'Schema for structured data extraction',
-          },
-          systemPrompt: {
-            type: 'string',
-            description: 'System prompt for LLM extraction',
-          },
-          prompt: {
-            type: 'string',
-            description: 'User prompt for LLM extraction',
-          },
-        },
-        description: 'Configuration for structured data extraction',
-      },
      mobile: {
        type: 'boolean',
        description: 'Use mobile viewport',

@@ -176,9 +181,15 @@ This is the most powerful, fastest and most reliable scraper tool, if available
        },
        description: 'Location settings for scraping',
      },
+      storeInCache: {
+        type: 'boolean',
+        default: true,
+        description: 'If true, the page will be stored in the Firecrawl index and cache. Setting this to false is useful if your scraping activity may have data protection concerns.',
+      },
      maxAge: {
        type: 'number',
-
+        default: 172800000,
+        description: 'Maximum age in milliseconds for cached content. Use cached data if available and younger than maxAge, otherwise scrape fresh. Enables 500% faster scrapes for recently cached pages. Default: 172800000',
      },
    },
    required: ['url'],

@@ -215,13 +226,10 @@ Map a website to discover all indexed URLs on the site.
        type: 'string',
        description: 'Optional search term to filter URLs',
      },
-
-        type: '
-
-
-      sitemapOnly: {
-        type: 'boolean',
-        description: 'Only use sitemap.xml for discovery, ignore HTML links',
+      sitemap: {
+        type: 'string',
+        enum: ['include', 'skip', 'only'],
+        description: 'Sitemap handling: "include" - use sitemap + find other pages (default), "skip" - ignore sitemap completely, "only" - only return sitemap URLs',
      },
      includeSubdomains: {
        type: 'boolean',

@@ -231,6 +239,11 @@ Map a website to discover all indexed URLs on the site.
        type: 'number',
        description: 'Maximum number of URLs to return',
      },
+      ignoreQueryParameters: {
+        type: 'boolean',
+        default: true,
+        description: 'Do not return URLs with query parameters',
+      },
    },
    required: ['url'],
  },

@@ -238,28 +251,29 @@ Map a website to discover all indexed URLs on the site.
 const CRAWL_TOOL = {
   name: 'firecrawl_crawl',
   description: `
-Starts
-
-**Best for:** Extracting content from multiple related pages, when you need comprehensive coverage.
-**Not recommended for:** Extracting content from a single page (use scrape); when token limits are a concern (use map + batch_scrape); when you need fast results (crawling can be slow).
-**Warning:** Crawl responses can be very large and may exceed token limits. Limit the crawl depth and number of pages, or use map + batch_scrape for better control.
-**Common mistakes:** Setting limit or
-**Prompt Example:** "Get all blog posts from the first two levels of example.com/blog."
-**Usage Example:**
-\`\`\`json
-{
-
-
-
-
-
-
-
-
-}
-
-
-
+Starts a crawl job on a website and extracts content from all pages.
+
+**Best for:** Extracting content from multiple related pages, when you need comprehensive coverage.
+**Not recommended for:** Extracting content from a single page (use scrape); when token limits are a concern (use map + batch_scrape); when you need fast results (crawling can be slow).
+**Warning:** Crawl responses can be very large and may exceed token limits. Limit the crawl depth and number of pages, or use map + batch_scrape for better control.
+**Common mistakes:** Setting limit or maxDiscoveryDepth too high (causes token overflow); using crawl for a single page (use scrape instead).
+**Prompt Example:** "Get all blog posts from the first two levels of example.com/blog."
+**Usage Example:**
+\`\`\`json
+{
+  "name": "firecrawl_crawl",
+  "arguments": {
+    "url": "https://example.com/blog/*",
+    "maxDiscoveryDepth": 2,
+    "limit": 100,
+    "allowExternalLinks": false,
+    "deduplicateSimilarURLs": true,
+    "sitemap": "include"
+  }
+}
+\`\`\`
+**Returns:** Operation ID for status checking; use firecrawl_check_crawl_status to check progress.
+`,
  inputSchema: {
    type: 'object',
    properties: {

@@ -267,6 +281,10 @@ Starts an asynchronous crawl job on a website and extracts content from all page
        type: 'string',
        description: 'Starting URL for the crawl',
      },
+      prompt: {
+        type: 'string',
+        description: 'Natural language prompt to generate crawler options. Explicitly set parameters will override generated ones.',
+      },
      excludePaths: {
        type: 'array',
        items: { type: 'string' },

@@ -277,26 +295,43 @@ Starts an asynchronous crawl job on a website and extracts content from all page
        items: { type: 'string' },
        description: 'Only crawl these URL paths',
      },
-
+      maxDiscoveryDepth: {
        type: 'number',
-        description: 'Maximum
+        description: 'Maximum discovery depth to crawl. The root site and sitemapped pages have depth 0.',
      },
-
-        type: '
-
+      sitemap: {
+        type: 'string',
+        enum: ['skip', 'include', 'only'],
+        default: 'include',
+        description: "Sitemap mode when crawling. 'skip' ignores the sitemap entirely, 'include' uses sitemap plus other discovery methods (default), 'only' restricts crawling to sitemap URLs.",
      },
      limit: {
        type: 'number',
-
-
-      allowBackwardLinks: {
-        type: 'boolean',
-        description: 'Allow crawling links that point to parent directories',
+        default: 10000,
+        description: 'Maximum number of pages to crawl (default: 10000)',
      },
      allowExternalLinks: {
        type: 'boolean',
        description: 'Allow crawling links to external domains',
      },
+      allowSubdomains: {
+        type: 'boolean',
+        default: false,
+        description: 'Allow crawling links to subdomains of the main domain',
+      },
+      crawlEntireDomain: {
+        type: 'boolean',
+        default: false,
+        description: 'When true, follow internal links to sibling or parent URLs, not just child paths',
+      },
+      delay: {
+        type: 'number',
+        description: 'Delay in seconds between scrapes to respect site rate limits',
+      },
+      maxConcurrency: {
+        type: 'number',
+        description: 'Maximum number of concurrent scrapes; if unset, team limit is used',
+      },
      webhook: {
        oneOf: [
          {

@@ -325,7 +360,8 @@ Starts an asynchronous crawl job on a website and extracts content from all page
      },
      ignoreQueryParameters: {
        type: 'boolean',
-
+        default: false,
+        description: 'Do not re-scrape the same path with different (or none) query parameters',
      },
      scrapeOptions: {
        type: 'object',

@@ -333,17 +369,43 @@ Starts an asynchronous crawl job on a website and extracts content from all page
          formats: {
            type: 'array',
            items: {
-
-
-
-
-
-
-
-
-
+              oneOf: [
+                {
+                  type: 'string',
+                  enum: [
+                    'markdown',
+                    'html',
+                    'rawHtml',
+                    'screenshot',
+                    'links',
+                    'extract',
+                    'summary',
+                  ],
+                },
+                {
+                  type: 'object',
+                  properties: {
+                    type: {
+                      type: 'string',
+                      enum: ['json'],
+                    },
+                    prompt: {
+                      type: 'string',
+                      description: 'Prompt to guide JSON extraction',
+                    },
+                    schema: {
+                      type: 'object',
+                      description: 'JSON schema for structured extraction',
+                    },
+                  },
+                  required: ['type'],
+                  additionalProperties: true,
+                  description: 'Advanced format option. Use { type: "json", prompt, schema } to request structured JSON extraction.',
+                },
              ],
            },
+            default: ['markdown'],
+            description: "Content formats to extract (default: ['markdown'])",
          },
          onlyMainContent: {
            type: 'boolean',

@@ -396,12 +458,13 @@ Check the status of a crawl job.
 const SEARCH_TOOL = {
   name: 'firecrawl_search',
   description: `
-Search the web and optionally extract content from search results. This is the most powerful search tool available, and if available you should always default to using this tool for any web search needs.
+Search the web and optionally extract content from search results. This is the most powerful web search tool available, and if available you should always default to using this tool for any web search needs.

 **Best for:** Finding specific information across multiple websites, when you don't know which website has the information; when you need the most relevant content for a query.
-**Not recommended for:** When you already know which website to scrape (use scrape); when you need comprehensive coverage of a single website (use map or crawl
+**Not recommended for:** When you need to search the filesystem. When you already know which website to scrape (use scrape); when you need comprehensive coverage of a single website (use map or crawl.
 **Common mistakes:** Using crawl or map for open-ended questions (use search instead).
 **Prompt Example:** "Find the latest research papers on AI published in 2023."
+**Sources:** web, images, news, default to web unless needed images or news.
 **Usage Example:**
 \`\`\`json
 {

@@ -411,6 +474,11 @@ Search the web and optionally extract content from search results. This is the m
    "limit": 5,
    "lang": "en",
    "country": "us",
+    "sources": [
+      "web",
+      "images",
+      "news"
+    ],
    "scrapeOptions": {
      "formats": ["markdown"],
      "onlyMainContent": true

@@ -431,14 +499,6 @@ Search the web and optionally extract content from search results. This is the m
        type: 'number',
        description: 'Maximum number of results to return (default: 5)',
      },
-      lang: {
-        type: 'string',
-        description: 'Language code for search results (default: en)',
-      },
-      country: {
-        type: 'string',
-        description: 'Country code for search results (default: us)',
-      },
      tbs: {
        type: 'string',
        description: 'Time-based search filter',

@@ -448,19 +508,48 @@ Search the web and optionally extract content from search results. This is the m
        description: 'Search filter',
      },
      location: {
-        type: '
-
-
-
-
-
-
-
-
-
-
+        type: 'string',
+        description: 'Location parameter for search results',
+      },
+      sources: {
+        type: 'array',
+        description: 'Sources to search. Determines which result arrays are included in the response.',
+        items: {
+          oneOf: [
+            {
+              type: 'object',
+              properties: {
+                type: { type: 'string', enum: ['web'] },
+                tbs: {
+                  type: 'string',
+                  description: 'Time-based search parameter (e.g., qdr:h, qdr:d, qdr:w, qdr:m, qdr:y or custom cdr with cd_min/cd_max)',
+                },
+                location: {
+                  type: 'string',
+                  description: 'Location parameter for search results',
+                },
+              },
+              required: ['type'],
+              additionalProperties: false,
+            },
+            {
+              type: 'object',
+              properties: {
+                type: { type: 'string', enum: ['images'] },
+              },
+              required: ['type'],
+              additionalProperties: false,
+            },
+            {
+              type: 'object',
+              properties: {
+                type: { type: 'string', enum: ['news'] },
+              },
+              required: ['type'],
+              additionalProperties: false,
+            },
+          ],
        },
-        description: 'Location settings for search',
      },
      scrapeOptions: {
        type: 'object',

@@ -468,8 +557,22 @@ Search the web and optionally extract content from search results. This is the m
        formats: {
          type: 'array',
          items: {
-
-
+            oneOf: [
+              {
+                type: 'string',
+                enum: ['markdown', 'html', 'rawHtml'],
+              },
+              {
+                type: 'object',
+                properties: {
+                  type: { type: 'string', enum: ['json'] },
+                  prompt: { type: 'string' },
+                  schema: { type: 'object' },
+                },
+                required: ['type'],
+                additionalProperties: true,
+              },
+            ],
          },
          description: 'Content formats to extract from search results',
        },

@@ -493,12 +596,11 @@ const EXTRACT_TOOL = {
   description: `
 Extract structured information from web pages using LLM capabilities. Supports both cloud AI and self-hosted LLM extraction.

-**Best for:** Extracting specific structured data like prices, names, details.
+**Best for:** Extracting specific structured data like prices, names, details from web pages.
 **Not recommended for:** When you need the full content of a page (use scrape); when you're not looking for specific structured data.
 **Arguments:**
 - urls: Array of URLs to extract information from
 - prompt: Custom prompt for the LLM extraction
-- systemPrompt: System prompt to guide the LLM
 - schema: JSON schema for structured data extraction
 - allowExternalLinks: Allow extraction from external links
 - enableWebSearch: Enable web search for additional context

@@ -511,7 +613,6 @@ Extract structured information from web pages using LLM capabilities. Supports b
   "arguments": {
     "urls": ["https://example.com/page1", "https://example.com/page2"],
     "prompt": "Extract product information including name, price, and description",
-    "systemPrompt": "You are a helpful assistant that extracts product information",
     "schema": {
       "type": "object",
       "properties": {

@@ -541,10 +642,6 @@ Extract structured information from web pages using LLM capabilities. Supports b
        type: 'string',
        description: 'Prompt for the LLM extraction',
      },
-      systemPrompt: {
-        type: 'string',
-        description: 'System prompt for LLM extraction',
-      },
      schema: {
        type: 'object',
        description: 'JSON schema for structured data extraction',

@@ -565,100 +662,6 @@ Extract structured information from web pages using LLM capabilities. Supports b
    required: ['urls'],
  },
 };
-const DEEP_RESEARCH_TOOL = {
-  name: 'firecrawl_deep_research',
-  description: `
-Conduct deep web research on a query using intelligent crawling, search, and LLM analysis.
-
-**Best for:** Complex research questions requiring multiple sources, in-depth analysis.
-**Not recommended for:** Simple questions that can be answered with a single search; when you need very specific information from a known page (use scrape); when you need results quickly (deep research can take time).
-**Arguments:**
-- query (string, required): The research question or topic to explore.
-- maxDepth (number, optional): Maximum recursive depth for crawling/search (default: 3).
-- timeLimit (number, optional): Time limit in seconds for the research session (default: 120).
-- maxUrls (number, optional): Maximum number of URLs to analyze (default: 50).
-**Prompt Example:** "Research the environmental impact of electric vehicles versus gasoline vehicles."
-**Usage Example:**
-\`\`\`json
-{
-  "name": "firecrawl_deep_research",
-  "arguments": {
-    "query": "What are the environmental impacts of electric vehicles compared to gasoline vehicles?",
-    "maxDepth": 3,
-    "timeLimit": 120,
-    "maxUrls": 50
-  }
-}
-\`\`\`
-**Returns:** Final analysis generated by an LLM based on research. (data.finalAnalysis); may also include structured activities and sources used in the research process.
-`,
-  inputSchema: {
-    type: 'object',
-    properties: {
-      query: {
-        type: 'string',
-        description: 'The query to research',
-      },
-      maxDepth: {
-        type: 'number',
-        description: 'Maximum depth of research iterations (1-10)',
-      },
-      timeLimit: {
-        type: 'number',
-        description: 'Time limit in seconds (30-300)',
-      },
-      maxUrls: {
-        type: 'number',
-        description: 'Maximum number of URLs to analyze (1-1000)',
-      },
-    },
-    required: ['query'],
-  },
-};
-const GENERATE_LLMSTXT_TOOL = {
-  name: 'firecrawl_generate_llmstxt',
-  description: `
-Generate a standardized llms.txt (and optionally llms-full.txt) file for a given domain. This file defines how large language models should interact with the site.
-
-**Best for:** Creating machine-readable permission guidelines for AI models.
-**Not recommended for:** General content extraction or research.
-**Arguments:**
-- url (string, required): The base URL of the website to analyze.
-- maxUrls (number, optional): Max number of URLs to include (default: 10).
-- showFullText (boolean, optional): Whether to include llms-full.txt contents in the response.
-**Prompt Example:** "Generate an LLMs.txt file for example.com."
-**Usage Example:**
-\`\`\`json
-{
-  "name": "firecrawl_generate_llmstxt",
-  "arguments": {
-    "url": "https://example.com",
-    "maxUrls": 20,
-    "showFullText": true
-  }
-}
-\`\`\`
-**Returns:** LLMs.txt file contents (and optionally llms-full.txt).
-`,
-  inputSchema: {
-    type: 'object',
-    properties: {
-      url: {
-        type: 'string',
-        description: 'The URL to generate LLMs.txt from',
-      },
-      maxUrls: {
-        type: 'number',
-        description: 'Maximum number of URLs to process (1-100, default: 10)',
-      },
-      showFullText: {
-        type: 'boolean',
-        description: 'Whether to show the full LLMs-full.txt in the response',
-      },
-    },
-    required: ['url'],
-  },
-};
 // Type guards
 function isScrapeOptions(args) {
   return (typeof args === 'object' &&
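
Taken together, the schema hunks above change what a caller passes for formats in 2.0.0: an entry may be a plain string such as 'markdown', or an object such as { type: 'json', prompt, schema } for structured extraction, while timeout and the old extract block are dropped from firecrawl_scrape. Below is a minimal, illustrative sketch of how both entry shapes can be matched, mirroring the hasFormat helper that appears later in this diff; the sample values are placeholders, not taken from the package.

// Illustrative only: a formats array mixing the two shapes allowed by the 2.0.0 schema.
const formats = [
  'markdown',
  { type: 'json', prompt: 'List product names', schema: { type: 'object' } },
];

// Same matching rule as the hasFormat helper added in dist/index.js:
// a string entry matches by value, an object entry matches by its type field.
const hasFormat = (name) =>
  Array.isArray(formats) &&
  formats.some((f) =>
    typeof f === 'string' ? f === name : f && typeof f === 'object' && f.type === name
  );

console.log(hasFormat('markdown')); // true  (string entry)
console.log(hasFormat('json'));     // true  (object entry)
console.log(hasFormat('html'));     // false
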
@@ -672,6 +675,7 @@ function isMapOptions(args) {
    'url' in args &&
    typeof args.url === 'string');
 }
+//@ts-expect-error todo: fix
 function isCrawlOptions(args) {
   return (typeof args === 'object' &&
    args !== null &&

@@ -703,6 +707,24 @@ function isGenerateLLMsTextOptions(args) {
    'url' in args &&
    typeof args.url === 'string');
 }
+function removeEmptyTopLevel(obj) {
+  const out = {};
+  for (const [k, v] of Object.entries(obj)) {
+    if (v == null)
+      continue;
+    if (typeof v === 'string' && v.trim() === '')
+      continue;
+    if (Array.isArray(v) && v.length === 0)
+      continue;
+    if (typeof v === 'object' &&
+      !Array.isArray(v) &&
+      Object.keys(v).length === 0)
+      continue;
+    // @ts-expect-error dynamic assignment
+    out[k] = v;
+  }
+  return out;
+}
 // Server implementation
 const server = new Server({
   name: 'firecrawl-mcp',

@@ -710,7 +732,6 @@ const server = new Server({
 }, {
   capabilities: {
     tools: {},
-    logging: {},
   },
 });
 // Get optional API URL

@@ -743,14 +764,9 @@ function delay(ms) {
 }
 let isStdioTransport = false;
 function safeLog(level, data) {
-
-
-
-  }
-  else {
-    // For other transport types, use the normal logging mechanism
-    server.sendLoggingMessage({ level, data });
-  }
+  // Always log to stderr to avoid relying on MCP logging capability
+  const message = `[${level}] ${typeof data === 'object' ? JSON.stringify(data) : String(data)}`;
+  console.error(message);
 }
 // Add retry logic with exponential backoff
 async function withRetry(operation, context, attempt = 1) {

@@ -779,18 +795,16 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
    CHECK_CRAWL_STATUS_TOOL,
    SEARCH_TOOL,
    EXTRACT_TOOL,
-    DEEP_RESEARCH_TOOL,
-    GENERATE_LLMSTXT_TOOL,
  ],
 }));
 server.setRequestHandler(CallToolRequestSchema, async (request) => {
   const startTime = Date.now();
   try {
     const { name, arguments: args } = request.params;
-    const apiKey = process.env.CLOUD_SERVICE
+    const apiKey = process.env.CLOUD_SERVICE === 'true'
      ? request.params._meta?.apiKey
      : FIRECRAWL_API_KEY;
-    if (process.env.CLOUD_SERVICE && !apiKey) {
+    if (process.env.CLOUD_SERVICE === 'true' && !apiKey) {
      throw new Error('No API key provided');
    }
    const client = new FirecrawlApp({

@@ -808,38 +822,46 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
        throw new Error('Invalid arguments for firecrawl_scrape');
      }
      const { url, ...options } = args;
+      const cleaned = removeEmptyTopLevel(options);
      try {
        const scrapeStartTime = Date.now();
        safeLog('info', `Starting scrape for URL: ${url} with options: ${JSON.stringify(options)}`);
-        const response = await client.
-        ...
-        // @ts-expect-error Extended API options including origin
+        const response = await client.scrape(url, {
+          ...cleaned,
          origin: 'mcp-server',
        });
        // Log performance metrics
        safeLog('info', `Scrape completed in ${Date.now() - scrapeStartTime}ms`);
-        if ('success' in response && !response.success) {
-          throw new Error(response.error || 'Scraping failed');
-        }
        // Format content based on requested formats
        const contentParts = [];
-
+        const formats = (options?.formats ?? []);
+        const hasFormat = (name) => Array.isArray(formats) &&
+          formats.some((f) => typeof f === 'string'
+            ? f === name
+            : f && typeof f === 'object' && f.type === name);
+        if (hasFormat('markdown') && response.markdown) {
          contentParts.push(response.markdown);
        }
-        if (
+        if (hasFormat('html') && response.html) {
          contentParts.push(response.html);
        }
-        if (
+        if (hasFormat('rawHtml') && response.rawHtml) {
          contentParts.push(response.rawHtml);
        }
-        if (
+        if (hasFormat('links') && response.links) {
          contentParts.push(response.links.join('\n'));
        }
-        if (
+        if (hasFormat('screenshot') && response.screenshot) {
          contentParts.push(response.screenshot);
        }
-        if (
-          contentParts.push(JSON.stringify(response.
+        if (hasFormat('json') && response.json) {
+          contentParts.push(JSON.stringify(response.json, null, 2));
+        }
+        if (hasFormat('changeTracking') && response.changeTracking) {
+          contentParts.push(JSON.stringify(response.changeTracking, null, 2));
+        }
+        if (hasFormat('summary') && response.summary) {
+          contentParts.push(JSON.stringify(response.summary, null, 2));
        }
        // If options.formats is empty, default to markdown
        if (!options.formats || options.formats.length === 0) {

@@ -872,20 +894,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
        throw new Error('Invalid arguments for firecrawl_map');
      }
      const { url, ...options } = args;
-      const response = await client.
+      const response = await client.map(url, {
        ...options,
        // @ts-expect-error Extended API options including origin
        origin: 'mcp-server',
      });
-      if ('error' in response) {
-        throw new Error(response.error);
-      }
      if (!response.links) {
        throw new Error('No links received from Firecrawl API');
      }
      return {
        content: [
-          { type: 'text', text: trimResponseText(response.links
+          { type: 'text', text: trimResponseText(JSON.stringify(response.links, null, 2)) },
        ],
        isError: false,
      };

@@ -895,17 +914,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
        throw new Error('Invalid arguments for firecrawl_crawl');
      }
      const { url, ...options } = args;
-      const response = await withRetry(async () =>
-
-
-
-
-      }
+      const response = await withRetry(async () => client.crawl(url, {
+        ...options,
+        // @ts-expect-error Extended API options including origin
+        origin: 'mcp-server',
+      }), 'crawl operation');
      return {
        content: [
          {
            type: 'text',
-            text: trimResponseText(
+            text: trimResponseText(JSON.stringify(response)),
          },
        ],
        isError: false,

@@ -915,10 +933,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (!isStatusCheckOptions(args)) {
        throw new Error('Invalid arguments for firecrawl_check_crawl_status');
      }
-      const response = await client.
-      if (!response.success) {
-        throw new Error(response.error);
-      }
+      const response = await client.getCrawlStatus(args.id);
      const status = `Crawl Status:
 Status: ${response.status}
 Progress: ${response.completed}/${response.total}

@@ -935,19 +950,18 @@ ${response.data.length > 0 ? '\nResults:\n' + formatResults(response.data) : ''}
        throw new Error('Invalid arguments for firecrawl_search');
      }
      try {
-        const response = await withRetry(async () => client.search(args.query, {
-
-
-
-
-        const results = response.data
-          .map((result) => `URL: ${result.url}
-Title: ${result.title || 'No title'}
-Description: ${result.description || 'No description'}
-${result.markdown ? `\nContent:\n${result.markdown}` : ''}`)
-          .join('\n\n');
+        const response = await withRetry(async () => client.search(args.query, {
+          ...args,
+          // @ts-expect-error Extended API options including origin
+          origin: 'mcp-server',
+        }), 'search operation');
        return {
-          content: [
+          content: [
+            {
+              type: 'text',
+              text: trimResponseText(JSON.stringify(response, null, 2)),
+            },
+          ],
          isError: false,
        };
      }

@@ -972,9 +986,9 @@ ${result.markdown ? `\nContent:\n${result.markdown}` : ''}`)
      if (FIRECRAWL_API_URL) {
        safeLog('info', 'Using self-hosted instance for extraction');
      }
-      const extractResponse = await withRetry(async () => client.extract(
+      const extractResponse = await withRetry(async () => client.extract({
+        urls: args.urls,
        prompt: args.prompt,
-        systemPrompt: args.systemPrompt,
        schema: args.schema,
        allowExternalLinks: args.allowExternalLinks,
        enableWebSearch: args.enableWebSearch,

@@ -1025,57 +1039,6 @@ ${result.markdown ? `\nContent:\n${result.markdown}` : ''}`)
        };
      }
    }
-    case 'firecrawl_deep_research': {
-      if (!args || typeof args !== 'object' || !('query' in args)) {
-        throw new Error('Invalid arguments for firecrawl_deep_research');
-      }
-      try {
-        const researchStartTime = Date.now();
-        safeLog('info', `Starting deep research for query: ${args.query}`);
-        const response = await client.deepResearch(args.query, {
-          maxDepth: args.maxDepth,
-          timeLimit: args.timeLimit,
-          maxUrls: args.maxUrls,
-          // @ts-expect-error Extended API options including origin
-          origin: 'mcp-server',
-        },
-        // Activity callback
-        (activity) => {
-          safeLog('info', `Research activity: ${activity.message} (Depth: ${activity.depth})`);
-        },
-        // Source callback
-        (source) => {
-          safeLog('info', `Research source found: ${source.url}${source.title ? ` - ${source.title}` : ''}`);
-        });
-        // Log performance metrics
-        safeLog('info', `Deep research completed in ${Date.now() - researchStartTime}ms`);
-        if (!response.success) {
-          throw new Error(response.error || 'Deep research failed');
-        }
-        // Format the results
-        const formattedResponse = {
-          finalAnalysis: response.data.finalAnalysis,
-          activities: response.data.activities,
-          sources: response.data.sources,
-        };
-        return {
-          content: [
-            {
-              type: 'text',
-              text: trimResponseText(formattedResponse.finalAnalysis),
-            },
-          ],
-          isError: false,
-        };
-      }
-      catch (error) {
-        const errorMessage = error instanceof Error ? error.message : String(error);
-        return {
-          content: [{ type: 'text', text: trimResponseText(errorMessage) }],
-          isError: true,
-        };
-      }
-    }
    case 'firecrawl_generate_llmstxt': {
      if (!isGenerateLLMsTextOptions(args)) {
        throw new Error('Invalid arguments for firecrawl_generate_llmstxt');

@@ -1152,8 +1115,7 @@ function formatResults(data) {
  return data
    .map((doc) => {
      const content = doc.markdown || doc.html || doc.rawHtml || 'No content';
-      return `
-Content: ${content.substring(0, 100)}${content.length > 100 ? '...' : ''}
+      return `Content: ${content.substring(0, 100)}${content.length > 100 ? '...' : ''}
 ${doc.metadata?.title ? `Title: ${doc.metadata.title}` : ''}`;
    })
    .join('\n\n');

@@ -1214,6 +1176,92 @@ async function runSSELocalServer() {
    console.error('Error starting server:', error);
  }
 }
+async function runHTTPStreamableServer() {
+  const app = express();
+  app.use(express.json());
+  const transports = {};
+  // A single endpoint handles all MCP requests.
+  app.all('/mcp', async (req, res) => {
+    try {
+      const sessionId = req.headers['mcp-session-id'];
+      let transport;
+      if (sessionId && transports[sessionId]) {
+        transport = transports[sessionId];
+      }
+      else if (!sessionId &&
+        req.method === 'POST' &&
+        req.body &&
+        typeof req.body === 'object' &&
+        req.body.method === 'initialize') {
+        transport = new StreamableHTTPServerTransport({
+          sessionIdGenerator: () => {
+            const id = randomUUID();
+            return id;
+          },
+          onsessioninitialized: (sid) => {
+            transports[sid] = transport;
+          },
+        });
+        transport.onclose = () => {
+          const sid = transport.sessionId;
+          if (sid && transports[sid]) {
+            delete transports[sid];
+          }
+        };
+        console.log('Creating server instance');
+        console.log('Connecting transport to server');
+        await server.connect(transport);
+        await transport.handleRequest(req, res, req.body);
+        return;
+      }
+      else {
+        res.status(400).json({
+          jsonrpc: '2.0',
+          error: {
+            code: -32000,
+            message: 'Invalid or missing session ID',
+          },
+          id: null,
+        });
+        return;
+      }
+      await transport.handleRequest(req, res, req.body);
+    }
+    catch (error) {
+      if (!res.headersSent) {
+        res.status(500).json({
+          jsonrpc: '2.0',
+          error: {
+            code: -32603,
+            message: 'Internal server error',
+          },
+          id: null,
+        });
+      }
+    }
+  });
+  const PORT = 3000;
+  const appServer = app.listen(PORT, () => {
+    console.log(`MCP Streamable HTTP Server listening on port ${PORT}`);
+  });
+  process.on('SIGINT', async () => {
+    console.log('Shutting down server...');
+    for (const sessionId in transports) {
+      try {
+        console.log(`Closing transport for session ${sessionId}`);
+        await transports[sessionId].close();
+        delete transports[sessionId];
+      }
+      catch (error) {
+        console.error(`Error closing transport for session ${sessionId}:`, error);
+      }
+    }
+    appServer.close(() => {
+      console.log('Server shutdown complete');
+      process.exit(0);
+    });
+  });
+}
 async function runSSECloudServer() {
   const transports = {};
   const app = express();

@@ -1277,6 +1325,13 @@ else if (process.env.SSE_LOCAL === 'true') {
    process.exit(1);
  });
 }
+else if (process.env.HTTP_STREAMABLE_SERVER === 'true') {
+  console.log('Running HTTP Streamable Server');
+  runHTTPStreamableServer().catch((error) => {
+    console.error('Fatal error running server:', error);
+    process.exit(1);
+  });
+}
 else {
   runLocalServer().catch((error) => {
     console.error('Fatal error running server:', error);
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "firecrawl-mcp",
-  "version": "1.12.0",
+  "version": "2.0.0",
   "description": "MCP server for Firecrawl web scraping integration. Supports both cloud and self-hosted instances. Features include web scraping, batch processing, structured data extraction, and LLM-powered content analysis.",
   "type": "module",
   "bin": {

@@ -20,15 +20,17 @@
     "lint:fix": "eslint src/**/*.ts --fix",
     "format": "prettier --write .",
     "prepare": "npm run build",
-    "publish": "npm run build && npm publish"
+    "publish": "npm run build && npm publish",
+    "publish-beta": "npm run build && npm publish --tag beta"
   },
   "license": "MIT",
   "dependencies": {
-    "@mendable/firecrawl-js": "^
-    "@modelcontextprotocol/sdk": "^1.
+    "@mendable/firecrawl-js": "^3.0.3",
+    "@modelcontextprotocol/sdk": "^1.17.3",
     "dotenv": "^16.4.7",
     "express": "^5.1.0",
     "shx": "^0.3.4",
+    "typescript": "^5.9.2",
     "ws": "^8.18.1"
   },
   "devDependencies": {

@@ -43,8 +45,7 @@
     "jest": "^29.7.0",
     "jest-mock-extended": "^4.0.0-beta1",
     "prettier": "^3.1.1",
-    "ts-jest": "^29.1.1",
-    "typescript": "^5.3.3"
+    "ts-jest": "^29.1.1"
   },
   "engines": {
     "node": ">=18.0.0"

@@ -58,11 +59,11 @@
   ],
   "repository": {
     "type": "git",
-    "url": "git+https://github.com/
+    "url": "git+https://github.com/firecrawl/firecrawl-mcp-server.git"
   },
   "author": "vrknetha",
   "bugs": {
-    "url": "https://github.com/
+    "url": "https://github.com/firecrawl/firecrawl-mcp-server/issues"
   },
-  "homepage": "https://github.com/
+  "homepage": "https://github.com/firecrawl/firecrawl-mcp-server#readme"
 }
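
The dependency bump to @mendable/firecrawl-js ^3.0.3 is what drives the handler rewrites in dist/index.js above: calls now go through client.scrape, client.map, client.crawl, client.getCrawlStatus, and client.extract. A small sketch of those call shapes as the new handlers use them, assuming a FIRECRAWL_API_KEY in the environment; the URLs, option values, and the crawl ID are placeholders.

// Call shapes mirrored from the updated handlers in dist/index.js (placeholders throughout).
import FirecrawlApp from '@mendable/firecrawl-js';

const client = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// firecrawl_scrape handler: client.scrape(url, options)
const page = await client.scrape('https://example.com', {
  formats: ['markdown'],
  maxAge: 172800000,
});

// firecrawl_map handler: client.map(url, options)
const mapped = await client.map('https://example.com', { sitemap: 'include', limit: 100 });

// firecrawl_crawl handler: client.crawl(url, options)
const crawlResponse = await client.crawl('https://example.com/blog/*', {
  maxDiscoveryDepth: 2,
  limit: 100,
});

// firecrawl_check_crawl_status handler: client.getCrawlStatus(id)
const status = await client.getCrawlStatus('example-crawl-id'); // placeholder ID

console.log(page.markdown?.slice(0, 80), mapped.links?.length, crawlResponse, status.status);
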