crawlforge-mcp-server 3.0.11 → 3.0.13

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/server.js CHANGED
@@ -1,32 +1,15 @@
1
1
  #!/usr/bin/env node
2
2
 
3
- // Secure Creator Mode Authentication - MUST run before any imports
4
- // Only the creator can enable unlimited access with their secret
5
- import crypto from 'crypto';
6
- import dotenv from 'dotenv';
3
+ // Creator Mode Authentication imported from src/core/creatorMode.js
4
+ // This MUST be the first import so the secret is verified before any tool code runs.
5
+ export { isCreatorModeVerified } from './src/core/creatorMode.js';
7
6
 
8
- // Load .env file early to check for creator secret
9
- dotenv.config({ path: '.env', quiet: true });
10
-
11
- const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';
12
-
13
- if (process.env.CRAWLFORGE_CREATOR_SECRET) {
14
- const providedHash = crypto
15
- .createHash('sha256')
16
- .update(process.env.CRAWLFORGE_CREATOR_SECRET)
17
- .digest('hex');
18
-
19
- if (providedHash === CREATOR_SECRET_HASH) {
20
- process.env.CRAWLFORGE_CREATOR_MODE = 'true';
21
- console.log('🔓 Creator Mode Enabled - Unlimited Access');
22
- } else {
23
- console.warn('⚠️ Invalid creator secret provided');
24
- }
25
- }
26
-
27
- // Now import everything else
7
+ // Import everything else
28
8
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
29
9
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
10
+ import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
11
+ import { createServer } from "node:http";
12
+ import { randomUUID } from "node:crypto";
30
13
  import { z } from "zod";
31
14
  import { load } from "cheerio";
32
15
  import { SearchWebTool } from "./src/tools/search/searchWeb.js";
@@ -36,6 +19,8 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
36
19
  import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
37
20
  import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
38
21
  import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
22
+ // Phase 1: LLM-Powered Structured Extraction
23
+ import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
39
24
  // Wave 2 Advanced Tools
40
25
  import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
41
26
  import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
@@ -97,7 +82,43 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
97
82
  }
98
83
 
99
84
  // Create the server
100
- const server = new McpServer({ name: "crawlforge", version: "3.0.10" });
85
+ const server = new McpServer({
86
+ name: "crawlforge",
87
+ version: "3.0.12",
88
+ description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
89
+ homepage: "https://www.crawlforge.dev",
90
+ icon: "https://www.crawlforge.dev/icon.png"
91
+ });
92
+
93
+ // Register getting-started prompt
94
+ server.prompt("getting-started", {
95
+ description: "Get started with CrawlForge MCP - learn available tools and best practices",
96
+ }, async () => {
97
+ return {
98
+ messages: [{
99
+ role: "user",
100
+ content: {
101
+ type: "text",
102
+ text: "You have access to CrawlForge MCP with 20 web scraping tools. Key tools:\n\n" +
103
+ "- fetch_url: Fetch raw HTML/content from any URL\n" +
104
+ "- extract_text: Extract clean text from a webpage\n" +
105
+ "- extract_content: Smart content extraction with readability\n" +
106
+ "- search_web: Search the web and get structured results\n" +
107
+ "- crawl_deep: Crawl a website following links to a specified depth\n" +
108
+ "- map_site: Discover all pages on a website\n" +
109
+ "- batch_scrape: Scrape multiple URLs in parallel\n" +
110
+ "- scrape_with_actions: Automate browser actions then scrape\n" +
111
+ "- deep_research: Multi-source research on any topic\n" +
112
+ "- stealth_mode: Anti-detection browsing for protected sites\n" +
113
+ "- extract_structured: LLM-powered structured data extraction\n" +
114
+ "- track_changes: Monitor website changes over time\n" +
115
+ "- generate_llms_txt: Generate llms.txt for any website\n\n" +
116
+ "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
117
+ "Get your API key at https://www.crawlforge.dev/signup (1,000 free credits)"
118
+ }
119
+ }]
120
+ };
121
+ });
101
122
 
102
123
  // Helper function to wrap tool handlers with authentication and credit tracking
103
124
  function withAuth(toolName, handler) {
@@ -171,6 +192,9 @@ const processDocumentTool = new ProcessDocumentTool();
171
192
  const summarizeContentTool = new SummarizeContentTool();
172
193
  const analyzeContentTool = new AnalyzeContentTool();
173
194
 
195
+ // Phase 1: LLM-Powered Structured Extraction Tool
196
+ const extractStructuredTool = new ExtractStructuredTool();
197
+
174
198
  // Initialize Wave 2 Advanced Tools
175
199
  const batchScrapeTool = new BatchScrapeTool();
176
200
  const scrapeWithActionsTool = new ScrapeWithActionsTool();
@@ -620,10 +644,11 @@ async function fetchWithTimeout(url, options = {}) {
620
644
  // Tool: fetch_url - Basic URL fetching with headers and response handling
621
645
  server.registerTool("fetch_url", {
622
646
  description: "Fetch content from a URL with optional headers and timeout",
647
+ annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
623
648
  inputSchema: {
624
- url: z.string().url(),
625
- headers: z.record(z.string()).optional(),
626
- timeout: z.number().min(1000).max(30000).optional().default(10000)
649
+ url: z.string().url().describe("The URL to fetch content from"),
650
+ headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
651
+ timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
627
652
  }
628
653
  }, withAuth("fetch_url", async ({ url, headers, timeout }) => {
629
654
  try {
@@ -666,10 +691,11 @@ server.registerTool("fetch_url", {
666
691
  // Tool: extract_text - Extract clean text content from HTML
667
692
  server.registerTool("extract_text", {
668
693
  description: "Extract clean text content from a webpage",
694
+ annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
669
695
  inputSchema: {
670
- url: z.string().url(),
671
- remove_scripts: z.boolean().optional().default(true),
672
- remove_styles: z.boolean().optional().default(true)
696
+ url: z.string().url().describe("The URL to extract text from"),
697
+ remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
698
+ remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
673
699
  }
674
700
  }, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
675
701
  try {
@@ -720,10 +746,11 @@ server.registerTool("extract_text", {
720
746
  // Tool: extract_links - Extract all links from a webpage with optional filtering
721
747
  server.registerTool("extract_links", {
722
748
  description: "Extract all links from a webpage with optional filtering",
749
+ annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
723
750
  inputSchema: {
724
- url: z.string().url(),
725
- filter_external: z.boolean().optional().default(false),
726
- base_url: z.string().url().optional()
751
+ url: z.string().url().describe("The URL to extract links from"),
752
+ filter_external: z.boolean().optional().default(false).describe("Only return external links"),
753
+ base_url: z.string().url().optional().describe("Base URL for resolving relative links")
727
754
  }
728
755
  }, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
729
756
  try {
@@ -804,8 +831,9 @@ server.registerTool("extract_links", {
804
831
  // Tool: extract_metadata - Extract page metadata
805
832
  server.registerTool("extract_metadata", {
806
833
  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
834
+ annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
807
835
  inputSchema: {
808
- url: z.string().url()
836
+ url: z.string().url().describe("The URL to extract metadata from")
809
837
  }
810
838
  }, withAuth("extract_metadata", async ({ url }) => {
811
839
  try {
@@ -883,9 +911,10 @@ server.registerTool("extract_metadata", {
883
911
  // Tool: scrape_structured - Extract structured data using CSS selectors
884
912
  server.registerTool("scrape_structured", {
885
913
  description: "Extract structured data from a webpage using CSS selectors",
914
+ annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
886
915
  inputSchema: {
887
- url: z.string().url(),
888
- selectors: z.record(z.string())
916
+ url: z.string().url().describe("The URL to scrape"),
917
+ selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
889
918
  }
890
919
  }, withAuth("scrape_structured", async ({ url, selectors }) => {
891
920
  try {
@@ -946,15 +975,16 @@ server.registerTool("scrape_structured", {
946
975
  // Tool: search_web - Search the web using Google Search via CrawlForge proxy
947
976
  server.registerTool("search_web", {
948
977
  description: "Search the web using Google Search API (proxied through CrawlForge)",
978
+ annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
949
979
  inputSchema: {
950
- query: z.string(),
951
- limit: z.number().min(1).max(100).optional(),
952
- offset: z.number().min(0).optional(),
953
- lang: z.string().optional(),
954
- safe_search: z.boolean().optional(),
955
- time_range: z.enum(["day", "week", "month", "year", "all"]).optional(),
956
- site: z.string().optional(),
957
- file_type: z.string().optional()
980
+ query: z.string().describe("Search query string"),
981
+ limit: z.number().min(1).max(100).optional().describe("Maximum number of results to return"),
982
+ offset: z.number().min(0).optional().describe("Number of results to skip for pagination"),
983
+ lang: z.string().optional().describe("Language code for results (e.g. 'en', 'fr')"),
984
+ safe_search: z.boolean().optional().describe("Enable safe search filtering"),
985
+ time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
986
+ site: z.string().optional().describe("Limit results to a specific domain"),
987
+ file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
958
988
  }
959
989
  }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
960
990
  try {
@@ -989,16 +1019,17 @@ server.registerTool("search_web", {
989
1019
  // Tool: crawl_deep - Deep crawl websites with BFS algorithm
990
1020
  server.registerTool("crawl_deep", {
991
1021
  description: "Crawl websites deeply using breadth-first search",
1022
+ annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
992
1023
  inputSchema: {
993
- url: z.string().url(),
994
- max_depth: z.number().min(1).max(5).optional(),
995
- max_pages: z.number().min(1).max(1000).optional(),
996
- include_patterns: z.array(z.string()).optional(),
997
- exclude_patterns: z.array(z.string()).optional(),
998
- follow_external: z.boolean().optional(),
999
- respect_robots: z.boolean().optional(),
1000
- extract_content: z.boolean().optional(),
1001
- concurrency: z.number().min(1).max(20).optional()
1024
+ url: z.string().url().describe("Starting URL for the crawl"),
1025
+ max_depth: z.number().min(1).max(5).optional().describe("Maximum crawl depth from starting URL"),
1026
+ max_pages: z.number().min(1).max(1000).optional().describe("Maximum number of pages to crawl"),
1027
+ include_patterns: z.array(z.string()).optional().describe("URL patterns to include (regex)"),
1028
+ exclude_patterns: z.array(z.string()).optional().describe("URL patterns to exclude (regex)"),
1029
+ follow_external: z.boolean().optional().describe("Follow links to external domains"),
1030
+ respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
1031
+ extract_content: z.boolean().optional().describe("Extract page content during crawl"),
1032
+ concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
1002
1033
  }
1003
1034
  }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
1004
1035
  try {
@@ -1033,12 +1064,13 @@ server.registerTool("crawl_deep", {
1033
1064
  // Tool: map_site - Discover and map website structure
1034
1065
  server.registerTool("map_site", {
1035
1066
  description: "Discover and map website structure",
1067
+ annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1036
1068
  inputSchema: {
1037
- url: z.string().url(),
1038
- include_sitemap: z.boolean().optional(),
1039
- max_urls: z.number().min(1).max(10000).optional(),
1040
- group_by_path: z.boolean().optional(),
1041
- include_metadata: z.boolean().optional()
1069
+ url: z.string().url().describe("The website URL to map"),
1070
+ include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
1071
+ max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
1072
+ group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
1073
+ include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
1042
1074
  }
1043
1075
  }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
1044
1076
  try {
@@ -1075,9 +1107,10 @@ server.registerTool("map_site", {
1075
1107
  // Tool: extract_content - Enhanced content extraction with readability detection
1076
1108
  server.registerTool("extract_content", {
1077
1109
  description: "Extract and analyze main content from web pages with enhanced readability detection",
1110
+ annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1078
1111
  inputSchema: {
1079
- url: z.string().url(),
1080
- options: z.object({}).optional()
1112
+ url: z.string().url().describe("The URL to extract content from"),
1113
+ options: z.object({}).optional().describe("Additional extraction options")
1081
1114
  }
1082
1115
  }, withAuth("extract_content", async ({ url, options }) => {
1083
1116
  try {
@@ -1112,10 +1145,11 @@ server.registerTool("extract_content", {
1112
1145
  // Tool: process_document - Multi-format document processing
1113
1146
  server.registerTool("process_document", {
1114
1147
  description: "Process documents from multiple sources and formats including PDFs and web pages",
1148
+ annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1115
1149
  inputSchema: {
1116
- source: z.string(),
1117
- sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
1118
- options: z.object({}).optional()
1150
+ source: z.string().describe("Document source - URL or file path"),
1151
+ sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
1152
+ options: z.object({}).optional().describe("Additional processing options")
1119
1153
  }
1120
1154
  }, withAuth("process_document", async ({ source, sourceType, options }) => {
1121
1155
  try {
@@ -1150,9 +1184,10 @@ server.registerTool("process_document", {
1150
1184
  // Tool: summarize_content - Intelligent content summarization
1151
1185
  server.registerTool("summarize_content", {
1152
1186
  description: "Generate intelligent summaries of text content with configurable options",
1187
+ annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
1153
1188
  inputSchema: {
1154
- text: z.string(),
1155
- options: z.object({}).optional()
1189
+ text: z.string().describe("The text content to summarize"),
1190
+ options: z.object({}).optional().describe("Summarization options")
1156
1191
  }
1157
1192
  }, withAuth("summarize_content", async ({ text, options }) => {
1158
1193
  try {
@@ -1187,9 +1222,10 @@ server.registerTool("summarize_content", {
1187
1222
  // Tool: analyze_content - Comprehensive content analysis
1188
1223
  server.registerTool("analyze_content", {
1189
1224
  description: "Perform comprehensive content analysis including language detection and topic extraction",
1225
+ annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
1190
1226
  inputSchema: {
1191
- text: z.string(),
1192
- options: z.object({}).optional()
1227
+ text: z.string().describe("The text content to analyze"),
1228
+ options: z.object({}).optional().describe("Analysis options")
1193
1229
  }
1194
1230
  }, withAuth("analyze_content", async ({ text, options }) => {
1195
1231
  try {
@@ -1222,11 +1258,62 @@ server.registerTool("analyze_content", {
1222
1258
  }));
1223
1259
 
1224
1260
 
1261
+
1262
+ // Phase 1: LLM-Powered Structured Extraction
1263
+
1264
+ // Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
1265
+ server.registerTool("extract_structured", {
1266
+ description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
1267
+ annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1268
+ inputSchema: {
1269
+ url: z.string().url().describe("The URL to extract structured data from"),
1270
+ schema: z.object({
1271
+ type: z.string().optional(),
1272
+ properties: z.record(z.any()),
1273
+ required: z.array(z.string()).optional()
1274
+ }).describe("JSON schema defining the data structure to extract"),
1275
+ prompt: z.string().optional().describe("Natural language instructions for extraction"),
1276
+ llmConfig: z.object({
1277
+ provider: z.string().optional(),
1278
+ apiKey: z.string().optional()
1279
+ }).optional().describe("LLM provider configuration for AI-powered extraction"),
1280
+ fallbackToSelectors: z.boolean().optional().default(true).describe("Fall back to CSS selector extraction if LLM is unavailable"),
1281
+ selectorHints: z.record(z.string()).optional().describe("CSS selector hints to guide extraction")
1282
+ }
1283
+ }, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
1284
+ try {
1285
+ const result = await extractStructuredTool.execute({
1286
+ url,
1287
+ schema,
1288
+ prompt,
1289
+ llmConfig,
1290
+ fallbackToSelectors,
1291
+ selectorHints
1292
+ });
1293
+ return {
1294
+ content: [{
1295
+ type: "text",
1296
+ text: JSON.stringify(result, null, 2)
1297
+ }]
1298
+ };
1299
+ } catch (error) {
1300
+ return {
1301
+ content: [{
1302
+ type: "text",
1303
+ text: `Structured extraction failed: ${error.message}`
1304
+ }],
1305
+ isError: true
1306
+ };
1307
+ }
1308
+ }));
1309
+
1310
+
1225
1311
  // Wave 2 Advanced Tools
1226
1312
 
1227
1313
  // Tool: batch_scrape - Process multiple URLs simultaneously with job management
1228
1314
  server.registerTool("batch_scrape", {
1229
1315
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
1316
+ annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1230
1317
  inputSchema: {
1231
1318
  urls: z.array(z.union([
1232
1319
  z.string().url(),
@@ -1237,27 +1324,27 @@ server.registerTool("batch_scrape", {
1237
1324
  timeout: z.number().min(1000).max(30000).optional(),
1238
1325
  metadata: z.record(z.any()).optional()
1239
1326
  })
1240
- ])).min(1).max(50),
1241
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
1242
- mode: z.enum(['sync', 'async']).default('sync'),
1327
+ ])).min(1).max(50).describe("Array of URLs or URL objects to scrape"),
1328
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']).describe("Output formats for scraped content"),
1329
+ mode: z.enum(['sync', 'async']).default('sync').describe("Processing mode: sync (wait) or async (background)"),
1243
1330
  webhook: z.object({
1244
1331
  url: z.string().url(),
1245
1332
  events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
1246
1333
  headers: z.record(z.string()).optional(),
1247
1334
  signingSecret: z.string().optional()
1248
- }).optional(),
1249
- extractionSchema: z.record(z.string()).optional(),
1250
- maxConcurrency: z.number().min(1).max(20).default(10),
1251
- delayBetweenRequests: z.number().min(0).max(10000).default(100),
1252
- includeMetadata: z.boolean().default(true),
1253
- includeFailed: z.boolean().default(true),
1254
- pageSize: z.number().min(1).max(100).default(25),
1335
+ }).optional().describe("Webhook configuration for async job notifications"),
1336
+ extractionSchema: z.record(z.string()).optional().describe("Schema for structured data extraction from each URL"),
1337
+ maxConcurrency: z.number().min(1).max(20).default(10).describe("Maximum concurrent scraping requests"),
1338
+ delayBetweenRequests: z.number().min(0).max(10000).default(100).describe("Delay in milliseconds between requests"),
1339
+ includeMetadata: z.boolean().default(true).describe("Include page metadata in results"),
1340
+ includeFailed: z.boolean().default(true).describe("Include failed URLs in results"),
1341
+ pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page"),
1255
1342
  jobOptions: z.object({
1256
1343
  priority: z.number().default(0),
1257
1344
  ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
1258
1345
  maxRetries: z.number().min(0).max(5).default(1),
1259
1346
  tags: z.array(z.string()).default([])
1260
- }).optional()
1347
+ }).optional().describe("Job management options for async processing")
1261
1348
  }
1262
1349
  }, withAuth("batch_scrape", async (params) => {
1263
1350
  try {
@@ -1282,8 +1369,9 @@ server.registerTool("batch_scrape", {
1282
1369
  // Tool: scrape_with_actions - Execute action chains before scraping
1283
1370
  server.registerTool("scrape_with_actions", {
1284
1371
  description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
1372
+ annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1285
1373
  inputSchema: {
1286
- url: z.string().url(),
1374
+ url: z.string().url().describe("The URL to scrape"),
1287
1375
  actions: z.array(z.object({
1288
1376
  type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
1289
1377
  selector: z.string().optional(),
@@ -1294,10 +1382,10 @@ server.registerTool("scrape_with_actions", {
1294
1382
  description: z.string().optional(),
1295
1383
  continueOnError: z.boolean().default(false),
1296
1384
  retries: z.number().min(0).max(5).default(0)
1297
- })).min(1).max(20),
1298
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
1299
- captureIntermediateStates: z.boolean().default(false),
1300
- captureScreenshots: z.boolean().default(true),
1385
+ })).min(1).max(20).describe("Browser actions to perform before scraping"),
1386
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
1387
+ captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
1388
+ captureScreenshots: z.boolean().default(true).describe("Take screenshots during action execution"),
1301
1389
  formAutoFill: z.object({
1302
1390
  fields: z.array(z.object({
1303
1391
  selector: z.string(),
@@ -1307,23 +1395,23 @@ server.registerTool("scrape_with_actions", {
1307
1395
  })),
1308
1396
  submitSelector: z.string().optional(),
1309
1397
  waitAfterSubmit: z.number().min(0).max(30000).default(2000)
1310
- }).optional(),
1398
+ }).optional().describe("Form auto-fill configuration"),
1311
1399
  browserOptions: z.object({
1312
1400
  headless: z.boolean().default(true),
1313
1401
  userAgent: z.string().optional(),
1314
1402
  viewportWidth: z.number().min(800).max(1920).default(1280),
1315
1403
  viewportHeight: z.number().min(600).max(1080).default(720),
1316
1404
  timeout: z.number().min(10000).max(120000).default(30000)
1317
- }).optional(),
1405
+ }).optional().describe("Browser configuration options"),
1318
1406
  extractionOptions: z.object({
1319
1407
  selectors: z.record(z.string()).optional(),
1320
1408
  includeMetadata: z.boolean().default(true),
1321
1409
  includeLinks: z.boolean().default(true),
1322
1410
  includeImages: z.boolean().default(true)
1323
- }).optional(),
1324
- continueOnActionError: z.boolean().default(false),
1325
- maxRetries: z.number().min(0).max(3).default(1),
1326
- screenshotOnError: z.boolean().default(true)
1411
+ }).optional().describe("Content extraction options"),
1412
+ continueOnActionError: z.boolean().default(false).describe("Continue executing actions if one fails"),
1413
+ maxRetries: z.number().min(0).max(3).default(1).describe("Maximum retry attempts on failure"),
1414
+ screenshotOnError: z.boolean().default(true).describe("Capture screenshot when an error occurs")
1327
1415
  }
1328
1416
  }, withAuth("scrape_with_actions", async (params) => {
1329
1417
  try {
@@ -1348,27 +1436,28 @@ server.registerTool("scrape_with_actions", {
1348
1436
  // Tool: deep_research - Comprehensive multi-stage research with source verification
1349
1437
  server.registerTool("deep_research", {
1350
1438
  description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
1439
+ annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1351
1440
  inputSchema: {
1352
- topic: z.string().min(3).max(500),
1353
- maxDepth: z.number().min(1).max(10).optional().default(5),
1354
- maxUrls: z.number().min(1).max(1000).optional().default(50),
1355
- timeLimit: z.number().min(30000).max(300000).optional().default(120000),
1356
- researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
1357
- sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
1358
- credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
1359
- includeRecentOnly: z.boolean().optional().default(false),
1360
- enableConflictDetection: z.boolean().optional().default(true),
1361
- enableSourceVerification: z.boolean().optional().default(true),
1362
- enableSynthesis: z.boolean().optional().default(true),
1363
- outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
1364
- includeRawData: z.boolean().optional().default(false),
1365
- includeActivityLog: z.boolean().optional().default(false),
1441
+ topic: z.string().min(3).max(500).describe("Research topic or question"),
1442
+ maxDepth: z.number().min(1).max(10).optional().default(5).describe("Maximum research depth"),
1443
+ maxUrls: z.number().min(1).max(1000).optional().default(50).describe("Maximum URLs to analyze"),
1444
+ timeLimit: z.number().min(30000).max(300000).optional().default(120000).describe("Time limit in milliseconds for the research"),
1445
+ researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad').describe("Research methodology approach"),
1446
+ sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']).describe("Types of sources to include"),
1447
+ credibilityThreshold: z.number().min(0).max(1).optional().default(0.3).describe("Minimum credibility score for sources (0-1)"),
1448
+ includeRecentOnly: z.boolean().optional().default(false).describe("Only include recent sources"),
1449
+ enableConflictDetection: z.boolean().optional().default(true).describe("Detect conflicting information across sources"),
1450
+ enableSourceVerification: z.boolean().optional().default(true).describe("Verify source credibility"),
1451
+ enableSynthesis: z.boolean().optional().default(true).describe("Synthesize findings into a coherent report"),
1452
+ outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive').describe("Output format for the research report"),
1453
+ includeRawData: z.boolean().optional().default(false).describe("Include raw scraped data in output"),
1454
+ includeActivityLog: z.boolean().optional().default(false).describe("Include detailed activity log"),
1366
1455
  queryExpansion: z.object({
1367
1456
  enableSynonyms: z.boolean().optional().default(true),
1368
1457
  enableSpellCheck: z.boolean().optional().default(true),
1369
1458
  enableContextual: z.boolean().optional().default(true),
1370
1459
  maxVariations: z.number().min(1).max(20).optional().default(8)
1371
- }).optional(),
1460
+ }).optional().describe("Query expansion settings for broader search coverage"),
1372
1461
  llmConfig: z.object({
1373
1462
  provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
1374
1463
  openai: z.object({
@@ -1382,14 +1471,14 @@ server.registerTool("deep_research", {
1382
1471
  }).optional(),
1383
1472
  enableSemanticAnalysis: z.boolean().optional().default(true),
1384
1473
  enableIntelligentSynthesis: z.boolean().optional().default(true)
1385
- }).optional(),
1386
- concurrency: z.number().min(1).max(20).optional().default(5),
1387
- cacheResults: z.boolean().optional().default(true),
1474
+ }).optional().describe("LLM provider configuration for AI-powered analysis"),
1475
+ concurrency: z.number().min(1).max(20).optional().default(5).describe("Number of concurrent research requests"),
1476
+ cacheResults: z.boolean().optional().default(true).describe("Cache research results for reuse"),
1388
1477
  webhook: z.object({
1389
1478
  url: z.string().url(),
1390
1479
  events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
1391
1480
  headers: z.record(z.string()).optional()
1392
- }).optional()
1481
+ }).optional().describe("Webhook for progress and completion notifications")
1393
1482
  }
1394
1483
  }, withAuth("deep_research", async (params) => {
1395
1484
  try {
@@ -1414,13 +1503,14 @@ server.registerTool("deep_research", {
1414
1503
  // Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
1415
1504
  server.registerTool("track_changes", {
1416
1505
  description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
1506
+ annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1417
1507
  inputSchema: {
1418
- url: z.string().url(),
1508
+ url: z.string().url().describe("The URL to track changes for"),
1419
1509
  operation: z.enum([
1420
- 'create_baseline',
1421
- 'compare',
1422
- 'monitor',
1423
- 'get_history',
1510
+ 'create_baseline',
1511
+ 'compare',
1512
+ 'monitor',
1513
+ 'get_history',
1424
1514
  'get_stats',
1425
1515
  'create_scheduled_monitor',
1426
1516
  'stop_scheduled_monitor',
@@ -1429,9 +1519,9 @@ server.registerTool("track_changes", {
1429
1519
  'create_alert_rule',
1430
1520
  'generate_trend_report',
1431
1521
  'get_monitoring_templates'
1432
- ]).default('compare'),
1433
- content: z.string().optional(),
1434
- html: z.string().optional(),
1522
+ ]).default('compare').describe("Tracking operation to perform"),
1523
+ content: z.string().optional().describe("Content to compare against baseline"),
1524
+ html: z.string().optional().describe("HTML content to compare against baseline"),
1435
1525
  trackingOptions: z.object({
1436
1526
  granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
1437
1527
  trackText: z.boolean().default(true),
@@ -1448,7 +1538,7 @@ server.registerTool("track_changes", {
1448
1538
  moderate: z.number().min(0).max(1).default(0.3),
1449
1539
  major: z.number().min(0).max(1).default(0.7)
1450
1540
  }).optional()
1451
- }).optional(),
1541
+ }).optional().describe("Options for how changes are tracked and compared"),
1452
1542
  monitoringOptions: z.object({
1453
1543
  enabled: z.boolean().default(false),
1454
1544
  interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
@@ -1458,14 +1548,14 @@ server.registerTool("track_changes", {
1458
1548
  enableWebhook: z.boolean().default(false),
1459
1549
  webhookUrl: z.string().url().optional(),
1460
1550
  webhookSecret: z.string().optional()
1461
- }).optional(),
1551
+ }).optional().describe("Monitoring schedule and notification settings"),
1462
1552
  storageOptions: z.object({
1463
1553
  enableSnapshots: z.boolean().default(true),
1464
1554
  retainHistory: z.boolean().default(true),
1465
1555
  maxHistoryEntries: z.number().min(1).max(1000).default(100),
1466
1556
  compressionEnabled: z.boolean().default(true),
1467
1557
  deltaStorageEnabled: z.boolean().default(true)
1468
- }).optional(),
1558
+ }).optional().describe("Storage and history retention settings"),
1469
1559
  queryOptions: z.object({
1470
1560
  limit: z.number().min(1).max(500).default(50),
1471
1561
  offset: z.number().min(0).default(0),
@@ -1473,7 +1563,7 @@ server.registerTool("track_changes", {
1473
1563
  endTime: z.number().optional(),
1474
1564
  includeContent: z.boolean().default(false),
1475
1565
  significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
1476
- }).optional(),
1566
+ }).optional().describe("Query options for history and stats retrieval"),
1477
1567
  notificationOptions: z.object({
1478
1568
  webhook: z.object({
1479
1569
  enabled: z.boolean().default(false),
@@ -1489,32 +1579,32 @@ server.registerTool("track_changes", {
1489
1579
  channel: z.string().optional(),
1490
1580
  username: z.string().optional()
1491
1581
  }).optional()
1492
- }).optional(),
1582
+ }).optional().describe("Notification configuration for webhooks and Slack"),
1493
1583
  // Enhanced Phase 2.4 options
1494
1584
  scheduledMonitorOptions: z.object({
1495
1585
  schedule: z.string().optional(), // Cron expression
1496
1586
  templateId: z.string().optional(), // Monitoring template ID
1497
1587
  enabled: z.boolean().default(true)
1498
- }).optional(),
1588
+ }).optional().describe("Scheduled monitoring options with cron expressions"),
1499
1589
  alertRuleOptions: z.object({
1500
1590
  ruleId: z.string().optional(),
1501
1591
  condition: z.string().optional(), // Condition description
1502
1592
  actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
1503
1593
  throttle: z.number().min(0).optional(),
1504
1594
  priority: z.enum(['low', 'medium', 'high']).optional()
1505
- }).optional(),
1595
+ }).optional().describe("Alert rule configuration for change notifications"),
1506
1596
  exportOptions: z.object({
1507
1597
  format: z.enum(['json', 'csv']).default('json'),
1508
1598
  startTime: z.number().optional(),
1509
1599
  endTime: z.number().optional(),
1510
1600
  includeContent: z.boolean().default(false),
1511
1601
  includeSnapshots: z.boolean().default(false)
1512
- }).optional(),
1602
+ }).optional().describe("Export options for change history data"),
1513
1603
  dashboardOptions: z.object({
1514
1604
  includeRecentAlerts: z.boolean().default(true),
1515
1605
  includeTrends: z.boolean().default(true),
1516
1606
  includeMonitorStatus: z.boolean().default(true)
1517
- }).optional()
1607
+ }).optional().describe("Dashboard display options")
1518
1608
  }
1519
1609
  }, withAuth("track_changes", async (params) => {
1520
1610
  try {
@@ -1539,8 +1629,9 @@ server.registerTool("track_changes", {
1539
1629
  // Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
1540
1630
  server.registerTool("generate_llms_txt", {
1541
1631
  description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
1632
+ annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1542
1633
  inputSchema: {
1543
- url: z.string().url(),
1634
+ url: z.string().url().describe("The website URL to generate llms.txt for"),
1544
1635
  analysisOptions: z.object({
1545
1636
  maxDepth: z.number().min(1).max(5).optional().default(3),
1546
1637
  maxPages: z.number().min(10).max(500).optional().default(100),
@@ -1548,7 +1639,7 @@ server.registerTool("generate_llms_txt", {
1548
1639
  analyzeContent: z.boolean().optional().default(true),
1549
1640
  checkSecurity: z.boolean().optional().default(true),
1550
1641
  respectRobots: z.boolean().optional().default(true)
1551
- }).optional(),
1642
+ }).optional().describe("Website analysis options for depth, scope, and detection"),
1552
1643
  outputOptions: z.object({
1553
1644
  includeDetailed: z.boolean().optional().default(true),
1554
1645
  includeAnalysis: z.boolean().optional().default(false),
@@ -1556,9 +1647,9 @@ server.registerTool("generate_llms_txt", {
1556
1647
  organizationName: z.string().optional(),
1557
1648
  customGuidelines: z.array(z.string()).optional(),
1558
1649
  customRestrictions: z.array(z.string()).optional()
1559
- }).optional(),
1560
- complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
1561
- format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
1650
+ }).optional().describe("Output customization and organization details"),
1651
+ complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe("Compliance level for generated guidelines"),
1652
+ format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe("Output format: llms.txt, llms-full.txt, or both")
1562
1653
  }
1563
1654
  }, withAuth("generate_llms_txt", async (params) => {
1564
1655
  try {
@@ -1583,8 +1674,9 @@ server.registerTool("generate_llms_txt", {
1583
1674
  // Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
1584
1675
  server.registerTool("stealth_mode", {
1585
1676
  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
1677
+ annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1586
1678
  inputSchema: {
1587
- operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
1679
+ operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
1588
1680
  stealthConfig: z.object({
1589
1681
  level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
1590
1682
  randomizeFingerprint: z.boolean().default(true),
@@ -1622,9 +1714,9 @@ server.registerTool("stealth_mode", {
1622
1714
  fontSpoofing: z.boolean().default(true),
1623
1715
  hardwareSpoofing: z.boolean().default(true)
1624
1716
  }).optional()
1625
- }).optional(),
1626
- contextId: z.string().optional(),
1627
- urlToTest: z.string().url().optional()
1717
+ }).optional().describe("Stealth browser configuration with anti-detection settings"),
1718
+ contextId: z.string().optional().describe("Browser context ID for page operations"),
1719
+ urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
1628
1720
  }
1629
1721
  }, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
1630
1722
  try {
@@ -1704,20 +1796,21 @@ server.registerTool("stealth_mode", {
1704
1796
  // Tool: localization - Multi-language and geo-location management (Wave 3)
1705
1797
  server.registerTool("localization", {
1706
1798
  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
1799
+ annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1707
1800
  inputSchema: {
1708
- operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
1709
- countryCode: z.string().length(2).optional(),
1710
- language: z.string().optional(),
1711
- timezone: z.string().optional(),
1712
- currency: z.string().length(3).optional(),
1713
- customHeaders: z.record(z.string()).optional(),
1714
- userAgent: z.string().optional(),
1715
- acceptLanguage: z.string().optional(),
1801
+ operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
1802
+ countryCode: z.string().length(2).optional().describe("ISO 3166-1 alpha-2 country code"),
1803
+ language: z.string().optional().describe("Language code (e.g. 'en', 'fr', 'de')"),
1804
+ timezone: z.string().optional().describe("IANA timezone identifier (e.g. 'America/New_York')"),
1805
+ currency: z.string().length(3).optional().describe("ISO 4217 currency code (e.g. 'USD', 'EUR')"),
1806
+ customHeaders: z.record(z.string()).optional().describe("Custom HTTP headers for localized requests"),
1807
+ userAgent: z.string().optional().describe("Custom user agent string"),
1808
+ acceptLanguage: z.string().optional().describe("Accept-Language header value"),
1716
1809
  geoLocation: z.object({
1717
1810
  latitude: z.number().min(-90).max(90),
1718
1811
  longitude: z.number().min(-180).max(180),
1719
1812
  accuracy: z.number().min(1).max(100).optional()
1720
- }).optional(),
1813
+ }).optional().describe("GPS coordinates for geolocation emulation"),
1721
1814
  proxySettings: z.object({
1722
1815
  enabled: z.boolean().default(false),
1723
1816
  region: z.string().optional(),
@@ -1736,26 +1829,26 @@ server.registerTool("localization", {
1736
1829
  maxRetries: z.number().default(3),
1737
1830
  timeout: z.number().default(10000)
1738
1831
  }).optional()
1739
- }).optional(),
1832
+ }).optional().describe("Proxy configuration for geo-targeted requests"),
1740
1833
  searchParams: z.object({
1741
1834
  query: z.string().optional(),
1742
1835
  limit: z.number().optional(),
1743
1836
  offset: z.number().optional(),
1744
1837
  headers: z.record(z.string()).optional()
1745
- }).optional(),
1838
+ }).optional().describe("Search parameters for localized search queries"),
1746
1839
  browserOptions: z.object({
1747
1840
  locale: z.string().optional(),
1748
1841
  timezoneId: z.string().optional(),
1749
1842
  extraHTTPHeaders: z.record(z.string()).optional(),
1750
1843
  userAgent: z.string().optional()
1751
- }).optional(),
1752
- content: z.string().optional(),
1753
- url: z.string().url().optional(),
1844
+ }).optional().describe("Browser context options for locale emulation"),
1845
+ content: z.string().optional().describe("Content for auto-detection of language and locale"),
1846
+ url: z.string().url().optional().describe("URL for geo-blocking detection or auto-detection"),
1754
1847
  response: z.object({
1755
1848
  status: z.number(),
1756
1849
  body: z.string().optional(),
1757
1850
  statusText: z.string().optional()
1758
- }).optional()
1851
+ }).optional().describe("HTTP response for geo-blocking analysis")
1759
1852
  }
1760
1853
  }, withAuth("localization", async (params) => {
1761
1854
  try {
@@ -1837,11 +1930,92 @@ server.registerTool("localization", {
1837
1930
  }
1838
1931
  }));
1839
1932
 
1840
- // Set up the stdio transport and start the server
1933
+ // Determine transport mode: HTTP if --http flag or MCP_HTTP env var is set
1934
+ const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
1935
+
1936
+ // Set up transport and start the server
1841
1937
  async function runServer() {
1842
- const transport = new StdioServerTransport();
1843
- await server.connect(transport);
1844
- console.error("CrawlForge MCP Server v3.0 running on stdio");
1938
+ if (useHttp) {
1939
+ const port = parseInt(process.env.PORT || '3000', 10);
1940
+
1941
+ // Stateless transport — no session tracking, each request is independent
1942
+ // This avoids the bug where server.connect(newTransport) kills previous sessions
1943
+ const transport = new StreamableHTTPServerTransport({
1944
+ sessionIdGenerator: undefined,
1945
+ });
1946
+ await server.connect(transport);
1947
+
1948
+ const httpServer = createServer(async (req, res) => {
1949
+ // CORS headers for Smithery gateway
1950
+ res.setHeader('Access-Control-Allow-Origin', '*');
1951
+ res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
1952
+ res.setHeader('Access-Control-Allow-Headers', 'Content-Type, mcp-session-id');
1953
+ res.setHeader('Access-Control-Expose-Headers', 'mcp-session-id');
1954
+
1955
+ if (req.method === 'OPTIONS') {
1956
+ res.writeHead(204);
1957
+ res.end();
1958
+ return;
1959
+ }
1960
+
1961
+ // Health check endpoint
1962
+ if (req.url === '/health') {
1963
+ res.writeHead(200, { 'Content-Type': 'application/json' });
1964
+ res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
1965
+ return;
1966
+ }
1967
+
1968
+ // MCP server card for Smithery discovery
1969
+ if (req.url === '/.well-known/mcp/server-card.json') {
1970
+ res.writeHead(200, { 'Content-Type': 'application/json' });
1971
+ res.end(JSON.stringify({
1972
+ serverInfo: {
1973
+ name: "crawlforge",
1974
+ version: "3.0.12",
1975
+ description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
1976
+ homepage: "https://www.crawlforge.dev",
1977
+ icon: "https://www.crawlforge.dev/icon.png"
1978
+ },
1979
+ transport: {
1980
+ type: "streamable-http",
1981
+ url: "/mcp"
1982
+ },
1983
+ configSchema: {
1984
+ type: "object",
1985
+ properties: {
1986
+ apiKey: {
1987
+ type: "string",
1988
+ title: "CrawlForge API Key",
1989
+ description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
1990
+ "x-from": { header: "x-api-key" }
1991
+ }
1992
+ },
1993
+ required: ["apiKey"]
1994
+ }
1995
+ }));
1996
+ return;
1997
+ }
1998
+
1999
+ // Route /mcp to the transport handler
2000
+ if (req.url === '/mcp' || req.url === '/') {
2001
+ await transport.handleRequest(req, res);
2002
+ return;
2003
+ }
2004
+
2005
+ res.writeHead(404);
2006
+ res.end('Not Found');
2007
+ });
2008
+
2009
+ httpServer.listen(port, () => {
2010
+ console.error(`CrawlForge MCP Server v3.0 running on HTTP port ${port}`);
2011
+ console.error(`MCP endpoint: http://localhost:${port}/mcp`);
2012
+ console.error(`Health check: http://localhost:${port}/health`);
2013
+ });
2014
+ } else {
2015
+ const transport = new StdioServerTransport();
2016
+ await server.connect(transport);
2017
+ console.error("CrawlForge MCP Server v3.0 running on stdio");
2018
+ }
1845
2019
  console.error(`Environment: ${config.server.nodeEnv}`);
1846
2020
 
1847
2021
  console.error("Search enabled: true (via CrawlForge proxy)");
@@ -1854,7 +2028,8 @@ async function runServer() {
1854
2028
  const trackingTools = ", track_changes";
1855
2029
  const llmsTxtTools = ", generate_llms_txt";
1856
2030
  const wave3Tools = ", stealth_mode, localization";
1857
- console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}`);
2031
+ const phase1Tools = ", extract_structured";
2032
+ console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
1858
2033
 
1859
2034
 
1860
2035
  // === MEMORY LEAK PREVENTION ===
@@ -1880,7 +2055,8 @@ async function gracefulShutdown(signal) {
1880
2055
  trackChangesTool,
1881
2056
  generateLLMsTxtTool,
1882
2057
  stealthBrowserManager,
1883
- localizationManager
2058
+ localizationManager,
2059
+ extractStructuredTool
1884
2060
  ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
1885
2061
 
1886
2062
  console.error(`Cleaning up ${toolsToCleanup.length} tools...`);