crawlforge-mcp-server 3.0.12 → 3.0.13

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/server.js CHANGED
@@ -1,45 +1,15 @@
1
1
  #!/usr/bin/env node
2
2
 
3
- // Secure Creator Mode Authentication - MUST run before any imports
4
- // Only the creator can enable unlimited access with their secret
5
- import crypto from 'crypto';
6
- import dotenv from 'dotenv';
3
+ // Creator Mode Authentication imported from src/core/creatorMode.js
4
+ // This MUST be the first import so the secret is verified before any tool code runs.
5
+ export { isCreatorModeVerified } from './src/core/creatorMode.js';
7
6
 
8
- // Load .env file early to check for creator secret
9
- dotenv.config({ path: '.env', quiet: true });
10
-
11
- // SECURITY: Clear any externally-set creator mode env var to prevent bypass
12
- delete process.env.CRAWLFORGE_CREATOR_MODE;
13
-
14
- const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';
15
-
16
- // Module-scoped flag - cannot be set externally
17
- let _creatorModeVerified = false;
18
-
19
- if (process.env.CRAWLFORGE_CREATOR_SECRET) {
20
- const providedHash = crypto
21
- .createHash('sha256')
22
- .update(process.env.CRAWLFORGE_CREATOR_SECRET)
23
- .digest('hex');
24
-
25
- if (crypto.timingSafeEqual(Buffer.from(providedHash, 'hex'), Buffer.from(CREATOR_SECRET_HASH, 'hex'))) {
26
- _creatorModeVerified = true;
27
- console.log('🔓 Creator Mode Enabled - Unlimited Access');
28
- } else {
29
- console.warn('⚠️ Invalid creator secret provided');
30
- }
31
- // Clean up the secret from environment
32
- delete process.env.CRAWLFORGE_CREATOR_SECRET;
33
- }
34
-
35
- // Export getter for AuthManager to use
36
- export function isCreatorModeVerified() {
37
- return _creatorModeVerified;
38
- }
39
-
40
- // Now import everything else
7
+ // Import everything else
41
8
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
42
9
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
10
+ import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
11
+ import { createServer } from "node:http";
12
+ import { randomUUID } from "node:crypto";
43
13
  import { z } from "zod";
44
14
  import { load } from "cheerio";
45
15
  import { SearchWebTool } from "./src/tools/search/searchWeb.js";
@@ -49,6 +19,8 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
49
19
  import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
50
20
  import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
51
21
  import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
22
+ // Phase 1: LLM-Powered Structured Extraction
23
+ import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
52
24
  // Wave 2 Advanced Tools
53
25
  import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
54
26
  import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
@@ -110,7 +82,43 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
110
82
  }
111
83
 
112
84
  // Create the server
113
- const server = new McpServer({ name: "crawlforge", version: "3.0.10" });
85
+ const server = new McpServer({
86
+ name: "crawlforge",
87
+ version: "3.0.12",
88
+ description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
89
+ homepage: "https://www.crawlforge.dev",
90
+ icon: "https://www.crawlforge.dev/icon.png"
91
+ });
92
+
93
+ // Register getting-started prompt
94
+ server.prompt("getting-started", {
95
+ description: "Get started with CrawlForge MCP - learn available tools and best practices",
96
+ }, async () => {
97
+ return {
98
+ messages: [{
99
+ role: "user",
100
+ content: {
101
+ type: "text",
102
+ text: "You have access to CrawlForge MCP with 20 web scraping tools. Key tools:\n\n" +
103
+ "- fetch_url: Fetch raw HTML/content from any URL\n" +
104
+ "- extract_text: Extract clean text from a webpage\n" +
105
+ "- extract_content: Smart content extraction with readability\n" +
106
+ "- search_web: Search the web and get structured results\n" +
107
+ "- crawl_deep: Crawl a website following links to a specified depth\n" +
108
+ "- map_site: Discover all pages on a website\n" +
109
+ "- batch_scrape: Scrape multiple URLs in parallel\n" +
110
+ "- scrape_with_actions: Automate browser actions then scrape\n" +
111
+ "- deep_research: Multi-source research on any topic\n" +
112
+ "- stealth_mode: Anti-detection browsing for protected sites\n" +
113
+ "- extract_structured: LLM-powered structured data extraction\n" +
114
+ "- track_changes: Monitor website changes over time\n" +
115
+ "- generate_llms_txt: Generate llms.txt for any website\n\n" +
116
+ "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
117
+ "Get your API key at https://www.crawlforge.dev/signup (1,000 free credits)"
118
+ }
119
+ }]
120
+ };
121
+ });
114
122
 
115
123
  // Helper function to wrap tool handlers with authentication and credit tracking
116
124
  function withAuth(toolName, handler) {
@@ -184,6 +192,9 @@ const processDocumentTool = new ProcessDocumentTool();
184
192
  const summarizeContentTool = new SummarizeContentTool();
185
193
  const analyzeContentTool = new AnalyzeContentTool();
186
194
 
195
+ // Phase 1: LLM-Powered Structured Extraction Tool
196
+ const extractStructuredTool = new ExtractStructuredTool();
197
+
187
198
  // Initialize Wave 2 Advanced Tools
188
199
  const batchScrapeTool = new BatchScrapeTool();
189
200
  const scrapeWithActionsTool = new ScrapeWithActionsTool();
@@ -633,10 +644,11 @@ async function fetchWithTimeout(url, options = {}) {
633
644
  // Tool: fetch_url - Basic URL fetching with headers and response handling
634
645
  server.registerTool("fetch_url", {
635
646
  description: "Fetch content from a URL with optional headers and timeout",
647
+ annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
636
648
  inputSchema: {
637
- url: z.string().url(),
638
- headers: z.record(z.string()).optional(),
639
- timeout: z.number().min(1000).max(30000).optional().default(10000)
649
+ url: z.string().url().describe("The URL to fetch content from"),
650
+ headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
651
+ timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
640
652
  }
641
653
  }, withAuth("fetch_url", async ({ url, headers, timeout }) => {
642
654
  try {
@@ -679,10 +691,11 @@ server.registerTool("fetch_url", {
679
691
  // Tool: extract_text - Extract clean text content from HTML
680
692
  server.registerTool("extract_text", {
681
693
  description: "Extract clean text content from a webpage",
694
+ annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
682
695
  inputSchema: {
683
- url: z.string().url(),
684
- remove_scripts: z.boolean().optional().default(true),
685
- remove_styles: z.boolean().optional().default(true)
696
+ url: z.string().url().describe("The URL to extract text from"),
697
+ remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
698
+ remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
686
699
  }
687
700
  }, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
688
701
  try {
@@ -733,10 +746,11 @@ server.registerTool("extract_text", {
733
746
  // Tool: extract_links - Extract all links from a webpage with optional filtering
734
747
  server.registerTool("extract_links", {
735
748
  description: "Extract all links from a webpage with optional filtering",
749
+ annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
736
750
  inputSchema: {
737
- url: z.string().url(),
738
- filter_external: z.boolean().optional().default(false),
739
- base_url: z.string().url().optional()
751
+ url: z.string().url().describe("The URL to extract links from"),
752
+ filter_external: z.boolean().optional().default(false).describe("Only return external links"),
753
+ base_url: z.string().url().optional().describe("Base URL for resolving relative links")
740
754
  }
741
755
  }, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
742
756
  try {
@@ -817,8 +831,9 @@ server.registerTool("extract_links", {
817
831
  // Tool: extract_metadata - Extract page metadata
818
832
  server.registerTool("extract_metadata", {
819
833
  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
834
+ annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
820
835
  inputSchema: {
821
- url: z.string().url()
836
+ url: z.string().url().describe("The URL to extract metadata from")
822
837
  }
823
838
  }, withAuth("extract_metadata", async ({ url }) => {
824
839
  try {
@@ -896,9 +911,10 @@ server.registerTool("extract_metadata", {
896
911
  // Tool: scrape_structured - Extract structured data using CSS selectors
897
912
  server.registerTool("scrape_structured", {
898
913
  description: "Extract structured data from a webpage using CSS selectors",
914
+ annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
899
915
  inputSchema: {
900
- url: z.string().url(),
901
- selectors: z.record(z.string())
916
+ url: z.string().url().describe("The URL to scrape"),
917
+ selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
902
918
  }
903
919
  }, withAuth("scrape_structured", async ({ url, selectors }) => {
904
920
  try {
@@ -959,15 +975,16 @@ server.registerTool("scrape_structured", {
959
975
  // Tool: search_web - Search the web using Google Search via CrawlForge proxy
960
976
  server.registerTool("search_web", {
961
977
  description: "Search the web using Google Search API (proxied through CrawlForge)",
978
+ annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
962
979
  inputSchema: {
963
- query: z.string(),
964
- limit: z.number().min(1).max(100).optional(),
965
- offset: z.number().min(0).optional(),
966
- lang: z.string().optional(),
967
- safe_search: z.boolean().optional(),
968
- time_range: z.enum(["day", "week", "month", "year", "all"]).optional(),
969
- site: z.string().optional(),
970
- file_type: z.string().optional()
980
+ query: z.string().describe("Search query string"),
981
+ limit: z.number().min(1).max(100).optional().describe("Maximum number of results to return"),
982
+ offset: z.number().min(0).optional().describe("Number of results to skip for pagination"),
983
+ lang: z.string().optional().describe("Language code for results (e.g. 'en', 'fr')"),
984
+ safe_search: z.boolean().optional().describe("Enable safe search filtering"),
985
+ time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
986
+ site: z.string().optional().describe("Limit results to a specific domain"),
987
+ file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
971
988
  }
972
989
  }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
973
990
  try {
@@ -1002,16 +1019,17 @@ server.registerTool("search_web", {
1002
1019
  // Tool: crawl_deep - Deep crawl websites with BFS algorithm
1003
1020
  server.registerTool("crawl_deep", {
1004
1021
  description: "Crawl websites deeply using breadth-first search",
1022
+ annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1005
1023
  inputSchema: {
1006
- url: z.string().url(),
1007
- max_depth: z.number().min(1).max(5).optional(),
1008
- max_pages: z.number().min(1).max(1000).optional(),
1009
- include_patterns: z.array(z.string()).optional(),
1010
- exclude_patterns: z.array(z.string()).optional(),
1011
- follow_external: z.boolean().optional(),
1012
- respect_robots: z.boolean().optional(),
1013
- extract_content: z.boolean().optional(),
1014
- concurrency: z.number().min(1).max(20).optional()
1024
+ url: z.string().url().describe("Starting URL for the crawl"),
1025
+ max_depth: z.number().min(1).max(5).optional().describe("Maximum crawl depth from starting URL"),
1026
+ max_pages: z.number().min(1).max(1000).optional().describe("Maximum number of pages to crawl"),
1027
+ include_patterns: z.array(z.string()).optional().describe("URL patterns to include (regex)"),
1028
+ exclude_patterns: z.array(z.string()).optional().describe("URL patterns to exclude (regex)"),
1029
+ follow_external: z.boolean().optional().describe("Follow links to external domains"),
1030
+ respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
1031
+ extract_content: z.boolean().optional().describe("Extract page content during crawl"),
1032
+ concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
1015
1033
  }
1016
1034
  }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
1017
1035
  try {
@@ -1046,12 +1064,13 @@ server.registerTool("crawl_deep", {
1046
1064
  // Tool: map_site - Discover and map website structure
1047
1065
  server.registerTool("map_site", {
1048
1066
  description: "Discover and map website structure",
1067
+ annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1049
1068
  inputSchema: {
1050
- url: z.string().url(),
1051
- include_sitemap: z.boolean().optional(),
1052
- max_urls: z.number().min(1).max(10000).optional(),
1053
- group_by_path: z.boolean().optional(),
1054
- include_metadata: z.boolean().optional()
1069
+ url: z.string().url().describe("The website URL to map"),
1070
+ include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
1071
+ max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
1072
+ group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
1073
+ include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
1055
1074
  }
1056
1075
  }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
1057
1076
  try {
@@ -1088,9 +1107,10 @@ server.registerTool("map_site", {
1088
1107
  // Tool: extract_content - Enhanced content extraction with readability detection
1089
1108
  server.registerTool("extract_content", {
1090
1109
  description: "Extract and analyze main content from web pages with enhanced readability detection",
1110
+ annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1091
1111
  inputSchema: {
1092
- url: z.string().url(),
1093
- options: z.object({}).optional()
1112
+ url: z.string().url().describe("The URL to extract content from"),
1113
+ options: z.object({}).optional().describe("Additional extraction options")
1094
1114
  }
1095
1115
  }, withAuth("extract_content", async ({ url, options }) => {
1096
1116
  try {
@@ -1125,10 +1145,11 @@ server.registerTool("extract_content", {
1125
1145
  // Tool: process_document - Multi-format document processing
1126
1146
  server.registerTool("process_document", {
1127
1147
  description: "Process documents from multiple sources and formats including PDFs and web pages",
1148
+ annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1128
1149
  inputSchema: {
1129
- source: z.string(),
1130
- sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
1131
- options: z.object({}).optional()
1150
+ source: z.string().describe("Document source - URL or file path"),
1151
+ sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
1152
+ options: z.object({}).optional().describe("Additional processing options")
1132
1153
  }
1133
1154
  }, withAuth("process_document", async ({ source, sourceType, options }) => {
1134
1155
  try {
@@ -1163,9 +1184,10 @@ server.registerTool("process_document", {
1163
1184
  // Tool: summarize_content - Intelligent content summarization
1164
1185
  server.registerTool("summarize_content", {
1165
1186
  description: "Generate intelligent summaries of text content with configurable options",
1187
+ annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
1166
1188
  inputSchema: {
1167
- text: z.string(),
1168
- options: z.object({}).optional()
1189
+ text: z.string().describe("The text content to summarize"),
1190
+ options: z.object({}).optional().describe("Summarization options")
1169
1191
  }
1170
1192
  }, withAuth("summarize_content", async ({ text, options }) => {
1171
1193
  try {
@@ -1200,9 +1222,10 @@ server.registerTool("summarize_content", {
1200
1222
  // Tool: analyze_content - Comprehensive content analysis
1201
1223
  server.registerTool("analyze_content", {
1202
1224
  description: "Perform comprehensive content analysis including language detection and topic extraction",
1225
+ annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
1203
1226
  inputSchema: {
1204
- text: z.string(),
1205
- options: z.object({}).optional()
1227
+ text: z.string().describe("The text content to analyze"),
1228
+ options: z.object({}).optional().describe("Analysis options")
1206
1229
  }
1207
1230
  }, withAuth("analyze_content", async ({ text, options }) => {
1208
1231
  try {
@@ -1235,11 +1258,62 @@ server.registerTool("analyze_content", {
1235
1258
  }));
1236
1259
 
1237
1260
 
1261
+
1262
+ // Phase 1: LLM-Powered Structured Extraction
1263
+
1264
+ // Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
1265
+ server.registerTool("extract_structured", {
1266
+ description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
1267
+ annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1268
+ inputSchema: {
1269
+ url: z.string().url().describe("The URL to extract structured data from"),
1270
+ schema: z.object({
1271
+ type: z.string().optional(),
1272
+ properties: z.record(z.any()),
1273
+ required: z.array(z.string()).optional()
1274
+ }).describe("JSON schema defining the data structure to extract"),
1275
+ prompt: z.string().optional().describe("Natural language instructions for extraction"),
1276
+ llmConfig: z.object({
1277
+ provider: z.string().optional(),
1278
+ apiKey: z.string().optional()
1279
+ }).optional().describe("LLM provider configuration for AI-powered extraction"),
1280
+ fallbackToSelectors: z.boolean().optional().default(true).describe("Fall back to CSS selector extraction if LLM is unavailable"),
1281
+ selectorHints: z.record(z.string()).optional().describe("CSS selector hints to guide extraction")
1282
+ }
1283
+ }, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
1284
+ try {
1285
+ const result = await extractStructuredTool.execute({
1286
+ url,
1287
+ schema,
1288
+ prompt,
1289
+ llmConfig,
1290
+ fallbackToSelectors,
1291
+ selectorHints
1292
+ });
1293
+ return {
1294
+ content: [{
1295
+ type: "text",
1296
+ text: JSON.stringify(result, null, 2)
1297
+ }]
1298
+ };
1299
+ } catch (error) {
1300
+ return {
1301
+ content: [{
1302
+ type: "text",
1303
+ text: `Structured extraction failed: ${error.message}`
1304
+ }],
1305
+ isError: true
1306
+ };
1307
+ }
1308
+ }));
1309
+
1310
+
1238
1311
  // Wave 2 Advanced Tools
1239
1312
 
1240
1313
  // Tool: batch_scrape - Process multiple URLs simultaneously with job management
1241
1314
  server.registerTool("batch_scrape", {
1242
1315
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
1316
+ annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1243
1317
  inputSchema: {
1244
1318
  urls: z.array(z.union([
1245
1319
  z.string().url(),
@@ -1250,27 +1324,27 @@ server.registerTool("batch_scrape", {
1250
1324
  timeout: z.number().min(1000).max(30000).optional(),
1251
1325
  metadata: z.record(z.any()).optional()
1252
1326
  })
1253
- ])).min(1).max(50),
1254
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
1255
- mode: z.enum(['sync', 'async']).default('sync'),
1327
+ ])).min(1).max(50).describe("Array of URLs or URL objects to scrape"),
1328
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']).describe("Output formats for scraped content"),
1329
+ mode: z.enum(['sync', 'async']).default('sync').describe("Processing mode: sync (wait) or async (background)"),
1256
1330
  webhook: z.object({
1257
1331
  url: z.string().url(),
1258
1332
  events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
1259
1333
  headers: z.record(z.string()).optional(),
1260
1334
  signingSecret: z.string().optional()
1261
- }).optional(),
1262
- extractionSchema: z.record(z.string()).optional(),
1263
- maxConcurrency: z.number().min(1).max(20).default(10),
1264
- delayBetweenRequests: z.number().min(0).max(10000).default(100),
1265
- includeMetadata: z.boolean().default(true),
1266
- includeFailed: z.boolean().default(true),
1267
- pageSize: z.number().min(1).max(100).default(25),
1335
+ }).optional().describe("Webhook configuration for async job notifications"),
1336
+ extractionSchema: z.record(z.string()).optional().describe("Schema for structured data extraction from each URL"),
1337
+ maxConcurrency: z.number().min(1).max(20).default(10).describe("Maximum concurrent scraping requests"),
1338
+ delayBetweenRequests: z.number().min(0).max(10000).default(100).describe("Delay in milliseconds between requests"),
1339
+ includeMetadata: z.boolean().default(true).describe("Include page metadata in results"),
1340
+ includeFailed: z.boolean().default(true).describe("Include failed URLs in results"),
1341
+ pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page"),
1268
1342
  jobOptions: z.object({
1269
1343
  priority: z.number().default(0),
1270
1344
  ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
1271
1345
  maxRetries: z.number().min(0).max(5).default(1),
1272
1346
  tags: z.array(z.string()).default([])
1273
- }).optional()
1347
+ }).optional().describe("Job management options for async processing")
1274
1348
  }
1275
1349
  }, withAuth("batch_scrape", async (params) => {
1276
1350
  try {
@@ -1295,8 +1369,9 @@ server.registerTool("batch_scrape", {
1295
1369
  // Tool: scrape_with_actions - Execute action chains before scraping
1296
1370
  server.registerTool("scrape_with_actions", {
1297
1371
  description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
1372
+ annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1298
1373
  inputSchema: {
1299
- url: z.string().url(),
1374
+ url: z.string().url().describe("The URL to scrape"),
1300
1375
  actions: z.array(z.object({
1301
1376
  type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
1302
1377
  selector: z.string().optional(),
@@ -1307,10 +1382,10 @@ server.registerTool("scrape_with_actions", {
1307
1382
  description: z.string().optional(),
1308
1383
  continueOnError: z.boolean().default(false),
1309
1384
  retries: z.number().min(0).max(5).default(0)
1310
- })).min(1).max(20),
1311
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
1312
- captureIntermediateStates: z.boolean().default(false),
1313
- captureScreenshots: z.boolean().default(true),
1385
+ })).min(1).max(20).describe("Browser actions to perform before scraping"),
1386
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
1387
+ captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
1388
+ captureScreenshots: z.boolean().default(true).describe("Take screenshots during action execution"),
1314
1389
  formAutoFill: z.object({
1315
1390
  fields: z.array(z.object({
1316
1391
  selector: z.string(),
@@ -1320,23 +1395,23 @@ server.registerTool("scrape_with_actions", {
1320
1395
  })),
1321
1396
  submitSelector: z.string().optional(),
1322
1397
  waitAfterSubmit: z.number().min(0).max(30000).default(2000)
1323
- }).optional(),
1398
+ }).optional().describe("Form auto-fill configuration"),
1324
1399
  browserOptions: z.object({
1325
1400
  headless: z.boolean().default(true),
1326
1401
  userAgent: z.string().optional(),
1327
1402
  viewportWidth: z.number().min(800).max(1920).default(1280),
1328
1403
  viewportHeight: z.number().min(600).max(1080).default(720),
1329
1404
  timeout: z.number().min(10000).max(120000).default(30000)
1330
- }).optional(),
1405
+ }).optional().describe("Browser configuration options"),
1331
1406
  extractionOptions: z.object({
1332
1407
  selectors: z.record(z.string()).optional(),
1333
1408
  includeMetadata: z.boolean().default(true),
1334
1409
  includeLinks: z.boolean().default(true),
1335
1410
  includeImages: z.boolean().default(true)
1336
- }).optional(),
1337
- continueOnActionError: z.boolean().default(false),
1338
- maxRetries: z.number().min(0).max(3).default(1),
1339
- screenshotOnError: z.boolean().default(true)
1411
+ }).optional().describe("Content extraction options"),
1412
+ continueOnActionError: z.boolean().default(false).describe("Continue executing actions if one fails"),
1413
+ maxRetries: z.number().min(0).max(3).default(1).describe("Maximum retry attempts on failure"),
1414
+ screenshotOnError: z.boolean().default(true).describe("Capture screenshot when an error occurs")
1340
1415
  }
1341
1416
  }, withAuth("scrape_with_actions", async (params) => {
1342
1417
  try {
@@ -1361,27 +1436,28 @@ server.registerTool("scrape_with_actions", {
1361
1436
  // Tool: deep_research - Comprehensive multi-stage research with source verification
1362
1437
  server.registerTool("deep_research", {
1363
1438
  description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
1439
+ annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1364
1440
  inputSchema: {
1365
- topic: z.string().min(3).max(500),
1366
- maxDepth: z.number().min(1).max(10).optional().default(5),
1367
- maxUrls: z.number().min(1).max(1000).optional().default(50),
1368
- timeLimit: z.number().min(30000).max(300000).optional().default(120000),
1369
- researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
1370
- sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
1371
- credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
1372
- includeRecentOnly: z.boolean().optional().default(false),
1373
- enableConflictDetection: z.boolean().optional().default(true),
1374
- enableSourceVerification: z.boolean().optional().default(true),
1375
- enableSynthesis: z.boolean().optional().default(true),
1376
- outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
1377
- includeRawData: z.boolean().optional().default(false),
1378
- includeActivityLog: z.boolean().optional().default(false),
1441
+ topic: z.string().min(3).max(500).describe("Research topic or question"),
1442
+ maxDepth: z.number().min(1).max(10).optional().default(5).describe("Maximum research depth"),
1443
+ maxUrls: z.number().min(1).max(1000).optional().default(50).describe("Maximum URLs to analyze"),
1444
+ timeLimit: z.number().min(30000).max(300000).optional().default(120000).describe("Time limit in milliseconds for the research"),
1445
+ researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad').describe("Research methodology approach"),
1446
+ sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']).describe("Types of sources to include"),
1447
+ credibilityThreshold: z.number().min(0).max(1).optional().default(0.3).describe("Minimum credibility score for sources (0-1)"),
1448
+ includeRecentOnly: z.boolean().optional().default(false).describe("Only include recent sources"),
1449
+ enableConflictDetection: z.boolean().optional().default(true).describe("Detect conflicting information across sources"),
1450
+ enableSourceVerification: z.boolean().optional().default(true).describe("Verify source credibility"),
1451
+ enableSynthesis: z.boolean().optional().default(true).describe("Synthesize findings into a coherent report"),
1452
+ outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive').describe("Output format for the research report"),
1453
+ includeRawData: z.boolean().optional().default(false).describe("Include raw scraped data in output"),
1454
+ includeActivityLog: z.boolean().optional().default(false).describe("Include detailed activity log"),
1379
1455
  queryExpansion: z.object({
1380
1456
  enableSynonyms: z.boolean().optional().default(true),
1381
1457
  enableSpellCheck: z.boolean().optional().default(true),
1382
1458
  enableContextual: z.boolean().optional().default(true),
1383
1459
  maxVariations: z.number().min(1).max(20).optional().default(8)
1384
- }).optional(),
1460
+ }).optional().describe("Query expansion settings for broader search coverage"),
1385
1461
  llmConfig: z.object({
1386
1462
  provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
1387
1463
  openai: z.object({
@@ -1395,14 +1471,14 @@ server.registerTool("deep_research", {
1395
1471
  }).optional(),
1396
1472
  enableSemanticAnalysis: z.boolean().optional().default(true),
1397
1473
  enableIntelligentSynthesis: z.boolean().optional().default(true)
1398
- }).optional(),
1399
- concurrency: z.number().min(1).max(20).optional().default(5),
1400
- cacheResults: z.boolean().optional().default(true),
1474
+ }).optional().describe("LLM provider configuration for AI-powered analysis"),
1475
+ concurrency: z.number().min(1).max(20).optional().default(5).describe("Number of concurrent research requests"),
1476
+ cacheResults: z.boolean().optional().default(true).describe("Cache research results for reuse"),
1401
1477
  webhook: z.object({
1402
1478
  url: z.string().url(),
1403
1479
  events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
1404
1480
  headers: z.record(z.string()).optional()
1405
- }).optional()
1481
+ }).optional().describe("Webhook for progress and completion notifications")
1406
1482
  }
1407
1483
  }, withAuth("deep_research", async (params) => {
1408
1484
  try {
@@ -1427,13 +1503,14 @@ server.registerTool("deep_research", {
1427
1503
  // Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
1428
1504
  server.registerTool("track_changes", {
1429
1505
  description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
1506
+ annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1430
1507
  inputSchema: {
1431
- url: z.string().url(),
1508
+ url: z.string().url().describe("The URL to track changes for"),
1432
1509
  operation: z.enum([
1433
- 'create_baseline',
1434
- 'compare',
1435
- 'monitor',
1436
- 'get_history',
1510
+ 'create_baseline',
1511
+ 'compare',
1512
+ 'monitor',
1513
+ 'get_history',
1437
1514
  'get_stats',
1438
1515
  'create_scheduled_monitor',
1439
1516
  'stop_scheduled_monitor',
@@ -1442,9 +1519,9 @@ server.registerTool("track_changes", {
1442
1519
  'create_alert_rule',
1443
1520
  'generate_trend_report',
1444
1521
  'get_monitoring_templates'
1445
- ]).default('compare'),
1446
- content: z.string().optional(),
1447
- html: z.string().optional(),
1522
+ ]).default('compare').describe("Tracking operation to perform"),
1523
+ content: z.string().optional().describe("Content to compare against baseline"),
1524
+ html: z.string().optional().describe("HTML content to compare against baseline"),
1448
1525
  trackingOptions: z.object({
1449
1526
  granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
1450
1527
  trackText: z.boolean().default(true),
@@ -1461,7 +1538,7 @@ server.registerTool("track_changes", {
1461
1538
  moderate: z.number().min(0).max(1).default(0.3),
1462
1539
  major: z.number().min(0).max(1).default(0.7)
1463
1540
  }).optional()
1464
- }).optional(),
1541
+ }).optional().describe("Options for how changes are tracked and compared"),
1465
1542
  monitoringOptions: z.object({
1466
1543
  enabled: z.boolean().default(false),
1467
1544
  interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
@@ -1471,14 +1548,14 @@ server.registerTool("track_changes", {
1471
1548
  enableWebhook: z.boolean().default(false),
1472
1549
  webhookUrl: z.string().url().optional(),
1473
1550
  webhookSecret: z.string().optional()
1474
- }).optional(),
1551
+ }).optional().describe("Monitoring schedule and notification settings"),
1475
1552
  storageOptions: z.object({
1476
1553
  enableSnapshots: z.boolean().default(true),
1477
1554
  retainHistory: z.boolean().default(true),
1478
1555
  maxHistoryEntries: z.number().min(1).max(1000).default(100),
1479
1556
  compressionEnabled: z.boolean().default(true),
1480
1557
  deltaStorageEnabled: z.boolean().default(true)
1481
- }).optional(),
1558
+ }).optional().describe("Storage and history retention settings"),
1482
1559
  queryOptions: z.object({
1483
1560
  limit: z.number().min(1).max(500).default(50),
1484
1561
  offset: z.number().min(0).default(0),
@@ -1486,7 +1563,7 @@ server.registerTool("track_changes", {
1486
1563
  endTime: z.number().optional(),
1487
1564
  includeContent: z.boolean().default(false),
1488
1565
  significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
1489
- }).optional(),
1566
+ }).optional().describe("Query options for history and stats retrieval"),
1490
1567
  notificationOptions: z.object({
1491
1568
  webhook: z.object({
1492
1569
  enabled: z.boolean().default(false),
@@ -1502,32 +1579,32 @@ server.registerTool("track_changes", {
1502
1579
  channel: z.string().optional(),
1503
1580
  username: z.string().optional()
1504
1581
  }).optional()
1505
- }).optional(),
1582
+ }).optional().describe("Notification configuration for webhooks and Slack"),
1506
1583
  // Enhanced Phase 2.4 options
1507
1584
  scheduledMonitorOptions: z.object({
1508
1585
  schedule: z.string().optional(), // Cron expression
1509
1586
  templateId: z.string().optional(), // Monitoring template ID
1510
1587
  enabled: z.boolean().default(true)
1511
- }).optional(),
1588
+ }).optional().describe("Scheduled monitoring options with cron expressions"),
1512
1589
  alertRuleOptions: z.object({
1513
1590
  ruleId: z.string().optional(),
1514
1591
  condition: z.string().optional(), // Condition description
1515
1592
  actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
1516
1593
  throttle: z.number().min(0).optional(),
1517
1594
  priority: z.enum(['low', 'medium', 'high']).optional()
1518
- }).optional(),
1595
+ }).optional().describe("Alert rule configuration for change notifications"),
1519
1596
  exportOptions: z.object({
1520
1597
  format: z.enum(['json', 'csv']).default('json'),
1521
1598
  startTime: z.number().optional(),
1522
1599
  endTime: z.number().optional(),
1523
1600
  includeContent: z.boolean().default(false),
1524
1601
  includeSnapshots: z.boolean().default(false)
1525
- }).optional(),
1602
+ }).optional().describe("Export options for change history data"),
1526
1603
  dashboardOptions: z.object({
1527
1604
  includeRecentAlerts: z.boolean().default(true),
1528
1605
  includeTrends: z.boolean().default(true),
1529
1606
  includeMonitorStatus: z.boolean().default(true)
1530
- }).optional()
1607
+ }).optional().describe("Dashboard display options")
1531
1608
  }
1532
1609
  }, withAuth("track_changes", async (params) => {
1533
1610
  try {
@@ -1552,8 +1629,9 @@ server.registerTool("track_changes", {
1552
1629
  // Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
1553
1630
  server.registerTool("generate_llms_txt", {
1554
1631
  description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
1632
+ annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
1555
1633
  inputSchema: {
1556
- url: z.string().url(),
1634
+ url: z.string().url().describe("The website URL to generate llms.txt for"),
1557
1635
  analysisOptions: z.object({
1558
1636
  maxDepth: z.number().min(1).max(5).optional().default(3),
1559
1637
  maxPages: z.number().min(10).max(500).optional().default(100),
@@ -1561,7 +1639,7 @@ server.registerTool("generate_llms_txt", {
1561
1639
  analyzeContent: z.boolean().optional().default(true),
1562
1640
  checkSecurity: z.boolean().optional().default(true),
1563
1641
  respectRobots: z.boolean().optional().default(true)
1564
- }).optional(),
1642
+ }).optional().describe("Website analysis options for depth, scope, and detection"),
1565
1643
  outputOptions: z.object({
1566
1644
  includeDetailed: z.boolean().optional().default(true),
1567
1645
  includeAnalysis: z.boolean().optional().default(false),
@@ -1569,9 +1647,9 @@ server.registerTool("generate_llms_txt", {
1569
1647
  organizationName: z.string().optional(),
1570
1648
  customGuidelines: z.array(z.string()).optional(),
1571
1649
  customRestrictions: z.array(z.string()).optional()
1572
- }).optional(),
1573
- complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
1574
- format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
1650
+ }).optional().describe("Output customization and organization details"),
1651
+ complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe("Compliance level for generated guidelines"),
1652
+ format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe("Output format: llms.txt, llms-full.txt, or both")
1575
1653
  }
1576
1654
  }, withAuth("generate_llms_txt", async (params) => {
1577
1655
  try {
@@ -1596,8 +1674,9 @@ server.registerTool("generate_llms_txt", {
1596
1674
  // Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
1597
1675
  server.registerTool("stealth_mode", {
1598
1676
  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
1677
+ annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1599
1678
  inputSchema: {
1600
- operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
1679
+ operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
1601
1680
  stealthConfig: z.object({
1602
1681
  level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
1603
1682
  randomizeFingerprint: z.boolean().default(true),
@@ -1635,9 +1714,9 @@ server.registerTool("stealth_mode", {
1635
1714
  fontSpoofing: z.boolean().default(true),
1636
1715
  hardwareSpoofing: z.boolean().default(true)
1637
1716
  }).optional()
1638
- }).optional(),
1639
- contextId: z.string().optional(),
1640
- urlToTest: z.string().url().optional()
1717
+ }).optional().describe("Stealth browser configuration with anti-detection settings"),
1718
+ contextId: z.string().optional().describe("Browser context ID for page operations"),
1719
+ urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
1641
1720
  }
1642
1721
  }, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
1643
1722
  try {
@@ -1717,20 +1796,21 @@ server.registerTool("stealth_mode", {
1717
1796
  // Tool: localization - Multi-language and geo-location management (Wave 3)
1718
1797
  server.registerTool("localization", {
1719
1798
  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
1799
+ annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1720
1800
  inputSchema: {
1721
- operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
1722
- countryCode: z.string().length(2).optional(),
1723
- language: z.string().optional(),
1724
- timezone: z.string().optional(),
1725
- currency: z.string().length(3).optional(),
1726
- customHeaders: z.record(z.string()).optional(),
1727
- userAgent: z.string().optional(),
1728
- acceptLanguage: z.string().optional(),
1801
+ operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
1802
+ countryCode: z.string().length(2).optional().describe("ISO 3166-1 alpha-2 country code"),
1803
+ language: z.string().optional().describe("Language code (e.g. 'en', 'fr', 'de')"),
1804
+ timezone: z.string().optional().describe("IANA timezone identifier (e.g. 'America/New_York')"),
1805
+ currency: z.string().length(3).optional().describe("ISO 4217 currency code (e.g. 'USD', 'EUR')"),
1806
+ customHeaders: z.record(z.string()).optional().describe("Custom HTTP headers for localized requests"),
1807
+ userAgent: z.string().optional().describe("Custom user agent string"),
1808
+ acceptLanguage: z.string().optional().describe("Accept-Language header value"),
1729
1809
  geoLocation: z.object({
1730
1810
  latitude: z.number().min(-90).max(90),
1731
1811
  longitude: z.number().min(-180).max(180),
1732
1812
  accuracy: z.number().min(1).max(100).optional()
1733
- }).optional(),
1813
+ }).optional().describe("GPS coordinates for geolocation emulation"),
1734
1814
  proxySettings: z.object({
1735
1815
  enabled: z.boolean().default(false),
1736
1816
  region: z.string().optional(),
@@ -1749,26 +1829,26 @@ server.registerTool("localization", {
1749
1829
  maxRetries: z.number().default(3),
1750
1830
  timeout: z.number().default(10000)
1751
1831
  }).optional()
1752
- }).optional(),
1832
+ }).optional().describe("Proxy configuration for geo-targeted requests"),
1753
1833
  searchParams: z.object({
1754
1834
  query: z.string().optional(),
1755
1835
  limit: z.number().optional(),
1756
1836
  offset: z.number().optional(),
1757
1837
  headers: z.record(z.string()).optional()
1758
- }).optional(),
1838
+ }).optional().describe("Search parameters for localized search queries"),
1759
1839
  browserOptions: z.object({
1760
1840
  locale: z.string().optional(),
1761
1841
  timezoneId: z.string().optional(),
1762
1842
  extraHTTPHeaders: z.record(z.string()).optional(),
1763
1843
  userAgent: z.string().optional()
1764
- }).optional(),
1765
- content: z.string().optional(),
1766
- url: z.string().url().optional(),
1844
+ }).optional().describe("Browser context options for locale emulation"),
1845
+ content: z.string().optional().describe("Content for auto-detection of language and locale"),
1846
+ url: z.string().url().optional().describe("URL for geo-blocking detection or auto-detection"),
1767
1847
  response: z.object({
1768
1848
  status: z.number(),
1769
1849
  body: z.string().optional(),
1770
1850
  statusText: z.string().optional()
1771
- }).optional()
1851
+ }).optional().describe("HTTP response for geo-blocking analysis")
1772
1852
  }
1773
1853
  }, withAuth("localization", async (params) => {
1774
1854
  try {
@@ -1850,11 +1930,92 @@ server.registerTool("localization", {
1850
1930
  }
1851
1931
  }));
1852
1932
 
1853
- // Set up the stdio transport and start the server
1933
+ // Determine transport mode: HTTP if the --http flag is passed or the MCP_HTTP env var is exactly 'true'
1934
+ const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
1935
+
1936
+ // Set up transport and start the server
1854
1937
  async function runServer() {
1855
- const transport = new StdioServerTransport();
1856
- await server.connect(transport);
1857
- console.error("CrawlForge MCP Server v3.0 running on stdio");
1938
+ if (useHttp) {
1939
+ const port = parseInt(process.env.PORT || '3000', 10);
1940
+
1941
+ // Stateless transport — no session tracking, each request is independent
1942
+ // This avoids the bug where server.connect(newTransport) kills previous sessions
1943
+ const transport = new StreamableHTTPServerTransport({
1944
+ sessionIdGenerator: undefined,
1945
+ });
1946
+ await server.connect(transport);
1947
+
1948
+ const httpServer = createServer(async (req, res) => {
1949
+ // CORS headers for Smithery gateway
1950
+ res.setHeader('Access-Control-Allow-Origin', '*');
1951
+ res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
1952
+ res.setHeader('Access-Control-Allow-Headers', 'Content-Type, mcp-session-id');
1953
+ res.setHeader('Access-Control-Expose-Headers', 'mcp-session-id');
1954
+
1955
+ if (req.method === 'OPTIONS') {
1956
+ res.writeHead(204);
1957
+ res.end();
1958
+ return;
1959
+ }
1960
+
1961
+ // Health check endpoint
1962
+ if (req.url === '/health') {
1963
+ res.writeHead(200, { 'Content-Type': 'application/json' });
1964
+ res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
1965
+ return;
1966
+ }
1967
+
1968
+ // MCP server card for Smithery discovery
1969
+ if (req.url === '/.well-known/mcp/server-card.json') {
1970
+ res.writeHead(200, { 'Content-Type': 'application/json' });
1971
+ res.end(JSON.stringify({
1972
+ serverInfo: {
1973
+ name: "crawlforge",
1974
+ version: "3.0.12",
1975
+ description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
1976
+ homepage: "https://www.crawlforge.dev",
1977
+ icon: "https://www.crawlforge.dev/icon.png"
1978
+ },
1979
+ transport: {
1980
+ type: "streamable-http",
1981
+ url: "/mcp"
1982
+ },
1983
+ configSchema: {
1984
+ type: "object",
1985
+ properties: {
1986
+ apiKey: {
1987
+ type: "string",
1988
+ title: "CrawlForge API Key",
1989
+ description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
1990
+ "x-from": { header: "x-api-key" }
1991
+ }
1992
+ },
1993
+ required: ["apiKey"]
1994
+ }
1995
+ }));
1996
+ return;
1997
+ }
1998
+
1999
+ // Route /mcp (and the root path /) to the transport handler
2000
+ if (req.url === '/mcp' || req.url === '/') {
2001
+ await transport.handleRequest(req, res);
2002
+ return;
2003
+ }
2004
+
2005
+ res.writeHead(404);
2006
+ res.end('Not Found');
2007
+ });
2008
+
2009
+ httpServer.listen(port, () => {
2010
+ console.error(`CrawlForge MCP Server v3.0 running on HTTP port ${port}`);
2011
+ console.error(`MCP endpoint: http://localhost:${port}/mcp`);
2012
+ console.error(`Health check: http://localhost:${port}/health`);
2013
+ });
2014
+ } else {
2015
+ const transport = new StdioServerTransport();
2016
+ await server.connect(transport);
2017
+ console.error("CrawlForge MCP Server v3.0 running on stdio");
2018
+ }
1858
2019
  console.error(`Environment: ${config.server.nodeEnv}`);
1859
2020
 
1860
2021
  console.error("Search enabled: true (via CrawlForge proxy)");
@@ -1867,7 +2028,8 @@ async function runServer() {
1867
2028
  const trackingTools = ", track_changes";
1868
2029
  const llmsTxtTools = ", generate_llms_txt";
1869
2030
  const wave3Tools = ", stealth_mode, localization";
1870
- console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}`);
2031
+ const phase1Tools = ", extract_structured";
2032
+ console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
1871
2033
 
1872
2034
 
1873
2035
  // === MEMORY LEAK PREVENTION ===
@@ -1893,7 +2055,8 @@ async function gracefulShutdown(signal) {
1893
2055
  trackChangesTool,
1894
2056
  generateLLMsTxtTool,
1895
2057
  stealthBrowserManager,
1896
- localizationManager
2058
+ localizationManager,
2059
+ extractStructuredTool
1897
2060
  ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
1898
2061
 
1899
2062
  console.error(`Cleaning up ${toolsToCleanup.length} tools...`);