crawlforge-mcp-server 3.0.11 → 3.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +103 -324
- package/package.json +2 -1
- package/server.js +332 -156
- package/src/core/AuthManager.js +22 -9
- package/src/core/ChangeTracker.js +1 -1
- package/src/core/ResearchOrchestrator.js +43 -5
- package/src/core/analysis/ContentAnalyzer.js +70 -17
- package/src/core/analysis/sentenceUtils.js +73 -0
- package/src/core/creatorMode.js +47 -0
- package/src/core/llm/LLMManager.js +120 -0
- package/src/core/processing/BrowserProcessor.js +1 -1
- package/src/tools/extract/extractStructured.js +280 -0
- package/src/tools/extract/summarizeContent.js +3 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +21 -21
- package/src/tools/search/searchWeb.js +2 -1
package/server.js
CHANGED
|
@@ -1,32 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
-
//
|
|
4
|
-
//
|
|
5
|
-
|
|
6
|
-
import dotenv from 'dotenv';
|
|
3
|
+
// Creator Mode Authentication — imported from src/core/creatorMode.js
|
|
4
|
+
// This MUST be the first import so the secret is verified before any tool code runs.
|
|
5
|
+
export { isCreatorModeVerified } from './src/core/creatorMode.js';
|
|
7
6
|
|
|
8
|
-
//
|
|
9
|
-
dotenv.config({ path: '.env', quiet: true });
|
|
10
|
-
|
|
11
|
-
const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';
|
|
12
|
-
|
|
13
|
-
if (process.env.CRAWLFORGE_CREATOR_SECRET) {
|
|
14
|
-
const providedHash = crypto
|
|
15
|
-
.createHash('sha256')
|
|
16
|
-
.update(process.env.CRAWLFORGE_CREATOR_SECRET)
|
|
17
|
-
.digest('hex');
|
|
18
|
-
|
|
19
|
-
if (providedHash === CREATOR_SECRET_HASH) {
|
|
20
|
-
process.env.CRAWLFORGE_CREATOR_MODE = 'true';
|
|
21
|
-
console.log('🔓 Creator Mode Enabled - Unlimited Access');
|
|
22
|
-
} else {
|
|
23
|
-
console.warn('⚠️ Invalid creator secret provided');
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
// Now import everything else
|
|
7
|
+
// Import everything else
|
|
28
8
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
29
9
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
10
|
+
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
11
|
+
import { createServer } from "node:http";
|
|
12
|
+
import { randomUUID } from "node:crypto";
|
|
30
13
|
import { z } from "zod";
|
|
31
14
|
import { load } from "cheerio";
|
|
32
15
|
import { SearchWebTool } from "./src/tools/search/searchWeb.js";
|
|
@@ -36,6 +19,8 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
|
|
|
36
19
|
import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
|
|
37
20
|
import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
|
|
38
21
|
import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
|
|
22
|
+
// Phase 1: LLM-Powered Structured Extraction
|
|
23
|
+
import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
|
|
39
24
|
// Wave 2 Advanced Tools
|
|
40
25
|
import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
|
|
41
26
|
import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
|
|
@@ -97,7 +82,43 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
97
82
|
}
|
|
98
83
|
|
|
99
84
|
// Create the server
|
|
100
|
-
const server = new McpServer({
|
|
85
|
+
const server = new McpServer({
|
|
86
|
+
name: "crawlforge",
|
|
87
|
+
version: "3.0.12",
|
|
88
|
+
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
89
|
+
homepage: "https://www.crawlforge.dev",
|
|
90
|
+
icon: "https://www.crawlforge.dev/icon.png"
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
// Register getting-started prompt
|
|
94
|
+
server.prompt("getting-started", {
|
|
95
|
+
description: "Get started with CrawlForge MCP - learn available tools and best practices",
|
|
96
|
+
}, async () => {
|
|
97
|
+
return {
|
|
98
|
+
messages: [{
|
|
99
|
+
role: "user",
|
|
100
|
+
content: {
|
|
101
|
+
type: "text",
|
|
102
|
+
text: "You have access to CrawlForge MCP with 20 web scraping tools. Key tools:\n\n" +
|
|
103
|
+
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
104
|
+
"- extract_text: Extract clean text from a webpage\n" +
|
|
105
|
+
"- extract_content: Smart content extraction with readability\n" +
|
|
106
|
+
"- search_web: Search the web and get structured results\n" +
|
|
107
|
+
"- crawl_deep: Crawl a website following links to a specified depth\n" +
|
|
108
|
+
"- map_site: Discover all pages on a website\n" +
|
|
109
|
+
"- batch_scrape: Scrape multiple URLs in parallel\n" +
|
|
110
|
+
"- scrape_with_actions: Automate browser actions then scrape\n" +
|
|
111
|
+
"- deep_research: Multi-source research on any topic\n" +
|
|
112
|
+
"- stealth_mode: Anti-detection browsing for protected sites\n" +
|
|
113
|
+
"- extract_structured: LLM-powered structured data extraction\n" +
|
|
114
|
+
"- track_changes: Monitor website changes over time\n" +
|
|
115
|
+
"- generate_llms_txt: Generate llms.txt for any website\n\n" +
|
|
116
|
+
"Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
|
|
117
|
+
"Get your API key at https://www.crawlforge.dev/signup (1,000 free credits)"
|
|
118
|
+
}
|
|
119
|
+
}]
|
|
120
|
+
};
|
|
121
|
+
});
|
|
101
122
|
|
|
102
123
|
// Helper function to wrap tool handlers with authentication and credit tracking
|
|
103
124
|
function withAuth(toolName, handler) {
|
|
@@ -171,6 +192,9 @@ const processDocumentTool = new ProcessDocumentTool();
|
|
|
171
192
|
const summarizeContentTool = new SummarizeContentTool();
|
|
172
193
|
const analyzeContentTool = new AnalyzeContentTool();
|
|
173
194
|
|
|
195
|
+
// Phase 1: LLM-Powered Structured Extraction Tool
|
|
196
|
+
const extractStructuredTool = new ExtractStructuredTool();
|
|
197
|
+
|
|
174
198
|
// Initialize Wave 2 Advanced Tools
|
|
175
199
|
const batchScrapeTool = new BatchScrapeTool();
|
|
176
200
|
const scrapeWithActionsTool = new ScrapeWithActionsTool();
|
|
@@ -620,10 +644,11 @@ async function fetchWithTimeout(url, options = {}) {
|
|
|
620
644
|
// Tool: fetch_url - Basic URL fetching with headers and response handling
|
|
621
645
|
server.registerTool("fetch_url", {
|
|
622
646
|
description: "Fetch content from a URL with optional headers and timeout",
|
|
647
|
+
annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
623
648
|
inputSchema: {
|
|
624
|
-
url: z.string().url(),
|
|
625
|
-
headers: z.record(z.string()).optional(),
|
|
626
|
-
timeout: z.number().min(1000).max(30000).optional().default(10000)
|
|
649
|
+
url: z.string().url().describe("The URL to fetch content from"),
|
|
650
|
+
headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
|
|
651
|
+
timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
|
|
627
652
|
}
|
|
628
653
|
}, withAuth("fetch_url", async ({ url, headers, timeout }) => {
|
|
629
654
|
try {
|
|
@@ -666,10 +691,11 @@ server.registerTool("fetch_url", {
|
|
|
666
691
|
// Tool: extract_text - Extract clean text content from HTML
|
|
667
692
|
server.registerTool("extract_text", {
|
|
668
693
|
description: "Extract clean text content from a webpage",
|
|
694
|
+
annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
669
695
|
inputSchema: {
|
|
670
|
-
url: z.string().url(),
|
|
671
|
-
remove_scripts: z.boolean().optional().default(true),
|
|
672
|
-
remove_styles: z.boolean().optional().default(true)
|
|
696
|
+
url: z.string().url().describe("The URL to extract text from"),
|
|
697
|
+
remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
|
|
698
|
+
remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
|
|
673
699
|
}
|
|
674
700
|
}, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
|
|
675
701
|
try {
|
|
@@ -720,10 +746,11 @@ server.registerTool("extract_text", {
|
|
|
720
746
|
// Tool: extract_links - Extract all links from a webpage with optional filtering
|
|
721
747
|
server.registerTool("extract_links", {
|
|
722
748
|
description: "Extract all links from a webpage with optional filtering",
|
|
749
|
+
annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
723
750
|
inputSchema: {
|
|
724
|
-
url: z.string().url(),
|
|
725
|
-
filter_external: z.boolean().optional().default(false),
|
|
726
|
-
base_url: z.string().url().optional()
|
|
751
|
+
url: z.string().url().describe("The URL to extract links from"),
|
|
752
|
+
filter_external: z.boolean().optional().default(false).describe("Only return external links"),
|
|
753
|
+
base_url: z.string().url().optional().describe("Base URL for resolving relative links")
|
|
727
754
|
}
|
|
728
755
|
}, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
|
|
729
756
|
try {
|
|
@@ -804,8 +831,9 @@ server.registerTool("extract_links", {
|
|
|
804
831
|
// Tool: extract_metadata - Extract page metadata
|
|
805
832
|
server.registerTool("extract_metadata", {
|
|
806
833
|
description: "Extract metadata from a webpage (title, description, keywords, etc.)",
|
|
834
|
+
annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
807
835
|
inputSchema: {
|
|
808
|
-
url: z.string().url()
|
|
836
|
+
url: z.string().url().describe("The URL to extract metadata from")
|
|
809
837
|
}
|
|
810
838
|
}, withAuth("extract_metadata", async ({ url }) => {
|
|
811
839
|
try {
|
|
@@ -883,9 +911,10 @@ server.registerTool("extract_metadata", {
|
|
|
883
911
|
// Tool: scrape_structured - Extract structured data using CSS selectors
|
|
884
912
|
server.registerTool("scrape_structured", {
|
|
885
913
|
description: "Extract structured data from a webpage using CSS selectors",
|
|
914
|
+
annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
886
915
|
inputSchema: {
|
|
887
|
-
url: z.string().url(),
|
|
888
|
-
selectors: z.record(z.string())
|
|
916
|
+
url: z.string().url().describe("The URL to scrape"),
|
|
917
|
+
selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
|
|
889
918
|
}
|
|
890
919
|
}, withAuth("scrape_structured", async ({ url, selectors }) => {
|
|
891
920
|
try {
|
|
@@ -946,15 +975,16 @@ server.registerTool("scrape_structured", {
|
|
|
946
975
|
// Tool: search_web - Search the web using Google Search via CrawlForge proxy
|
|
947
976
|
server.registerTool("search_web", {
|
|
948
977
|
description: "Search the web using Google Search API (proxied through CrawlForge)",
|
|
978
|
+
annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
949
979
|
inputSchema: {
|
|
950
|
-
query: z.string(),
|
|
951
|
-
limit: z.number().min(1).max(100).optional(),
|
|
952
|
-
offset: z.number().min(0).optional(),
|
|
953
|
-
lang: z.string().optional(),
|
|
954
|
-
safe_search: z.boolean().optional(),
|
|
955
|
-
time_range: z.enum(["day", "week", "month", "year", "all"]).optional(),
|
|
956
|
-
site: z.string().optional(),
|
|
957
|
-
file_type: z.string().optional()
|
|
980
|
+
query: z.string().describe("Search query string"),
|
|
981
|
+
limit: z.number().min(1).max(100).optional().describe("Maximum number of results to return"),
|
|
982
|
+
offset: z.number().min(0).optional().describe("Number of results to skip for pagination"),
|
|
983
|
+
lang: z.string().optional().describe("Language code for results (e.g. 'en', 'fr')"),
|
|
984
|
+
safe_search: z.boolean().optional().describe("Enable safe search filtering"),
|
|
985
|
+
time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
|
|
986
|
+
site: z.string().optional().describe("Limit results to a specific domain"),
|
|
987
|
+
file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
|
|
958
988
|
}
|
|
959
989
|
}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
|
|
960
990
|
try {
|
|
@@ -989,16 +1019,17 @@ server.registerTool("search_web", {
|
|
|
989
1019
|
// Tool: crawl_deep - Deep crawl websites with BFS algorithm
|
|
990
1020
|
server.registerTool("crawl_deep", {
|
|
991
1021
|
description: "Crawl websites deeply using breadth-first search",
|
|
1022
|
+
annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
992
1023
|
inputSchema: {
|
|
993
|
-
url: z.string().url(),
|
|
994
|
-
max_depth: z.number().min(1).max(5).optional(),
|
|
995
|
-
max_pages: z.number().min(1).max(1000).optional(),
|
|
996
|
-
include_patterns: z.array(z.string()).optional(),
|
|
997
|
-
exclude_patterns: z.array(z.string()).optional(),
|
|
998
|
-
follow_external: z.boolean().optional(),
|
|
999
|
-
respect_robots: z.boolean().optional(),
|
|
1000
|
-
extract_content: z.boolean().optional(),
|
|
1001
|
-
concurrency: z.number().min(1).max(20).optional()
|
|
1024
|
+
url: z.string().url().describe("Starting URL for the crawl"),
|
|
1025
|
+
max_depth: z.number().min(1).max(5).optional().describe("Maximum crawl depth from starting URL"),
|
|
1026
|
+
max_pages: z.number().min(1).max(1000).optional().describe("Maximum number of pages to crawl"),
|
|
1027
|
+
include_patterns: z.array(z.string()).optional().describe("URL patterns to include (regex)"),
|
|
1028
|
+
exclude_patterns: z.array(z.string()).optional().describe("URL patterns to exclude (regex)"),
|
|
1029
|
+
follow_external: z.boolean().optional().describe("Follow links to external domains"),
|
|
1030
|
+
respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
|
|
1031
|
+
extract_content: z.boolean().optional().describe("Extract page content during crawl"),
|
|
1032
|
+
concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
|
|
1002
1033
|
}
|
|
1003
1034
|
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
|
|
1004
1035
|
try {
|
|
@@ -1033,12 +1064,13 @@ server.registerTool("crawl_deep", {
|
|
|
1033
1064
|
// Tool: map_site - Discover and map website structure
|
|
1034
1065
|
server.registerTool("map_site", {
|
|
1035
1066
|
description: "Discover and map website structure",
|
|
1067
|
+
annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1036
1068
|
inputSchema: {
|
|
1037
|
-
url: z.string().url(),
|
|
1038
|
-
include_sitemap: z.boolean().optional(),
|
|
1039
|
-
max_urls: z.number().min(1).max(10000).optional(),
|
|
1040
|
-
group_by_path: z.boolean().optional(),
|
|
1041
|
-
include_metadata: z.boolean().optional()
|
|
1069
|
+
url: z.string().url().describe("The website URL to map"),
|
|
1070
|
+
include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
|
|
1071
|
+
max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
|
|
1072
|
+
group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
|
|
1073
|
+
include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
|
|
1042
1074
|
}
|
|
1043
1075
|
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
|
|
1044
1076
|
try {
|
|
@@ -1075,9 +1107,10 @@ server.registerTool("map_site", {
|
|
|
1075
1107
|
// Tool: extract_content - Enhanced content extraction with readability detection
|
|
1076
1108
|
server.registerTool("extract_content", {
|
|
1077
1109
|
description: "Extract and analyze main content from web pages with enhanced readability detection",
|
|
1110
|
+
annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1078
1111
|
inputSchema: {
|
|
1079
|
-
url: z.string().url(),
|
|
1080
|
-
options: z.object({}).optional()
|
|
1112
|
+
url: z.string().url().describe("The URL to extract content from"),
|
|
1113
|
+
options: z.object({}).optional().describe("Additional extraction options")
|
|
1081
1114
|
}
|
|
1082
1115
|
}, withAuth("extract_content", async ({ url, options }) => {
|
|
1083
1116
|
try {
|
|
@@ -1112,10 +1145,11 @@ server.registerTool("extract_content", {
|
|
|
1112
1145
|
// Tool: process_document - Multi-format document processing
|
|
1113
1146
|
server.registerTool("process_document", {
|
|
1114
1147
|
description: "Process documents from multiple sources and formats including PDFs and web pages",
|
|
1148
|
+
annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1115
1149
|
inputSchema: {
|
|
1116
|
-
source: z.string(),
|
|
1117
|
-
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
|
|
1118
|
-
options: z.object({}).optional()
|
|
1150
|
+
source: z.string().describe("Document source - URL or file path"),
|
|
1151
|
+
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
|
|
1152
|
+
options: z.object({}).optional().describe("Additional processing options")
|
|
1119
1153
|
}
|
|
1120
1154
|
}, withAuth("process_document", async ({ source, sourceType, options }) => {
|
|
1121
1155
|
try {
|
|
@@ -1150,9 +1184,10 @@ server.registerTool("process_document", {
|
|
|
1150
1184
|
// Tool: summarize_content - Intelligent content summarization
|
|
1151
1185
|
server.registerTool("summarize_content", {
|
|
1152
1186
|
description: "Generate intelligent summaries of text content with configurable options",
|
|
1187
|
+
annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
1153
1188
|
inputSchema: {
|
|
1154
|
-
text: z.string(),
|
|
1155
|
-
options: z.object({}).optional()
|
|
1189
|
+
text: z.string().describe("The text content to summarize"),
|
|
1190
|
+
options: z.object({}).optional().describe("Summarization options")
|
|
1156
1191
|
}
|
|
1157
1192
|
}, withAuth("summarize_content", async ({ text, options }) => {
|
|
1158
1193
|
try {
|
|
@@ -1187,9 +1222,10 @@ server.registerTool("summarize_content", {
|
|
|
1187
1222
|
// Tool: analyze_content - Comprehensive content analysis
|
|
1188
1223
|
server.registerTool("analyze_content", {
|
|
1189
1224
|
description: "Perform comprehensive content analysis including language detection and topic extraction",
|
|
1225
|
+
annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
1190
1226
|
inputSchema: {
|
|
1191
|
-
text: z.string(),
|
|
1192
|
-
options: z.object({}).optional()
|
|
1227
|
+
text: z.string().describe("The text content to analyze"),
|
|
1228
|
+
options: z.object({}).optional().describe("Analysis options")
|
|
1193
1229
|
}
|
|
1194
1230
|
}, withAuth("analyze_content", async ({ text, options }) => {
|
|
1195
1231
|
try {
|
|
@@ -1222,11 +1258,62 @@ server.registerTool("analyze_content", {
|
|
|
1222
1258
|
}));
|
|
1223
1259
|
|
|
1224
1260
|
|
|
1261
|
+
|
|
1262
|
+
// Phase 1: LLM-Powered Structured Extraction
|
|
1263
|
+
|
|
1264
|
+
// Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
|
|
1265
|
+
server.registerTool("extract_structured", {
|
|
1266
|
+
description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
|
|
1267
|
+
annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1268
|
+
inputSchema: {
|
|
1269
|
+
url: z.string().url().describe("The URL to extract structured data from"),
|
|
1270
|
+
schema: z.object({
|
|
1271
|
+
type: z.string().optional(),
|
|
1272
|
+
properties: z.record(z.any()),
|
|
1273
|
+
required: z.array(z.string()).optional()
|
|
1274
|
+
}).describe("JSON schema defining the data structure to extract"),
|
|
1275
|
+
prompt: z.string().optional().describe("Natural language instructions for extraction"),
|
|
1276
|
+
llmConfig: z.object({
|
|
1277
|
+
provider: z.string().optional(),
|
|
1278
|
+
apiKey: z.string().optional()
|
|
1279
|
+
}).optional().describe("LLM provider configuration for AI-powered extraction"),
|
|
1280
|
+
fallbackToSelectors: z.boolean().optional().default(true).describe("Fall back to CSS selector extraction if LLM is unavailable"),
|
|
1281
|
+
selectorHints: z.record(z.string()).optional().describe("CSS selector hints to guide extraction")
|
|
1282
|
+
}
|
|
1283
|
+
}, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
|
|
1284
|
+
try {
|
|
1285
|
+
const result = await extractStructuredTool.execute({
|
|
1286
|
+
url,
|
|
1287
|
+
schema,
|
|
1288
|
+
prompt,
|
|
1289
|
+
llmConfig,
|
|
1290
|
+
fallbackToSelectors,
|
|
1291
|
+
selectorHints
|
|
1292
|
+
});
|
|
1293
|
+
return {
|
|
1294
|
+
content: [{
|
|
1295
|
+
type: "text",
|
|
1296
|
+
text: JSON.stringify(result, null, 2)
|
|
1297
|
+
}]
|
|
1298
|
+
};
|
|
1299
|
+
} catch (error) {
|
|
1300
|
+
return {
|
|
1301
|
+
content: [{
|
|
1302
|
+
type: "text",
|
|
1303
|
+
text: `Structured extraction failed: ${error.message}`
|
|
1304
|
+
}],
|
|
1305
|
+
isError: true
|
|
1306
|
+
};
|
|
1307
|
+
}
|
|
1308
|
+
}));
|
|
1309
|
+
|
|
1310
|
+
|
|
1225
1311
|
// Wave 2 Advanced Tools
|
|
1226
1312
|
|
|
1227
1313
|
// Tool: batch_scrape - Process multiple URLs simultaneously with job management
|
|
1228
1314
|
server.registerTool("batch_scrape", {
|
|
1229
1315
|
description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
|
|
1316
|
+
annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1230
1317
|
inputSchema: {
|
|
1231
1318
|
urls: z.array(z.union([
|
|
1232
1319
|
z.string().url(),
|
|
@@ -1237,27 +1324,27 @@ server.registerTool("batch_scrape", {
|
|
|
1237
1324
|
timeout: z.number().min(1000).max(30000).optional(),
|
|
1238
1325
|
metadata: z.record(z.any()).optional()
|
|
1239
1326
|
})
|
|
1240
|
-
])).min(1).max(50),
|
|
1241
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
|
|
1242
|
-
mode: z.enum(['sync', 'async']).default('sync'),
|
|
1327
|
+
])).min(1).max(50).describe("Array of URLs or URL objects to scrape"),
|
|
1328
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']).describe("Output formats for scraped content"),
|
|
1329
|
+
mode: z.enum(['sync', 'async']).default('sync').describe("Processing mode: sync (wait) or async (background)"),
|
|
1243
1330
|
webhook: z.object({
|
|
1244
1331
|
url: z.string().url(),
|
|
1245
1332
|
events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
|
|
1246
1333
|
headers: z.record(z.string()).optional(),
|
|
1247
1334
|
signingSecret: z.string().optional()
|
|
1248
|
-
}).optional(),
|
|
1249
|
-
extractionSchema: z.record(z.string()).optional(),
|
|
1250
|
-
maxConcurrency: z.number().min(1).max(20).default(10),
|
|
1251
|
-
delayBetweenRequests: z.number().min(0).max(10000).default(100),
|
|
1252
|
-
includeMetadata: z.boolean().default(true),
|
|
1253
|
-
includeFailed: z.boolean().default(true),
|
|
1254
|
-
pageSize: z.number().min(1).max(100).default(25),
|
|
1335
|
+
}).optional().describe("Webhook configuration for async job notifications"),
|
|
1336
|
+
extractionSchema: z.record(z.string()).optional().describe("Schema for structured data extraction from each URL"),
|
|
1337
|
+
maxConcurrency: z.number().min(1).max(20).default(10).describe("Maximum concurrent scraping requests"),
|
|
1338
|
+
delayBetweenRequests: z.number().min(0).max(10000).default(100).describe("Delay in milliseconds between requests"),
|
|
1339
|
+
includeMetadata: z.boolean().default(true).describe("Include page metadata in results"),
|
|
1340
|
+
includeFailed: z.boolean().default(true).describe("Include failed URLs in results"),
|
|
1341
|
+
pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page"),
|
|
1255
1342
|
jobOptions: z.object({
|
|
1256
1343
|
priority: z.number().default(0),
|
|
1257
1344
|
ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
|
|
1258
1345
|
maxRetries: z.number().min(0).max(5).default(1),
|
|
1259
1346
|
tags: z.array(z.string()).default([])
|
|
1260
|
-
}).optional()
|
|
1347
|
+
}).optional().describe("Job management options for async processing")
|
|
1261
1348
|
}
|
|
1262
1349
|
}, withAuth("batch_scrape", async (params) => {
|
|
1263
1350
|
try {
|
|
@@ -1282,8 +1369,9 @@ server.registerTool("batch_scrape", {
|
|
|
1282
1369
|
// Tool: scrape_with_actions - Execute action chains before scraping
|
|
1283
1370
|
server.registerTool("scrape_with_actions", {
|
|
1284
1371
|
description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
|
|
1372
|
+
annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1285
1373
|
inputSchema: {
|
|
1286
|
-
url: z.string().url(),
|
|
1374
|
+
url: z.string().url().describe("The URL to scrape"),
|
|
1287
1375
|
actions: z.array(z.object({
|
|
1288
1376
|
type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
|
|
1289
1377
|
selector: z.string().optional(),
|
|
@@ -1294,10 +1382,10 @@ server.registerTool("scrape_with_actions", {
|
|
|
1294
1382
|
description: z.string().optional(),
|
|
1295
1383
|
continueOnError: z.boolean().default(false),
|
|
1296
1384
|
retries: z.number().min(0).max(5).default(0)
|
|
1297
|
-
})).min(1).max(20),
|
|
1298
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
|
|
1299
|
-
captureIntermediateStates: z.boolean().default(false),
|
|
1300
|
-
captureScreenshots: z.boolean().default(true),
|
|
1385
|
+
})).min(1).max(20).describe("Browser actions to perform before scraping"),
|
|
1386
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
|
|
1387
|
+
captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
|
|
1388
|
+
captureScreenshots: z.boolean().default(true).describe("Take screenshots during action execution"),
|
|
1301
1389
|
formAutoFill: z.object({
|
|
1302
1390
|
fields: z.array(z.object({
|
|
1303
1391
|
selector: z.string(),
|
|
@@ -1307,23 +1395,23 @@ server.registerTool("scrape_with_actions", {
|
|
|
1307
1395
|
})),
|
|
1308
1396
|
submitSelector: z.string().optional(),
|
|
1309
1397
|
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
1310
|
-
}).optional(),
|
|
1398
|
+
}).optional().describe("Form auto-fill configuration"),
|
|
1311
1399
|
browserOptions: z.object({
|
|
1312
1400
|
headless: z.boolean().default(true),
|
|
1313
1401
|
userAgent: z.string().optional(),
|
|
1314
1402
|
viewportWidth: z.number().min(800).max(1920).default(1280),
|
|
1315
1403
|
viewportHeight: z.number().min(600).max(1080).default(720),
|
|
1316
1404
|
timeout: z.number().min(10000).max(120000).default(30000)
|
|
1317
|
-
}).optional(),
|
|
1405
|
+
}).optional().describe("Browser configuration options"),
|
|
1318
1406
|
extractionOptions: z.object({
|
|
1319
1407
|
selectors: z.record(z.string()).optional(),
|
|
1320
1408
|
includeMetadata: z.boolean().default(true),
|
|
1321
1409
|
includeLinks: z.boolean().default(true),
|
|
1322
1410
|
includeImages: z.boolean().default(true)
|
|
1323
|
-
}).optional(),
|
|
1324
|
-
continueOnActionError: z.boolean().default(false),
|
|
1325
|
-
maxRetries: z.number().min(0).max(3).default(1),
|
|
1326
|
-
screenshotOnError: z.boolean().default(true)
|
|
1411
|
+
}).optional().describe("Content extraction options"),
|
|
1412
|
+
continueOnActionError: z.boolean().default(false).describe("Continue executing actions if one fails"),
|
|
1413
|
+
maxRetries: z.number().min(0).max(3).default(1).describe("Maximum retry attempts on failure"),
|
|
1414
|
+
screenshotOnError: z.boolean().default(true).describe("Capture screenshot when an error occurs")
|
|
1327
1415
|
}
|
|
1328
1416
|
}, withAuth("scrape_with_actions", async (params) => {
|
|
1329
1417
|
try {
|
|
@@ -1348,27 +1436,28 @@ server.registerTool("scrape_with_actions", {
|
|
|
1348
1436
|
// Tool: deep_research - Comprehensive multi-stage research with source verification
|
|
1349
1437
|
server.registerTool("deep_research", {
|
|
1350
1438
|
description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
|
|
1439
|
+
annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1351
1440
|
inputSchema: {
|
|
1352
|
-
topic: z.string().min(3).max(500),
|
|
1353
|
-
maxDepth: z.number().min(1).max(10).optional().default(5),
|
|
1354
|
-
maxUrls: z.number().min(1).max(1000).optional().default(50),
|
|
1355
|
-
timeLimit: z.number().min(30000).max(300000).optional().default(120000),
|
|
1356
|
-
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
|
|
1357
|
-
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
|
|
1358
|
-
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
|
|
1359
|
-
includeRecentOnly: z.boolean().optional().default(false),
|
|
1360
|
-
enableConflictDetection: z.boolean().optional().default(true),
|
|
1361
|
-
enableSourceVerification: z.boolean().optional().default(true),
|
|
1362
|
-
enableSynthesis: z.boolean().optional().default(true),
|
|
1363
|
-
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
|
|
1364
|
-
includeRawData: z.boolean().optional().default(false),
|
|
1365
|
-
includeActivityLog: z.boolean().optional().default(false),
|
|
1441
|
+
topic: z.string().min(3).max(500).describe("Research topic or question"),
|
|
1442
|
+
maxDepth: z.number().min(1).max(10).optional().default(5).describe("Maximum research depth"),
|
|
1443
|
+
maxUrls: z.number().min(1).max(1000).optional().default(50).describe("Maximum URLs to analyze"),
|
|
1444
|
+
timeLimit: z.number().min(30000).max(300000).optional().default(120000).describe("Time limit in milliseconds for the research"),
|
|
1445
|
+
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad').describe("Research methodology approach"),
|
|
1446
|
+
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']).describe("Types of sources to include"),
|
|
1447
|
+
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3).describe("Minimum credibility score for sources (0-1)"),
|
|
1448
|
+
includeRecentOnly: z.boolean().optional().default(false).describe("Only include recent sources"),
|
|
1449
|
+
enableConflictDetection: z.boolean().optional().default(true).describe("Detect conflicting information across sources"),
|
|
1450
|
+
enableSourceVerification: z.boolean().optional().default(true).describe("Verify source credibility"),
|
|
1451
|
+
enableSynthesis: z.boolean().optional().default(true).describe("Synthesize findings into a coherent report"),
|
|
1452
|
+
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive').describe("Output format for the research report"),
|
|
1453
|
+
includeRawData: z.boolean().optional().default(false).describe("Include raw scraped data in output"),
|
|
1454
|
+
includeActivityLog: z.boolean().optional().default(false).describe("Include detailed activity log"),
|
|
1366
1455
|
queryExpansion: z.object({
|
|
1367
1456
|
enableSynonyms: z.boolean().optional().default(true),
|
|
1368
1457
|
enableSpellCheck: z.boolean().optional().default(true),
|
|
1369
1458
|
enableContextual: z.boolean().optional().default(true),
|
|
1370
1459
|
maxVariations: z.number().min(1).max(20).optional().default(8)
|
|
1371
|
-
}).optional(),
|
|
1460
|
+
}).optional().describe("Query expansion settings for broader search coverage"),
|
|
1372
1461
|
llmConfig: z.object({
|
|
1373
1462
|
provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
|
|
1374
1463
|
openai: z.object({
|
|
@@ -1382,14 +1471,14 @@ server.registerTool("deep_research", {
|
|
|
1382
1471
|
}).optional(),
|
|
1383
1472
|
enableSemanticAnalysis: z.boolean().optional().default(true),
|
|
1384
1473
|
enableIntelligentSynthesis: z.boolean().optional().default(true)
|
|
1385
|
-
}).optional(),
|
|
1386
|
-
concurrency: z.number().min(1).max(20).optional().default(5),
|
|
1387
|
-
cacheResults: z.boolean().optional().default(true),
|
|
1474
|
+
}).optional().describe("LLM provider configuration for AI-powered analysis"),
|
|
1475
|
+
concurrency: z.number().min(1).max(20).optional().default(5).describe("Number of concurrent research requests"),
|
|
1476
|
+
cacheResults: z.boolean().optional().default(true).describe("Cache research results for reuse"),
|
|
1388
1477
|
webhook: z.object({
|
|
1389
1478
|
url: z.string().url(),
|
|
1390
1479
|
events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
|
|
1391
1480
|
headers: z.record(z.string()).optional()
|
|
1392
|
-
}).optional()
|
|
1481
|
+
}).optional().describe("Webhook for progress and completion notifications")
|
|
1393
1482
|
}
|
|
1394
1483
|
}, withAuth("deep_research", async (params) => {
|
|
1395
1484
|
try {
|
|
@@ -1414,13 +1503,14 @@ server.registerTool("deep_research", {
|
|
|
1414
1503
|
// Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
|
|
1415
1504
|
server.registerTool("track_changes", {
|
|
1416
1505
|
description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
|
|
1506
|
+
annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1417
1507
|
inputSchema: {
|
|
1418
|
-
url: z.string().url(),
|
|
1508
|
+
url: z.string().url().describe("The URL to track changes for"),
|
|
1419
1509
|
operation: z.enum([
|
|
1420
|
-
'create_baseline',
|
|
1421
|
-
'compare',
|
|
1422
|
-
'monitor',
|
|
1423
|
-
'get_history',
|
|
1510
|
+
'create_baseline',
|
|
1511
|
+
'compare',
|
|
1512
|
+
'monitor',
|
|
1513
|
+
'get_history',
|
|
1424
1514
|
'get_stats',
|
|
1425
1515
|
'create_scheduled_monitor',
|
|
1426
1516
|
'stop_scheduled_monitor',
|
|
@@ -1429,9 +1519,9 @@ server.registerTool("track_changes", {
|
|
|
1429
1519
|
'create_alert_rule',
|
|
1430
1520
|
'generate_trend_report',
|
|
1431
1521
|
'get_monitoring_templates'
|
|
1432
|
-
]).default('compare'),
|
|
1433
|
-
content: z.string().optional(),
|
|
1434
|
-
html: z.string().optional(),
|
|
1522
|
+
]).default('compare').describe("Tracking operation to perform"),
|
|
1523
|
+
content: z.string().optional().describe("Content to compare against baseline"),
|
|
1524
|
+
html: z.string().optional().describe("HTML content to compare against baseline"),
|
|
1435
1525
|
trackingOptions: z.object({
|
|
1436
1526
|
granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
|
|
1437
1527
|
trackText: z.boolean().default(true),
|
|
@@ -1448,7 +1538,7 @@ server.registerTool("track_changes", {
|
|
|
1448
1538
|
moderate: z.number().min(0).max(1).default(0.3),
|
|
1449
1539
|
major: z.number().min(0).max(1).default(0.7)
|
|
1450
1540
|
}).optional()
|
|
1451
|
-
}).optional(),
|
|
1541
|
+
}).optional().describe("Options for how changes are tracked and compared"),
|
|
1452
1542
|
monitoringOptions: z.object({
|
|
1453
1543
|
enabled: z.boolean().default(false),
|
|
1454
1544
|
interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
|
|
@@ -1458,14 +1548,14 @@ server.registerTool("track_changes", {
|
|
|
1458
1548
|
enableWebhook: z.boolean().default(false),
|
|
1459
1549
|
webhookUrl: z.string().url().optional(),
|
|
1460
1550
|
webhookSecret: z.string().optional()
|
|
1461
|
-
}).optional(),
|
|
1551
|
+
}).optional().describe("Monitoring schedule and notification settings"),
|
|
1462
1552
|
storageOptions: z.object({
|
|
1463
1553
|
enableSnapshots: z.boolean().default(true),
|
|
1464
1554
|
retainHistory: z.boolean().default(true),
|
|
1465
1555
|
maxHistoryEntries: z.number().min(1).max(1000).default(100),
|
|
1466
1556
|
compressionEnabled: z.boolean().default(true),
|
|
1467
1557
|
deltaStorageEnabled: z.boolean().default(true)
|
|
1468
|
-
}).optional(),
|
|
1558
|
+
}).optional().describe("Storage and history retention settings"),
|
|
1469
1559
|
queryOptions: z.object({
|
|
1470
1560
|
limit: z.number().min(1).max(500).default(50),
|
|
1471
1561
|
offset: z.number().min(0).default(0),
|
|
@@ -1473,7 +1563,7 @@ server.registerTool("track_changes", {
|
|
|
1473
1563
|
endTime: z.number().optional(),
|
|
1474
1564
|
includeContent: z.boolean().default(false),
|
|
1475
1565
|
significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
|
|
1476
|
-
}).optional(),
|
|
1566
|
+
}).optional().describe("Query options for history and stats retrieval"),
|
|
1477
1567
|
notificationOptions: z.object({
|
|
1478
1568
|
webhook: z.object({
|
|
1479
1569
|
enabled: z.boolean().default(false),
|
|
@@ -1489,32 +1579,32 @@ server.registerTool("track_changes", {
|
|
|
1489
1579
|
channel: z.string().optional(),
|
|
1490
1580
|
username: z.string().optional()
|
|
1491
1581
|
}).optional()
|
|
1492
|
-
}).optional(),
|
|
1582
|
+
}).optional().describe("Notification configuration for webhooks and Slack"),
|
|
1493
1583
|
// Enhanced Phase 2.4 options
|
|
1494
1584
|
scheduledMonitorOptions: z.object({
|
|
1495
1585
|
schedule: z.string().optional(), // Cron expression
|
|
1496
1586
|
templateId: z.string().optional(), // Monitoring template ID
|
|
1497
1587
|
enabled: z.boolean().default(true)
|
|
1498
|
-
}).optional(),
|
|
1588
|
+
}).optional().describe("Scheduled monitoring options with cron expressions"),
|
|
1499
1589
|
alertRuleOptions: z.object({
|
|
1500
1590
|
ruleId: z.string().optional(),
|
|
1501
1591
|
condition: z.string().optional(), // Condition description
|
|
1502
1592
|
actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
|
|
1503
1593
|
throttle: z.number().min(0).optional(),
|
|
1504
1594
|
priority: z.enum(['low', 'medium', 'high']).optional()
|
|
1505
|
-
}).optional(),
|
|
1595
|
+
}).optional().describe("Alert rule configuration for change notifications"),
|
|
1506
1596
|
exportOptions: z.object({
|
|
1507
1597
|
format: z.enum(['json', 'csv']).default('json'),
|
|
1508
1598
|
startTime: z.number().optional(),
|
|
1509
1599
|
endTime: z.number().optional(),
|
|
1510
1600
|
includeContent: z.boolean().default(false),
|
|
1511
1601
|
includeSnapshots: z.boolean().default(false)
|
|
1512
|
-
}).optional(),
|
|
1602
|
+
}).optional().describe("Export options for change history data"),
|
|
1513
1603
|
dashboardOptions: z.object({
|
|
1514
1604
|
includeRecentAlerts: z.boolean().default(true),
|
|
1515
1605
|
includeTrends: z.boolean().default(true),
|
|
1516
1606
|
includeMonitorStatus: z.boolean().default(true)
|
|
1517
|
-
}).optional()
|
|
1607
|
+
}).optional().describe("Dashboard display options")
|
|
1518
1608
|
}
|
|
1519
1609
|
}, withAuth("track_changes", async (params) => {
|
|
1520
1610
|
try {
|
|
@@ -1539,8 +1629,9 @@ server.registerTool("track_changes", {
|
|
|
1539
1629
|
// Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
|
|
1540
1630
|
server.registerTool("generate_llms_txt", {
|
|
1541
1631
|
description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
|
|
1632
|
+
annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1542
1633
|
inputSchema: {
|
|
1543
|
-
url: z.string().url(),
|
|
1634
|
+
url: z.string().url().describe("The website URL to generate llms.txt for"),
|
|
1544
1635
|
analysisOptions: z.object({
|
|
1545
1636
|
maxDepth: z.number().min(1).max(5).optional().default(3),
|
|
1546
1637
|
maxPages: z.number().min(10).max(500).optional().default(100),
|
|
@@ -1548,7 +1639,7 @@ server.registerTool("generate_llms_txt", {
|
|
|
1548
1639
|
analyzeContent: z.boolean().optional().default(true),
|
|
1549
1640
|
checkSecurity: z.boolean().optional().default(true),
|
|
1550
1641
|
respectRobots: z.boolean().optional().default(true)
|
|
1551
|
-
}).optional(),
|
|
1642
|
+
}).optional().describe("Website analysis options for depth, scope, and detection"),
|
|
1552
1643
|
outputOptions: z.object({
|
|
1553
1644
|
includeDetailed: z.boolean().optional().default(true),
|
|
1554
1645
|
includeAnalysis: z.boolean().optional().default(false),
|
|
@@ -1556,9 +1647,9 @@ server.registerTool("generate_llms_txt", {
|
|
|
1556
1647
|
organizationName: z.string().optional(),
|
|
1557
1648
|
customGuidelines: z.array(z.string()).optional(),
|
|
1558
1649
|
customRestrictions: z.array(z.string()).optional()
|
|
1559
|
-
}).optional(),
|
|
1560
|
-
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
|
|
1561
|
-
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
|
|
1650
|
+
}).optional().describe("Output customization and organization details"),
|
|
1651
|
+
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe("Compliance level for generated guidelines"),
|
|
1652
|
+
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe("Output format: llms.txt, llms-full.txt, or both")
|
|
1562
1653
|
}
|
|
1563
1654
|
}, withAuth("generate_llms_txt", async (params) => {
|
|
1564
1655
|
try {
|
|
@@ -1583,8 +1674,9 @@ server.registerTool("generate_llms_txt", {
|
|
|
1583
1674
|
// Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
|
|
1584
1675
|
server.registerTool("stealth_mode", {
|
|
1585
1676
|
description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
|
|
1677
|
+
annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1586
1678
|
inputSchema: {
|
|
1587
|
-
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
|
|
1679
|
+
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
|
|
1588
1680
|
stealthConfig: z.object({
|
|
1589
1681
|
level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
|
|
1590
1682
|
randomizeFingerprint: z.boolean().default(true),
|
|
@@ -1622,9 +1714,9 @@ server.registerTool("stealth_mode", {
|
|
|
1622
1714
|
fontSpoofing: z.boolean().default(true),
|
|
1623
1715
|
hardwareSpoofing: z.boolean().default(true)
|
|
1624
1716
|
}).optional()
|
|
1625
|
-
}).optional(),
|
|
1626
|
-
contextId: z.string().optional(),
|
|
1627
|
-
urlToTest: z.string().url().optional()
|
|
1717
|
+
}).optional().describe("Stealth browser configuration with anti-detection settings"),
|
|
1718
|
+
contextId: z.string().optional().describe("Browser context ID for page operations"),
|
|
1719
|
+
urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
|
|
1628
1720
|
}
|
|
1629
1721
|
}, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
|
|
1630
1722
|
try {
|
|
@@ -1704,20 +1796,21 @@ server.registerTool("stealth_mode", {
|
|
|
1704
1796
|
// Tool: localization - Multi-language and geo-location management (Wave 3)
|
|
1705
1797
|
server.registerTool("localization", {
|
|
1706
1798
|
description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
|
|
1799
|
+
annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1707
1800
|
inputSchema: {
|
|
1708
|
-
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
|
|
1709
|
-
countryCode: z.string().length(2).optional(),
|
|
1710
|
-
language: z.string().optional(),
|
|
1711
|
-
timezone: z.string().optional(),
|
|
1712
|
-
currency: z.string().length(3).optional(),
|
|
1713
|
-
customHeaders: z.record(z.string()).optional(),
|
|
1714
|
-
userAgent: z.string().optional(),
|
|
1715
|
-
acceptLanguage: z.string().optional(),
|
|
1801
|
+
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
|
|
1802
|
+
countryCode: z.string().length(2).optional().describe("ISO 3166-1 alpha-2 country code"),
|
|
1803
|
+
language: z.string().optional().describe("Language code (e.g. 'en', 'fr', 'de')"),
|
|
1804
|
+
timezone: z.string().optional().describe("IANA timezone identifier (e.g. 'America/New_York')"),
|
|
1805
|
+
currency: z.string().length(3).optional().describe("ISO 4217 currency code (e.g. 'USD', 'EUR')"),
|
|
1806
|
+
customHeaders: z.record(z.string()).optional().describe("Custom HTTP headers for localized requests"),
|
|
1807
|
+
userAgent: z.string().optional().describe("Custom user agent string"),
|
|
1808
|
+
acceptLanguage: z.string().optional().describe("Accept-Language header value"),
|
|
1716
1809
|
geoLocation: z.object({
|
|
1717
1810
|
latitude: z.number().min(-90).max(90),
|
|
1718
1811
|
longitude: z.number().min(-180).max(180),
|
|
1719
1812
|
accuracy: z.number().min(1).max(100).optional()
|
|
1720
|
-
}).optional(),
|
|
1813
|
+
}).optional().describe("GPS coordinates for geolocation emulation"),
|
|
1721
1814
|
proxySettings: z.object({
|
|
1722
1815
|
enabled: z.boolean().default(false),
|
|
1723
1816
|
region: z.string().optional(),
|
|
@@ -1736,26 +1829,26 @@ server.registerTool("localization", {
|
|
|
1736
1829
|
maxRetries: z.number().default(3),
|
|
1737
1830
|
timeout: z.number().default(10000)
|
|
1738
1831
|
}).optional()
|
|
1739
|
-
}).optional(),
|
|
1832
|
+
}).optional().describe("Proxy configuration for geo-targeted requests"),
|
|
1740
1833
|
searchParams: z.object({
|
|
1741
1834
|
query: z.string().optional(),
|
|
1742
1835
|
limit: z.number().optional(),
|
|
1743
1836
|
offset: z.number().optional(),
|
|
1744
1837
|
headers: z.record(z.string()).optional()
|
|
1745
|
-
}).optional(),
|
|
1838
|
+
}).optional().describe("Search parameters for localized search queries"),
|
|
1746
1839
|
browserOptions: z.object({
|
|
1747
1840
|
locale: z.string().optional(),
|
|
1748
1841
|
timezoneId: z.string().optional(),
|
|
1749
1842
|
extraHTTPHeaders: z.record(z.string()).optional(),
|
|
1750
1843
|
userAgent: z.string().optional()
|
|
1751
|
-
}).optional(),
|
|
1752
|
-
content: z.string().optional(),
|
|
1753
|
-
url: z.string().url().optional(),
|
|
1844
|
+
}).optional().describe("Browser context options for locale emulation"),
|
|
1845
|
+
content: z.string().optional().describe("Content for auto-detection of language and locale"),
|
|
1846
|
+
url: z.string().url().optional().describe("URL for geo-blocking detection or auto-detection"),
|
|
1754
1847
|
response: z.object({
|
|
1755
1848
|
status: z.number(),
|
|
1756
1849
|
body: z.string().optional(),
|
|
1757
1850
|
statusText: z.string().optional()
|
|
1758
|
-
}).optional()
|
|
1851
|
+
}).optional().describe("HTTP response for geo-blocking analysis")
|
|
1759
1852
|
}
|
|
1760
1853
|
}, withAuth("localization", async (params) => {
|
|
1761
1854
|
try {
|
|
@@ -1837,11 +1930,92 @@ server.registerTool("localization", {
|
|
|
1837
1930
|
}
|
|
1838
1931
|
}));
|
|
1839
1932
|
|
|
1840
|
-
//
|
|
1933
|
+
// Determine transport mode: HTTP if --http flag or MCP_HTTP env var is set
|
|
1934
|
+
const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
|
|
1935
|
+
|
|
1936
|
+
// Set up transport and start the server
|
|
1841
1937
|
async function runServer() {
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1938
|
+
if (useHttp) {
|
|
1939
|
+
const port = parseInt(process.env.PORT || '3000', 10);
|
|
1940
|
+
|
|
1941
|
+
// Stateless transport — no session tracking, each request is independent
|
|
1942
|
+
// This avoids the bug where server.connect(newTransport) kills previous sessions
|
|
1943
|
+
const transport = new StreamableHTTPServerTransport({
|
|
1944
|
+
sessionIdGenerator: undefined,
|
|
1945
|
+
});
|
|
1946
|
+
await server.connect(transport);
|
|
1947
|
+
|
|
1948
|
+
const httpServer = createServer(async (req, res) => {
|
|
1949
|
+
// CORS headers for Smithery gateway
|
|
1950
|
+
res.setHeader('Access-Control-Allow-Origin', '*');
|
|
1951
|
+
res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
|
|
1952
|
+
res.setHeader('Access-Control-Allow-Headers', 'Content-Type, mcp-session-id');
|
|
1953
|
+
res.setHeader('Access-Control-Expose-Headers', 'mcp-session-id');
|
|
1954
|
+
|
|
1955
|
+
if (req.method === 'OPTIONS') {
|
|
1956
|
+
res.writeHead(204);
|
|
1957
|
+
res.end();
|
|
1958
|
+
return;
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1961
|
+
// Health check endpoint
|
|
1962
|
+
if (req.url === '/health') {
|
|
1963
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1964
|
+
res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
|
|
1965
|
+
return;
|
|
1966
|
+
}
|
|
1967
|
+
|
|
1968
|
+
// MCP server card for Smithery discovery
|
|
1969
|
+
if (req.url === '/.well-known/mcp/server-card.json') {
|
|
1970
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1971
|
+
res.end(JSON.stringify({
|
|
1972
|
+
serverInfo: {
|
|
1973
|
+
name: "crawlforge",
|
|
1974
|
+
version: "3.0.12",
|
|
1975
|
+
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
1976
|
+
homepage: "https://www.crawlforge.dev",
|
|
1977
|
+
icon: "https://www.crawlforge.dev/icon.png"
|
|
1978
|
+
},
|
|
1979
|
+
transport: {
|
|
1980
|
+
type: "streamable-http",
|
|
1981
|
+
url: "/mcp"
|
|
1982
|
+
},
|
|
1983
|
+
configSchema: {
|
|
1984
|
+
type: "object",
|
|
1985
|
+
properties: {
|
|
1986
|
+
apiKey: {
|
|
1987
|
+
type: "string",
|
|
1988
|
+
title: "CrawlForge API Key",
|
|
1989
|
+
description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
|
|
1990
|
+
"x-from": { header: "x-api-key" }
|
|
1991
|
+
}
|
|
1992
|
+
},
|
|
1993
|
+
required: ["apiKey"]
|
|
1994
|
+
}
|
|
1995
|
+
}));
|
|
1996
|
+
return;
|
|
1997
|
+
}
|
|
1998
|
+
|
|
1999
|
+
// Route /mcp to the transport handler
|
|
2000
|
+
if (req.url === '/mcp' || req.url === '/') {
|
|
2001
|
+
await transport.handleRequest(req, res);
|
|
2002
|
+
return;
|
|
2003
|
+
}
|
|
2004
|
+
|
|
2005
|
+
res.writeHead(404);
|
|
2006
|
+
res.end('Not Found');
|
|
2007
|
+
});
|
|
2008
|
+
|
|
2009
|
+
httpServer.listen(port, () => {
|
|
2010
|
+
console.error(`CrawlForge MCP Server v3.0 running on HTTP port ${port}`);
|
|
2011
|
+
console.error(`MCP endpoint: http://localhost:${port}/mcp`);
|
|
2012
|
+
console.error(`Health check: http://localhost:${port}/health`);
|
|
2013
|
+
});
|
|
2014
|
+
} else {
|
|
2015
|
+
const transport = new StdioServerTransport();
|
|
2016
|
+
await server.connect(transport);
|
|
2017
|
+
console.error("CrawlForge MCP Server v3.0 running on stdio");
|
|
2018
|
+
}
|
|
1845
2019
|
console.error(`Environment: ${config.server.nodeEnv}`);
|
|
1846
2020
|
|
|
1847
2021
|
console.error("Search enabled: true (via CrawlForge proxy)");
|
|
@@ -1854,7 +2028,8 @@ async function runServer() {
|
|
|
1854
2028
|
const trackingTools = ", track_changes";
|
|
1855
2029
|
const llmsTxtTools = ", generate_llms_txt";
|
|
1856
2030
|
const wave3Tools = ", stealth_mode, localization";
|
|
1857
|
-
|
|
2031
|
+
const phase1Tools = ", extract_structured";
|
|
2032
|
+
console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
|
|
1858
2033
|
|
|
1859
2034
|
|
|
1860
2035
|
// === MEMORY LEAK PREVENTION ===
|
|
@@ -1880,7 +2055,8 @@ async function gracefulShutdown(signal) {
|
|
|
1880
2055
|
trackChangesTool,
|
|
1881
2056
|
generateLLMsTxtTool,
|
|
1882
2057
|
stealthBrowserManager,
|
|
1883
|
-
localizationManager
|
|
2058
|
+
localizationManager,
|
|
2059
|
+
extractStructuredTool
|
|
1884
2060
|
].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
|
|
1885
2061
|
|
|
1886
2062
|
console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
|