crawlforge-mcp-server 3.0.12 → 3.0.13
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/CLAUDE.md +103 -324
- package/package.json +2 -1
- package/server.js +332 -169
- package/src/core/AuthManager.js +5 -2
- package/src/core/ChangeTracker.js +1 -1
- package/src/core/ResearchOrchestrator.js +43 -5
- package/src/core/analysis/ContentAnalyzer.js +70 -17
- package/src/core/analysis/sentenceUtils.js +73 -0
- package/src/core/creatorMode.js +47 -0
- package/src/core/llm/LLMManager.js +120 -0
- package/src/core/processing/BrowserProcessor.js +1 -1
- package/src/tools/extract/extractStructured.js +280 -0
- package/src/tools/extract/summarizeContent.js +3 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +21 -21
- package/src/tools/search/searchWeb.js +1 -1
package/server.js
CHANGED
|
@@ -1,45 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
-
//
|
|
4
|
-
//
|
|
5
|
-
|
|
6
|
-
import dotenv from 'dotenv';
|
|
3
|
+
// Creator Mode Authentication — imported from src/core/creatorMode.js
|
|
4
|
+
// This MUST be the first import so the secret is verified before any tool code runs.
|
|
5
|
+
export { isCreatorModeVerified } from './src/core/creatorMode.js';
|
|
7
6
|
|
|
8
|
-
//
|
|
9
|
-
dotenv.config({ path: '.env', quiet: true });
|
|
10
|
-
|
|
11
|
-
// SECURITY: Clear any externally-set creator mode env var to prevent bypass
|
|
12
|
-
delete process.env.CRAWLFORGE_CREATOR_MODE;
|
|
13
|
-
|
|
14
|
-
const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';
|
|
15
|
-
|
|
16
|
-
// Module-scoped flag - cannot be set externally
|
|
17
|
-
let _creatorModeVerified = false;
|
|
18
|
-
|
|
19
|
-
if (process.env.CRAWLFORGE_CREATOR_SECRET) {
|
|
20
|
-
const providedHash = crypto
|
|
21
|
-
.createHash('sha256')
|
|
22
|
-
.update(process.env.CRAWLFORGE_CREATOR_SECRET)
|
|
23
|
-
.digest('hex');
|
|
24
|
-
|
|
25
|
-
if (crypto.timingSafeEqual(Buffer.from(providedHash, 'hex'), Buffer.from(CREATOR_SECRET_HASH, 'hex'))) {
|
|
26
|
-
_creatorModeVerified = true;
|
|
27
|
-
console.log('🔓 Creator Mode Enabled - Unlimited Access');
|
|
28
|
-
} else {
|
|
29
|
-
console.warn('⚠️ Invalid creator secret provided');
|
|
30
|
-
}
|
|
31
|
-
// Clean up the secret from environment
|
|
32
|
-
delete process.env.CRAWLFORGE_CREATOR_SECRET;
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
// Export getter for AuthManager to use
|
|
36
|
-
export function isCreatorModeVerified() {
|
|
37
|
-
return _creatorModeVerified;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// Now import everything else
|
|
7
|
+
// Import everything else
|
|
41
8
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
42
9
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
10
|
+
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
11
|
+
import { createServer } from "node:http";
|
|
12
|
+
import { randomUUID } from "node:crypto";
|
|
43
13
|
import { z } from "zod";
|
|
44
14
|
import { load } from "cheerio";
|
|
45
15
|
import { SearchWebTool } from "./src/tools/search/searchWeb.js";
|
|
@@ -49,6 +19,8 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
|
|
|
49
19
|
import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
|
|
50
20
|
import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
|
|
51
21
|
import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
|
|
22
|
+
// Phase 1: LLM-Powered Structured Extraction
|
|
23
|
+
import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
|
|
52
24
|
// Wave 2 Advanced Tools
|
|
53
25
|
import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
|
|
54
26
|
import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
|
|
@@ -110,7 +82,43 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
110
82
|
}
|
|
111
83
|
|
|
112
84
|
// Create the server
|
|
113
|
-
const server = new McpServer({
|
|
85
|
+
const server = new McpServer({
|
|
86
|
+
name: "crawlforge",
|
|
87
|
+
version: "3.0.12",
|
|
88
|
+
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
89
|
+
homepage: "https://www.crawlforge.dev",
|
|
90
|
+
icon: "https://www.crawlforge.dev/icon.png"
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
// Register getting-started prompt
|
|
94
|
+
server.prompt("getting-started", {
|
|
95
|
+
description: "Get started with CrawlForge MCP - learn available tools and best practices",
|
|
96
|
+
}, async () => {
|
|
97
|
+
return {
|
|
98
|
+
messages: [{
|
|
99
|
+
role: "user",
|
|
100
|
+
content: {
|
|
101
|
+
type: "text",
|
|
102
|
+
text: "You have access to CrawlForge MCP with 20 web scraping tools. Key tools:\n\n" +
|
|
103
|
+
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
104
|
+
"- extract_text: Extract clean text from a webpage\n" +
|
|
105
|
+
"- extract_content: Smart content extraction with readability\n" +
|
|
106
|
+
"- search_web: Search the web and get structured results\n" +
|
|
107
|
+
"- crawl_deep: Crawl a website following links to a specified depth\n" +
|
|
108
|
+
"- map_site: Discover all pages on a website\n" +
|
|
109
|
+
"- batch_scrape: Scrape multiple URLs in parallel\n" +
|
|
110
|
+
"- scrape_with_actions: Automate browser actions then scrape\n" +
|
|
111
|
+
"- deep_research: Multi-source research on any topic\n" +
|
|
112
|
+
"- stealth_mode: Anti-detection browsing for protected sites\n" +
|
|
113
|
+
"- extract_structured: LLM-powered structured data extraction\n" +
|
|
114
|
+
"- track_changes: Monitor website changes over time\n" +
|
|
115
|
+
"- generate_llms_txt: Generate llms.txt for any website\n\n" +
|
|
116
|
+
"Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
|
|
117
|
+
"Get your API key at https://www.crawlforge.dev/signup (1,000 free credits)"
|
|
118
|
+
}
|
|
119
|
+
}]
|
|
120
|
+
};
|
|
121
|
+
});
|
|
114
122
|
|
|
115
123
|
// Helper function to wrap tool handlers with authentication and credit tracking
|
|
116
124
|
function withAuth(toolName, handler) {
|
|
@@ -184,6 +192,9 @@ const processDocumentTool = new ProcessDocumentTool();
|
|
|
184
192
|
const summarizeContentTool = new SummarizeContentTool();
|
|
185
193
|
const analyzeContentTool = new AnalyzeContentTool();
|
|
186
194
|
|
|
195
|
+
// Phase 1: LLM-Powered Structured Extraction Tool
|
|
196
|
+
const extractStructuredTool = new ExtractStructuredTool();
|
|
197
|
+
|
|
187
198
|
// Initialize Wave 2 Advanced Tools
|
|
188
199
|
const batchScrapeTool = new BatchScrapeTool();
|
|
189
200
|
const scrapeWithActionsTool = new ScrapeWithActionsTool();
|
|
@@ -633,10 +644,11 @@ async function fetchWithTimeout(url, options = {}) {
|
|
|
633
644
|
// Tool: fetch_url - Basic URL fetching with headers and response handling
|
|
634
645
|
server.registerTool("fetch_url", {
|
|
635
646
|
description: "Fetch content from a URL with optional headers and timeout",
|
|
647
|
+
annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
636
648
|
inputSchema: {
|
|
637
|
-
url: z.string().url(),
|
|
638
|
-
headers: z.record(z.string()).optional(),
|
|
639
|
-
timeout: z.number().min(1000).max(30000).optional().default(10000)
|
|
649
|
+
url: z.string().url().describe("The URL to fetch content from"),
|
|
650
|
+
headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
|
|
651
|
+
timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
|
|
640
652
|
}
|
|
641
653
|
}, withAuth("fetch_url", async ({ url, headers, timeout }) => {
|
|
642
654
|
try {
|
|
@@ -679,10 +691,11 @@ server.registerTool("fetch_url", {
|
|
|
679
691
|
// Tool: extract_text - Extract clean text content from HTML
|
|
680
692
|
server.registerTool("extract_text", {
|
|
681
693
|
description: "Extract clean text content from a webpage",
|
|
694
|
+
annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
682
695
|
inputSchema: {
|
|
683
|
-
url: z.string().url(),
|
|
684
|
-
remove_scripts: z.boolean().optional().default(true),
|
|
685
|
-
remove_styles: z.boolean().optional().default(true)
|
|
696
|
+
url: z.string().url().describe("The URL to extract text from"),
|
|
697
|
+
remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
|
|
698
|
+
remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
|
|
686
699
|
}
|
|
687
700
|
}, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
|
|
688
701
|
try {
|
|
@@ -733,10 +746,11 @@ server.registerTool("extract_text", {
|
|
|
733
746
|
// Tool: extract_links - Extract all links from a webpage with optional filtering
|
|
734
747
|
server.registerTool("extract_links", {
|
|
735
748
|
description: "Extract all links from a webpage with optional filtering",
|
|
749
|
+
annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
736
750
|
inputSchema: {
|
|
737
|
-
url: z.string().url(),
|
|
738
|
-
filter_external: z.boolean().optional().default(false),
|
|
739
|
-
base_url: z.string().url().optional()
|
|
751
|
+
url: z.string().url().describe("The URL to extract links from"),
|
|
752
|
+
filter_external: z.boolean().optional().default(false).describe("Only return external links"),
|
|
753
|
+
base_url: z.string().url().optional().describe("Base URL for resolving relative links")
|
|
740
754
|
}
|
|
741
755
|
}, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
|
|
742
756
|
try {
|
|
@@ -817,8 +831,9 @@ server.registerTool("extract_links", {
|
|
|
817
831
|
// Tool: extract_metadata - Extract page metadata
|
|
818
832
|
server.registerTool("extract_metadata", {
|
|
819
833
|
description: "Extract metadata from a webpage (title, description, keywords, etc.)",
|
|
834
|
+
annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
820
835
|
inputSchema: {
|
|
821
|
-
url: z.string().url()
|
|
836
|
+
url: z.string().url().describe("The URL to extract metadata from")
|
|
822
837
|
}
|
|
823
838
|
}, withAuth("extract_metadata", async ({ url }) => {
|
|
824
839
|
try {
|
|
@@ -896,9 +911,10 @@ server.registerTool("extract_metadata", {
|
|
|
896
911
|
// Tool: scrape_structured - Extract structured data using CSS selectors
|
|
897
912
|
server.registerTool("scrape_structured", {
|
|
898
913
|
description: "Extract structured data from a webpage using CSS selectors",
|
|
914
|
+
annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
899
915
|
inputSchema: {
|
|
900
|
-
url: z.string().url(),
|
|
901
|
-
selectors: z.record(z.string())
|
|
916
|
+
url: z.string().url().describe("The URL to scrape"),
|
|
917
|
+
selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
|
|
902
918
|
}
|
|
903
919
|
}, withAuth("scrape_structured", async ({ url, selectors }) => {
|
|
904
920
|
try {
|
|
@@ -959,15 +975,16 @@ server.registerTool("scrape_structured", {
|
|
|
959
975
|
// Tool: search_web - Search the web using Google Search via CrawlForge proxy
|
|
960
976
|
server.registerTool("search_web", {
|
|
961
977
|
description: "Search the web using Google Search API (proxied through CrawlForge)",
|
|
978
|
+
annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
962
979
|
inputSchema: {
|
|
963
|
-
query: z.string(),
|
|
964
|
-
limit: z.number().min(1).max(100).optional(),
|
|
965
|
-
offset: z.number().min(0).optional(),
|
|
966
|
-
lang: z.string().optional(),
|
|
967
|
-
safe_search: z.boolean().optional(),
|
|
968
|
-
time_range: z.enum(["day", "week", "month", "year", "all"]).optional(),
|
|
969
|
-
site: z.string().optional(),
|
|
970
|
-
file_type: z.string().optional()
|
|
980
|
+
query: z.string().describe("Search query string"),
|
|
981
|
+
limit: z.number().min(1).max(100).optional().describe("Maximum number of results to return"),
|
|
982
|
+
offset: z.number().min(0).optional().describe("Number of results to skip for pagination"),
|
|
983
|
+
lang: z.string().optional().describe("Language code for results (e.g. 'en', 'fr')"),
|
|
984
|
+
safe_search: z.boolean().optional().describe("Enable safe search filtering"),
|
|
985
|
+
time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
|
|
986
|
+
site: z.string().optional().describe("Limit results to a specific domain"),
|
|
987
|
+
file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
|
|
971
988
|
}
|
|
972
989
|
}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
|
|
973
990
|
try {
|
|
@@ -1002,16 +1019,17 @@ server.registerTool("search_web", {
|
|
|
1002
1019
|
// Tool: crawl_deep - Deep crawl websites with BFS algorithm
|
|
1003
1020
|
server.registerTool("crawl_deep", {
|
|
1004
1021
|
description: "Crawl websites deeply using breadth-first search",
|
|
1022
|
+
annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1005
1023
|
inputSchema: {
|
|
1006
|
-
url: z.string().url(),
|
|
1007
|
-
max_depth: z.number().min(1).max(5).optional(),
|
|
1008
|
-
max_pages: z.number().min(1).max(1000).optional(),
|
|
1009
|
-
include_patterns: z.array(z.string()).optional(),
|
|
1010
|
-
exclude_patterns: z.array(z.string()).optional(),
|
|
1011
|
-
follow_external: z.boolean().optional(),
|
|
1012
|
-
respect_robots: z.boolean().optional(),
|
|
1013
|
-
extract_content: z.boolean().optional(),
|
|
1014
|
-
concurrency: z.number().min(1).max(20).optional()
|
|
1024
|
+
url: z.string().url().describe("Starting URL for the crawl"),
|
|
1025
|
+
max_depth: z.number().min(1).max(5).optional().describe("Maximum crawl depth from starting URL"),
|
|
1026
|
+
max_pages: z.number().min(1).max(1000).optional().describe("Maximum number of pages to crawl"),
|
|
1027
|
+
include_patterns: z.array(z.string()).optional().describe("URL patterns to include (regex)"),
|
|
1028
|
+
exclude_patterns: z.array(z.string()).optional().describe("URL patterns to exclude (regex)"),
|
|
1029
|
+
follow_external: z.boolean().optional().describe("Follow links to external domains"),
|
|
1030
|
+
respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
|
|
1031
|
+
extract_content: z.boolean().optional().describe("Extract page content during crawl"),
|
|
1032
|
+
concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
|
|
1015
1033
|
}
|
|
1016
1034
|
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
|
|
1017
1035
|
try {
|
|
@@ -1046,12 +1064,13 @@ server.registerTool("crawl_deep", {
|
|
|
1046
1064
|
// Tool: map_site - Discover and map website structure
|
|
1047
1065
|
server.registerTool("map_site", {
|
|
1048
1066
|
description: "Discover and map website structure",
|
|
1067
|
+
annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1049
1068
|
inputSchema: {
|
|
1050
|
-
url: z.string().url(),
|
|
1051
|
-
include_sitemap: z.boolean().optional(),
|
|
1052
|
-
max_urls: z.number().min(1).max(10000).optional(),
|
|
1053
|
-
group_by_path: z.boolean().optional(),
|
|
1054
|
-
include_metadata: z.boolean().optional()
|
|
1069
|
+
url: z.string().url().describe("The website URL to map"),
|
|
1070
|
+
include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
|
|
1071
|
+
max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
|
|
1072
|
+
group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
|
|
1073
|
+
include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
|
|
1055
1074
|
}
|
|
1056
1075
|
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
|
|
1057
1076
|
try {
|
|
@@ -1088,9 +1107,10 @@ server.registerTool("map_site", {
|
|
|
1088
1107
|
// Tool: extract_content - Enhanced content extraction with readability detection
|
|
1089
1108
|
server.registerTool("extract_content", {
|
|
1090
1109
|
description: "Extract and analyze main content from web pages with enhanced readability detection",
|
|
1110
|
+
annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1091
1111
|
inputSchema: {
|
|
1092
|
-
url: z.string().url(),
|
|
1093
|
-
options: z.object({}).optional()
|
|
1112
|
+
url: z.string().url().describe("The URL to extract content from"),
|
|
1113
|
+
options: z.object({}).optional().describe("Additional extraction options")
|
|
1094
1114
|
}
|
|
1095
1115
|
}, withAuth("extract_content", async ({ url, options }) => {
|
|
1096
1116
|
try {
|
|
@@ -1125,10 +1145,11 @@ server.registerTool("extract_content", {
|
|
|
1125
1145
|
// Tool: process_document - Multi-format document processing
|
|
1126
1146
|
server.registerTool("process_document", {
|
|
1127
1147
|
description: "Process documents from multiple sources and formats including PDFs and web pages",
|
|
1148
|
+
annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1128
1149
|
inputSchema: {
|
|
1129
|
-
source: z.string(),
|
|
1130
|
-
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
|
|
1131
|
-
options: z.object({}).optional()
|
|
1150
|
+
source: z.string().describe("Document source - URL or file path"),
|
|
1151
|
+
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
|
|
1152
|
+
options: z.object({}).optional().describe("Additional processing options")
|
|
1132
1153
|
}
|
|
1133
1154
|
}, withAuth("process_document", async ({ source, sourceType, options }) => {
|
|
1134
1155
|
try {
|
|
@@ -1163,9 +1184,10 @@ server.registerTool("process_document", {
|
|
|
1163
1184
|
// Tool: summarize_content - Intelligent content summarization
|
|
1164
1185
|
server.registerTool("summarize_content", {
|
|
1165
1186
|
description: "Generate intelligent summaries of text content with configurable options",
|
|
1187
|
+
annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
1166
1188
|
inputSchema: {
|
|
1167
|
-
text: z.string(),
|
|
1168
|
-
options: z.object({}).optional()
|
|
1189
|
+
text: z.string().describe("The text content to summarize"),
|
|
1190
|
+
options: z.object({}).optional().describe("Summarization options")
|
|
1169
1191
|
}
|
|
1170
1192
|
}, withAuth("summarize_content", async ({ text, options }) => {
|
|
1171
1193
|
try {
|
|
@@ -1200,9 +1222,10 @@ server.registerTool("summarize_content", {
|
|
|
1200
1222
|
// Tool: analyze_content - Comprehensive content analysis
|
|
1201
1223
|
server.registerTool("analyze_content", {
|
|
1202
1224
|
description: "Perform comprehensive content analysis including language detection and topic extraction",
|
|
1225
|
+
annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
1203
1226
|
inputSchema: {
|
|
1204
|
-
text: z.string(),
|
|
1205
|
-
options: z.object({}).optional()
|
|
1227
|
+
text: z.string().describe("The text content to analyze"),
|
|
1228
|
+
options: z.object({}).optional().describe("Analysis options")
|
|
1206
1229
|
}
|
|
1207
1230
|
}, withAuth("analyze_content", async ({ text, options }) => {
|
|
1208
1231
|
try {
|
|
@@ -1235,11 +1258,62 @@ server.registerTool("analyze_content", {
|
|
|
1235
1258
|
}));
|
|
1236
1259
|
|
|
1237
1260
|
|
|
1261
|
+
|
|
1262
|
+
// Phase 1: LLM-Powered Structured Extraction
|
|
1263
|
+
|
|
1264
|
+
// Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
|
|
1265
|
+
server.registerTool("extract_structured", {
|
|
1266
|
+
description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
|
|
1267
|
+
annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1268
|
+
inputSchema: {
|
|
1269
|
+
url: z.string().url().describe("The URL to extract structured data from"),
|
|
1270
|
+
schema: z.object({
|
|
1271
|
+
type: z.string().optional(),
|
|
1272
|
+
properties: z.record(z.any()),
|
|
1273
|
+
required: z.array(z.string()).optional()
|
|
1274
|
+
}).describe("JSON schema defining the data structure to extract"),
|
|
1275
|
+
prompt: z.string().optional().describe("Natural language instructions for extraction"),
|
|
1276
|
+
llmConfig: z.object({
|
|
1277
|
+
provider: z.string().optional(),
|
|
1278
|
+
apiKey: z.string().optional()
|
|
1279
|
+
}).optional().describe("LLM provider configuration for AI-powered extraction"),
|
|
1280
|
+
fallbackToSelectors: z.boolean().optional().default(true).describe("Fall back to CSS selector extraction if LLM is unavailable"),
|
|
1281
|
+
selectorHints: z.record(z.string()).optional().describe("CSS selector hints to guide extraction")
|
|
1282
|
+
}
|
|
1283
|
+
}, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
|
|
1284
|
+
try {
|
|
1285
|
+
const result = await extractStructuredTool.execute({
|
|
1286
|
+
url,
|
|
1287
|
+
schema,
|
|
1288
|
+
prompt,
|
|
1289
|
+
llmConfig,
|
|
1290
|
+
fallbackToSelectors,
|
|
1291
|
+
selectorHints
|
|
1292
|
+
});
|
|
1293
|
+
return {
|
|
1294
|
+
content: [{
|
|
1295
|
+
type: "text",
|
|
1296
|
+
text: JSON.stringify(result, null, 2)
|
|
1297
|
+
}]
|
|
1298
|
+
};
|
|
1299
|
+
} catch (error) {
|
|
1300
|
+
return {
|
|
1301
|
+
content: [{
|
|
1302
|
+
type: "text",
|
|
1303
|
+
text: `Structured extraction failed: ${error.message}`
|
|
1304
|
+
}],
|
|
1305
|
+
isError: true
|
|
1306
|
+
};
|
|
1307
|
+
}
|
|
1308
|
+
}));
|
|
1309
|
+
|
|
1310
|
+
|
|
1238
1311
|
// Wave 2 Advanced Tools
|
|
1239
1312
|
|
|
1240
1313
|
// Tool: batch_scrape - Process multiple URLs simultaneously with job management
|
|
1241
1314
|
server.registerTool("batch_scrape", {
|
|
1242
1315
|
description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
|
|
1316
|
+
annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1243
1317
|
inputSchema: {
|
|
1244
1318
|
urls: z.array(z.union([
|
|
1245
1319
|
z.string().url(),
|
|
@@ -1250,27 +1324,27 @@ server.registerTool("batch_scrape", {
|
|
|
1250
1324
|
timeout: z.number().min(1000).max(30000).optional(),
|
|
1251
1325
|
metadata: z.record(z.any()).optional()
|
|
1252
1326
|
})
|
|
1253
|
-
])).min(1).max(50),
|
|
1254
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
|
|
1255
|
-
mode: z.enum(['sync', 'async']).default('sync'),
|
|
1327
|
+
])).min(1).max(50).describe("Array of URLs or URL objects to scrape"),
|
|
1328
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']).describe("Output formats for scraped content"),
|
|
1329
|
+
mode: z.enum(['sync', 'async']).default('sync').describe("Processing mode: sync (wait) or async (background)"),
|
|
1256
1330
|
webhook: z.object({
|
|
1257
1331
|
url: z.string().url(),
|
|
1258
1332
|
events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
|
|
1259
1333
|
headers: z.record(z.string()).optional(),
|
|
1260
1334
|
signingSecret: z.string().optional()
|
|
1261
|
-
}).optional(),
|
|
1262
|
-
extractionSchema: z.record(z.string()).optional(),
|
|
1263
|
-
maxConcurrency: z.number().min(1).max(20).default(10),
|
|
1264
|
-
delayBetweenRequests: z.number().min(0).max(10000).default(100),
|
|
1265
|
-
includeMetadata: z.boolean().default(true),
|
|
1266
|
-
includeFailed: z.boolean().default(true),
|
|
1267
|
-
pageSize: z.number().min(1).max(100).default(25),
|
|
1335
|
+
}).optional().describe("Webhook configuration for async job notifications"),
|
|
1336
|
+
extractionSchema: z.record(z.string()).optional().describe("Schema for structured data extraction from each URL"),
|
|
1337
|
+
maxConcurrency: z.number().min(1).max(20).default(10).describe("Maximum concurrent scraping requests"),
|
|
1338
|
+
delayBetweenRequests: z.number().min(0).max(10000).default(100).describe("Delay in milliseconds between requests"),
|
|
1339
|
+
includeMetadata: z.boolean().default(true).describe("Include page metadata in results"),
|
|
1340
|
+
includeFailed: z.boolean().default(true).describe("Include failed URLs in results"),
|
|
1341
|
+
pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page"),
|
|
1268
1342
|
jobOptions: z.object({
|
|
1269
1343
|
priority: z.number().default(0),
|
|
1270
1344
|
ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
|
|
1271
1345
|
maxRetries: z.number().min(0).max(5).default(1),
|
|
1272
1346
|
tags: z.array(z.string()).default([])
|
|
1273
|
-
}).optional()
|
|
1347
|
+
}).optional().describe("Job management options for async processing")
|
|
1274
1348
|
}
|
|
1275
1349
|
}, withAuth("batch_scrape", async (params) => {
|
|
1276
1350
|
try {
|
|
@@ -1295,8 +1369,9 @@ server.registerTool("batch_scrape", {
|
|
|
1295
1369
|
// Tool: scrape_with_actions - Execute action chains before scraping
|
|
1296
1370
|
server.registerTool("scrape_with_actions", {
|
|
1297
1371
|
description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
|
|
1372
|
+
annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1298
1373
|
inputSchema: {
|
|
1299
|
-
url: z.string().url(),
|
|
1374
|
+
url: z.string().url().describe("The URL to scrape"),
|
|
1300
1375
|
actions: z.array(z.object({
|
|
1301
1376
|
type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
|
|
1302
1377
|
selector: z.string().optional(),
|
|
@@ -1307,10 +1382,10 @@ server.registerTool("scrape_with_actions", {
|
|
|
1307
1382
|
description: z.string().optional(),
|
|
1308
1383
|
continueOnError: z.boolean().default(false),
|
|
1309
1384
|
retries: z.number().min(0).max(5).default(0)
|
|
1310
|
-
})).min(1).max(20),
|
|
1311
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
|
|
1312
|
-
captureIntermediateStates: z.boolean().default(false),
|
|
1313
|
-
captureScreenshots: z.boolean().default(true),
|
|
1385
|
+
})).min(1).max(20).describe("Browser actions to perform before scraping"),
|
|
1386
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
|
|
1387
|
+
captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
|
|
1388
|
+
captureScreenshots: z.boolean().default(true).describe("Take screenshots during action execution"),
|
|
1314
1389
|
formAutoFill: z.object({
|
|
1315
1390
|
fields: z.array(z.object({
|
|
1316
1391
|
selector: z.string(),
|
|
@@ -1320,23 +1395,23 @@ server.registerTool("scrape_with_actions", {
|
|
|
1320
1395
|
})),
|
|
1321
1396
|
submitSelector: z.string().optional(),
|
|
1322
1397
|
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
1323
|
-
}).optional(),
|
|
1398
|
+
}).optional().describe("Form auto-fill configuration"),
|
|
1324
1399
|
browserOptions: z.object({
|
|
1325
1400
|
headless: z.boolean().default(true),
|
|
1326
1401
|
userAgent: z.string().optional(),
|
|
1327
1402
|
viewportWidth: z.number().min(800).max(1920).default(1280),
|
|
1328
1403
|
viewportHeight: z.number().min(600).max(1080).default(720),
|
|
1329
1404
|
timeout: z.number().min(10000).max(120000).default(30000)
|
|
1330
|
-
}).optional(),
|
|
1405
|
+
}).optional().describe("Browser configuration options"),
|
|
1331
1406
|
extractionOptions: z.object({
|
|
1332
1407
|
selectors: z.record(z.string()).optional(),
|
|
1333
1408
|
includeMetadata: z.boolean().default(true),
|
|
1334
1409
|
includeLinks: z.boolean().default(true),
|
|
1335
1410
|
includeImages: z.boolean().default(true)
|
|
1336
|
-
}).optional(),
|
|
1337
|
-
continueOnActionError: z.boolean().default(false),
|
|
1338
|
-
maxRetries: z.number().min(0).max(3).default(1),
|
|
1339
|
-
screenshotOnError: z.boolean().default(true)
|
|
1411
|
+
}).optional().describe("Content extraction options"),
|
|
1412
|
+
continueOnActionError: z.boolean().default(false).describe("Continue executing actions if one fails"),
|
|
1413
|
+
maxRetries: z.number().min(0).max(3).default(1).describe("Maximum retry attempts on failure"),
|
|
1414
|
+
screenshotOnError: z.boolean().default(true).describe("Capture screenshot when an error occurs")
|
|
1340
1415
|
}
|
|
1341
1416
|
}, withAuth("scrape_with_actions", async (params) => {
|
|
1342
1417
|
try {
|
|
@@ -1361,27 +1436,28 @@ server.registerTool("scrape_with_actions", {
|
|
|
1361
1436
|
// Tool: deep_research - Comprehensive multi-stage research with source verification
|
|
1362
1437
|
server.registerTool("deep_research", {
|
|
1363
1438
|
description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
|
|
1439
|
+
annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1364
1440
|
inputSchema: {
|
|
1365
|
-
topic: z.string().min(3).max(500),
|
|
1366
|
-
maxDepth: z.number().min(1).max(10).optional().default(5),
|
|
1367
|
-
maxUrls: z.number().min(1).max(1000).optional().default(50),
|
|
1368
|
-
timeLimit: z.number().min(30000).max(300000).optional().default(120000),
|
|
1369
|
-
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
|
|
1370
|
-
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
|
|
1371
|
-
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
|
|
1372
|
-
includeRecentOnly: z.boolean().optional().default(false),
|
|
1373
|
-
enableConflictDetection: z.boolean().optional().default(true),
|
|
1374
|
-
enableSourceVerification: z.boolean().optional().default(true),
|
|
1375
|
-
enableSynthesis: z.boolean().optional().default(true),
|
|
1376
|
-
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
|
|
1377
|
-
includeRawData: z.boolean().optional().default(false),
|
|
1378
|
-
includeActivityLog: z.boolean().optional().default(false),
|
|
1441
|
+
topic: z.string().min(3).max(500).describe("Research topic or question"),
|
|
1442
|
+
maxDepth: z.number().min(1).max(10).optional().default(5).describe("Maximum research depth"),
|
|
1443
|
+
maxUrls: z.number().min(1).max(1000).optional().default(50).describe("Maximum URLs to analyze"),
|
|
1444
|
+
timeLimit: z.number().min(30000).max(300000).optional().default(120000).describe("Time limit in milliseconds for the research"),
|
|
1445
|
+
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad').describe("Research methodology approach"),
|
|
1446
|
+
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']).describe("Types of sources to include"),
|
|
1447
|
+
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3).describe("Minimum credibility score for sources (0-1)"),
|
|
1448
|
+
includeRecentOnly: z.boolean().optional().default(false).describe("Only include recent sources"),
|
|
1449
|
+
enableConflictDetection: z.boolean().optional().default(true).describe("Detect conflicting information across sources"),
|
|
1450
|
+
enableSourceVerification: z.boolean().optional().default(true).describe("Verify source credibility"),
|
|
1451
|
+
enableSynthesis: z.boolean().optional().default(true).describe("Synthesize findings into a coherent report"),
|
|
1452
|
+
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive').describe("Output format for the research report"),
|
|
1453
|
+
includeRawData: z.boolean().optional().default(false).describe("Include raw scraped data in output"),
|
|
1454
|
+
includeActivityLog: z.boolean().optional().default(false).describe("Include detailed activity log"),
|
|
1379
1455
|
queryExpansion: z.object({
|
|
1380
1456
|
enableSynonyms: z.boolean().optional().default(true),
|
|
1381
1457
|
enableSpellCheck: z.boolean().optional().default(true),
|
|
1382
1458
|
enableContextual: z.boolean().optional().default(true),
|
|
1383
1459
|
maxVariations: z.number().min(1).max(20).optional().default(8)
|
|
1384
|
-
}).optional(),
|
|
1460
|
+
}).optional().describe("Query expansion settings for broader search coverage"),
|
|
1385
1461
|
llmConfig: z.object({
|
|
1386
1462
|
provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
|
|
1387
1463
|
openai: z.object({
|
|
@@ -1395,14 +1471,14 @@ server.registerTool("deep_research", {
|
|
|
1395
1471
|
}).optional(),
|
|
1396
1472
|
enableSemanticAnalysis: z.boolean().optional().default(true),
|
|
1397
1473
|
enableIntelligentSynthesis: z.boolean().optional().default(true)
|
|
1398
|
-
}).optional(),
|
|
1399
|
-
concurrency: z.number().min(1).max(20).optional().default(5),
|
|
1400
|
-
cacheResults: z.boolean().optional().default(true),
|
|
1474
|
+
}).optional().describe("LLM provider configuration for AI-powered analysis"),
|
|
1475
|
+
concurrency: z.number().min(1).max(20).optional().default(5).describe("Number of concurrent research requests"),
|
|
1476
|
+
cacheResults: z.boolean().optional().default(true).describe("Cache research results for reuse"),
|
|
1401
1477
|
webhook: z.object({
|
|
1402
1478
|
url: z.string().url(),
|
|
1403
1479
|
events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
|
|
1404
1480
|
headers: z.record(z.string()).optional()
|
|
1405
|
-
}).optional()
|
|
1481
|
+
}).optional().describe("Webhook for progress and completion notifications")
|
|
1406
1482
|
}
|
|
1407
1483
|
}, withAuth("deep_research", async (params) => {
|
|
1408
1484
|
try {
|
|
@@ -1427,13 +1503,14 @@ server.registerTool("deep_research", {
|
|
|
1427
1503
|
// Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
|
|
1428
1504
|
server.registerTool("track_changes", {
|
|
1429
1505
|
description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
|
|
1506
|
+
annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1430
1507
|
inputSchema: {
|
|
1431
|
-
url: z.string().url(),
|
|
1508
|
+
url: z.string().url().describe("The URL to track changes for"),
|
|
1432
1509
|
operation: z.enum([
|
|
1433
|
-
'create_baseline',
|
|
1434
|
-
'compare',
|
|
1435
|
-
'monitor',
|
|
1436
|
-
'get_history',
|
|
1510
|
+
'create_baseline',
|
|
1511
|
+
'compare',
|
|
1512
|
+
'monitor',
|
|
1513
|
+
'get_history',
|
|
1437
1514
|
'get_stats',
|
|
1438
1515
|
'create_scheduled_monitor',
|
|
1439
1516
|
'stop_scheduled_monitor',
|
|
@@ -1442,9 +1519,9 @@ server.registerTool("track_changes", {
|
|
|
1442
1519
|
'create_alert_rule',
|
|
1443
1520
|
'generate_trend_report',
|
|
1444
1521
|
'get_monitoring_templates'
|
|
1445
|
-
]).default('compare'),
|
|
1446
|
-
content: z.string().optional(),
|
|
1447
|
-
html: z.string().optional(),
|
|
1522
|
+
]).default('compare').describe("Tracking operation to perform"),
|
|
1523
|
+
content: z.string().optional().describe("Content to compare against baseline"),
|
|
1524
|
+
html: z.string().optional().describe("HTML content to compare against baseline"),
|
|
1448
1525
|
trackingOptions: z.object({
|
|
1449
1526
|
granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
|
|
1450
1527
|
trackText: z.boolean().default(true),
|
|
@@ -1461,7 +1538,7 @@ server.registerTool("track_changes", {
|
|
|
1461
1538
|
moderate: z.number().min(0).max(1).default(0.3),
|
|
1462
1539
|
major: z.number().min(0).max(1).default(0.7)
|
|
1463
1540
|
}).optional()
|
|
1464
|
-
}).optional(),
|
|
1541
|
+
}).optional().describe("Options for how changes are tracked and compared"),
|
|
1465
1542
|
monitoringOptions: z.object({
|
|
1466
1543
|
enabled: z.boolean().default(false),
|
|
1467
1544
|
interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
|
|
@@ -1471,14 +1548,14 @@ server.registerTool("track_changes", {
|
|
|
1471
1548
|
enableWebhook: z.boolean().default(false),
|
|
1472
1549
|
webhookUrl: z.string().url().optional(),
|
|
1473
1550
|
webhookSecret: z.string().optional()
|
|
1474
|
-
}).optional(),
|
|
1551
|
+
}).optional().describe("Monitoring schedule and notification settings"),
|
|
1475
1552
|
storageOptions: z.object({
|
|
1476
1553
|
enableSnapshots: z.boolean().default(true),
|
|
1477
1554
|
retainHistory: z.boolean().default(true),
|
|
1478
1555
|
maxHistoryEntries: z.number().min(1).max(1000).default(100),
|
|
1479
1556
|
compressionEnabled: z.boolean().default(true),
|
|
1480
1557
|
deltaStorageEnabled: z.boolean().default(true)
|
|
1481
|
-
}).optional(),
|
|
1558
|
+
}).optional().describe("Storage and history retention settings"),
|
|
1482
1559
|
queryOptions: z.object({
|
|
1483
1560
|
limit: z.number().min(1).max(500).default(50),
|
|
1484
1561
|
offset: z.number().min(0).default(0),
|
|
@@ -1486,7 +1563,7 @@ server.registerTool("track_changes", {
|
|
|
1486
1563
|
endTime: z.number().optional(),
|
|
1487
1564
|
includeContent: z.boolean().default(false),
|
|
1488
1565
|
significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
|
|
1489
|
-
}).optional(),
|
|
1566
|
+
}).optional().describe("Query options for history and stats retrieval"),
|
|
1490
1567
|
notificationOptions: z.object({
|
|
1491
1568
|
webhook: z.object({
|
|
1492
1569
|
enabled: z.boolean().default(false),
|
|
@@ -1502,32 +1579,32 @@ server.registerTool("track_changes", {
|
|
|
1502
1579
|
channel: z.string().optional(),
|
|
1503
1580
|
username: z.string().optional()
|
|
1504
1581
|
}).optional()
|
|
1505
|
-
}).optional(),
|
|
1582
|
+
}).optional().describe("Notification configuration for webhooks and Slack"),
|
|
1506
1583
|
// Enhanced Phase 2.4 options
|
|
1507
1584
|
scheduledMonitorOptions: z.object({
|
|
1508
1585
|
schedule: z.string().optional(), // Cron expression
|
|
1509
1586
|
templateId: z.string().optional(), // Monitoring template ID
|
|
1510
1587
|
enabled: z.boolean().default(true)
|
|
1511
|
-
}).optional(),
|
|
1588
|
+
}).optional().describe("Scheduled monitoring options with cron expressions"),
|
|
1512
1589
|
alertRuleOptions: z.object({
|
|
1513
1590
|
ruleId: z.string().optional(),
|
|
1514
1591
|
condition: z.string().optional(), // Condition description
|
|
1515
1592
|
actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
|
|
1516
1593
|
throttle: z.number().min(0).optional(),
|
|
1517
1594
|
priority: z.enum(['low', 'medium', 'high']).optional()
|
|
1518
|
-
}).optional(),
|
|
1595
|
+
}).optional().describe("Alert rule configuration for change notifications"),
|
|
1519
1596
|
exportOptions: z.object({
|
|
1520
1597
|
format: z.enum(['json', 'csv']).default('json'),
|
|
1521
1598
|
startTime: z.number().optional(),
|
|
1522
1599
|
endTime: z.number().optional(),
|
|
1523
1600
|
includeContent: z.boolean().default(false),
|
|
1524
1601
|
includeSnapshots: z.boolean().default(false)
|
|
1525
|
-
}).optional(),
|
|
1602
|
+
}).optional().describe("Export options for change history data"),
|
|
1526
1603
|
dashboardOptions: z.object({
|
|
1527
1604
|
includeRecentAlerts: z.boolean().default(true),
|
|
1528
1605
|
includeTrends: z.boolean().default(true),
|
|
1529
1606
|
includeMonitorStatus: z.boolean().default(true)
|
|
1530
|
-
}).optional()
|
|
1607
|
+
}).optional().describe("Dashboard display options")
|
|
1531
1608
|
}
|
|
1532
1609
|
}, withAuth("track_changes", async (params) => {
|
|
1533
1610
|
try {
|
|
@@ -1552,8 +1629,9 @@ server.registerTool("track_changes", {
|
|
|
1552
1629
|
// Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
|
|
1553
1630
|
server.registerTool("generate_llms_txt", {
|
|
1554
1631
|
description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
|
|
1632
|
+
annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
1555
1633
|
inputSchema: {
|
|
1556
|
-
url: z.string().url(),
|
|
1634
|
+
url: z.string().url().describe("The website URL to generate llms.txt for"),
|
|
1557
1635
|
analysisOptions: z.object({
|
|
1558
1636
|
maxDepth: z.number().min(1).max(5).optional().default(3),
|
|
1559
1637
|
maxPages: z.number().min(10).max(500).optional().default(100),
|
|
@@ -1561,7 +1639,7 @@ server.registerTool("generate_llms_txt", {
|
|
|
1561
1639
|
analyzeContent: z.boolean().optional().default(true),
|
|
1562
1640
|
checkSecurity: z.boolean().optional().default(true),
|
|
1563
1641
|
respectRobots: z.boolean().optional().default(true)
|
|
1564
|
-
}).optional(),
|
|
1642
|
+
}).optional().describe("Website analysis options for depth, scope, and detection"),
|
|
1565
1643
|
outputOptions: z.object({
|
|
1566
1644
|
includeDetailed: z.boolean().optional().default(true),
|
|
1567
1645
|
includeAnalysis: z.boolean().optional().default(false),
|
|
@@ -1569,9 +1647,9 @@ server.registerTool("generate_llms_txt", {
|
|
|
1569
1647
|
organizationName: z.string().optional(),
|
|
1570
1648
|
customGuidelines: z.array(z.string()).optional(),
|
|
1571
1649
|
customRestrictions: z.array(z.string()).optional()
|
|
1572
|
-
}).optional(),
|
|
1573
|
-
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
|
|
1574
|
-
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
|
|
1650
|
+
}).optional().describe("Output customization and organization details"),
|
|
1651
|
+
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe("Compliance level for generated guidelines"),
|
|
1652
|
+
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe("Output format: llms.txt, llms-full.txt, or both")
|
|
1575
1653
|
}
|
|
1576
1654
|
}, withAuth("generate_llms_txt", async (params) => {
|
|
1577
1655
|
try {
|
|
@@ -1596,8 +1674,9 @@ server.registerTool("generate_llms_txt", {
|
|
|
1596
1674
|
// Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
|
|
1597
1675
|
server.registerTool("stealth_mode", {
|
|
1598
1676
|
description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
|
|
1677
|
+
annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1599
1678
|
inputSchema: {
|
|
1600
|
-
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
|
|
1679
|
+
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
|
|
1601
1680
|
stealthConfig: z.object({
|
|
1602
1681
|
level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
|
|
1603
1682
|
randomizeFingerprint: z.boolean().default(true),
|
|
@@ -1635,9 +1714,9 @@ server.registerTool("stealth_mode", {
|
|
|
1635
1714
|
fontSpoofing: z.boolean().default(true),
|
|
1636
1715
|
hardwareSpoofing: z.boolean().default(true)
|
|
1637
1716
|
}).optional()
|
|
1638
|
-
}).optional(),
|
|
1639
|
-
contextId: z.string().optional(),
|
|
1640
|
-
urlToTest: z.string().url().optional()
|
|
1717
|
+
}).optional().describe("Stealth browser configuration with anti-detection settings"),
|
|
1718
|
+
contextId: z.string().optional().describe("Browser context ID for page operations"),
|
|
1719
|
+
urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
|
|
1641
1720
|
}
|
|
1642
1721
|
}, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
|
|
1643
1722
|
try {
|
|
@@ -1717,20 +1796,21 @@ server.registerTool("stealth_mode", {
|
|
|
1717
1796
|
// Tool: localization - Multi-language and geo-location management (Wave 3)
|
|
1718
1797
|
server.registerTool("localization", {
|
|
1719
1798
|
description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
|
|
1799
|
+
annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1720
1800
|
inputSchema: {
|
|
1721
|
-
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
|
|
1722
|
-
countryCode: z.string().length(2).optional(),
|
|
1723
|
-
language: z.string().optional(),
|
|
1724
|
-
timezone: z.string().optional(),
|
|
1725
|
-
currency: z.string().length(3).optional(),
|
|
1726
|
-
customHeaders: z.record(z.string()).optional(),
|
|
1727
|
-
userAgent: z.string().optional(),
|
|
1728
|
-
acceptLanguage: z.string().optional(),
|
|
1801
|
+
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
|
|
1802
|
+
countryCode: z.string().length(2).optional().describe("ISO 3166-1 alpha-2 country code"),
|
|
1803
|
+
language: z.string().optional().describe("Language code (e.g. 'en', 'fr', 'de')"),
|
|
1804
|
+
timezone: z.string().optional().describe("IANA timezone identifier (e.g. 'America/New_York')"),
|
|
1805
|
+
currency: z.string().length(3).optional().describe("ISO 4217 currency code (e.g. 'USD', 'EUR')"),
|
|
1806
|
+
customHeaders: z.record(z.string()).optional().describe("Custom HTTP headers for localized requests"),
|
|
1807
|
+
userAgent: z.string().optional().describe("Custom user agent string"),
|
|
1808
|
+
acceptLanguage: z.string().optional().describe("Accept-Language header value"),
|
|
1729
1809
|
geoLocation: z.object({
|
|
1730
1810
|
latitude: z.number().min(-90).max(90),
|
|
1731
1811
|
longitude: z.number().min(-180).max(180),
|
|
1732
1812
|
accuracy: z.number().min(1).max(100).optional()
|
|
1733
|
-
}).optional(),
|
|
1813
|
+
}).optional().describe("GPS coordinates for geolocation emulation"),
|
|
1734
1814
|
proxySettings: z.object({
|
|
1735
1815
|
enabled: z.boolean().default(false),
|
|
1736
1816
|
region: z.string().optional(),
|
|
@@ -1749,26 +1829,26 @@ server.registerTool("localization", {
|
|
|
1749
1829
|
maxRetries: z.number().default(3),
|
|
1750
1830
|
timeout: z.number().default(10000)
|
|
1751
1831
|
}).optional()
|
|
1752
|
-
}).optional(),
|
|
1832
|
+
}).optional().describe("Proxy configuration for geo-targeted requests"),
|
|
1753
1833
|
searchParams: z.object({
|
|
1754
1834
|
query: z.string().optional(),
|
|
1755
1835
|
limit: z.number().optional(),
|
|
1756
1836
|
offset: z.number().optional(),
|
|
1757
1837
|
headers: z.record(z.string()).optional()
|
|
1758
|
-
}).optional(),
|
|
1838
|
+
}).optional().describe("Search parameters for localized search queries"),
|
|
1759
1839
|
browserOptions: z.object({
|
|
1760
1840
|
locale: z.string().optional(),
|
|
1761
1841
|
timezoneId: z.string().optional(),
|
|
1762
1842
|
extraHTTPHeaders: z.record(z.string()).optional(),
|
|
1763
1843
|
userAgent: z.string().optional()
|
|
1764
|
-
}).optional(),
|
|
1765
|
-
content: z.string().optional(),
|
|
1766
|
-
url: z.string().url().optional(),
|
|
1844
|
+
}).optional().describe("Browser context options for locale emulation"),
|
|
1845
|
+
content: z.string().optional().describe("Content for auto-detection of language and locale"),
|
|
1846
|
+
url: z.string().url().optional().describe("URL for geo-blocking detection or auto-detection"),
|
|
1767
1847
|
response: z.object({
|
|
1768
1848
|
status: z.number(),
|
|
1769
1849
|
body: z.string().optional(),
|
|
1770
1850
|
statusText: z.string().optional()
|
|
1771
|
-
}).optional()
|
|
1851
|
+
}).optional().describe("HTTP response for geo-blocking analysis")
|
|
1772
1852
|
}
|
|
1773
1853
|
}, withAuth("localization", async (params) => {
|
|
1774
1854
|
try {
|
|
@@ -1850,11 +1930,92 @@ server.registerTool("localization", {
|
|
|
1850
1930
|
}
|
|
1851
1931
|
}));
|
|
1852
1932
|
|
|
1853
|
-
//
|
|
1933
|
+
// Determine transport mode: HTTP if --http flag or MCP_HTTP env var is set
|
|
1934
|
+
const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
|
|
1935
|
+
|
|
1936
|
+
// Set up transport and start the server
|
|
1854
1937
|
async function runServer() {
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1938
|
+
if (useHttp) {
|
|
1939
|
+
const port = parseInt(process.env.PORT || '3000', 10);
|
|
1940
|
+
|
|
1941
|
+
// Stateless transport — no session tracking, each request is independent
|
|
1942
|
+
// This avoids the bug where server.connect(newTransport) kills previous sessions
|
|
1943
|
+
const transport = new StreamableHTTPServerTransport({
|
|
1944
|
+
sessionIdGenerator: undefined,
|
|
1945
|
+
});
|
|
1946
|
+
await server.connect(transport);
|
|
1947
|
+
|
|
1948
|
+
const httpServer = createServer(async (req, res) => {
|
|
1949
|
+
// CORS headers for Smithery gateway
|
|
1950
|
+
res.setHeader('Access-Control-Allow-Origin', '*');
|
|
1951
|
+
res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
|
|
1952
|
+
res.setHeader('Access-Control-Allow-Headers', 'Content-Type, mcp-session-id');
|
|
1953
|
+
res.setHeader('Access-Control-Expose-Headers', 'mcp-session-id');
|
|
1954
|
+
|
|
1955
|
+
if (req.method === 'OPTIONS') {
|
|
1956
|
+
res.writeHead(204);
|
|
1957
|
+
res.end();
|
|
1958
|
+
return;
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1961
|
+
// Health check endpoint
|
|
1962
|
+
if (req.url === '/health') {
|
|
1963
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1964
|
+
res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
|
|
1965
|
+
return;
|
|
1966
|
+
}
|
|
1967
|
+
|
|
1968
|
+
// MCP server card for Smithery discovery
|
|
1969
|
+
if (req.url === '/.well-known/mcp/server-card.json') {
|
|
1970
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1971
|
+
res.end(JSON.stringify({
|
|
1972
|
+
serverInfo: {
|
|
1973
|
+
name: "crawlforge",
|
|
1974
|
+
version: "3.0.12",
|
|
1975
|
+
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
1976
|
+
homepage: "https://www.crawlforge.dev",
|
|
1977
|
+
icon: "https://www.crawlforge.dev/icon.png"
|
|
1978
|
+
},
|
|
1979
|
+
transport: {
|
|
1980
|
+
type: "streamable-http",
|
|
1981
|
+
url: "/mcp"
|
|
1982
|
+
},
|
|
1983
|
+
configSchema: {
|
|
1984
|
+
type: "object",
|
|
1985
|
+
properties: {
|
|
1986
|
+
apiKey: {
|
|
1987
|
+
type: "string",
|
|
1988
|
+
title: "CrawlForge API Key",
|
|
1989
|
+
description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
|
|
1990
|
+
"x-from": { header: "x-api-key" }
|
|
1991
|
+
}
|
|
1992
|
+
},
|
|
1993
|
+
required: ["apiKey"]
|
|
1994
|
+
}
|
|
1995
|
+
}));
|
|
1996
|
+
return;
|
|
1997
|
+
}
|
|
1998
|
+
|
|
1999
|
+
// Route /mcp to the transport handler
|
|
2000
|
+
if (req.url === '/mcp' || req.url === '/') {
|
|
2001
|
+
await transport.handleRequest(req, res);
|
|
2002
|
+
return;
|
|
2003
|
+
}
|
|
2004
|
+
|
|
2005
|
+
res.writeHead(404);
|
|
2006
|
+
res.end('Not Found');
|
|
2007
|
+
});
|
|
2008
|
+
|
|
2009
|
+
httpServer.listen(port, () => {
|
|
2010
|
+
console.error(`CrawlForge MCP Server v3.0 running on HTTP port ${port}`);
|
|
2011
|
+
console.error(`MCP endpoint: http://localhost:${port}/mcp`);
|
|
2012
|
+
console.error(`Health check: http://localhost:${port}/health`);
|
|
2013
|
+
});
|
|
2014
|
+
} else {
|
|
2015
|
+
const transport = new StdioServerTransport();
|
|
2016
|
+
await server.connect(transport);
|
|
2017
|
+
console.error("CrawlForge MCP Server v3.0 running on stdio");
|
|
2018
|
+
}
|
|
1858
2019
|
console.error(`Environment: ${config.server.nodeEnv}`);
|
|
1859
2020
|
|
|
1860
2021
|
console.error("Search enabled: true (via CrawlForge proxy)");
|
|
@@ -1867,7 +2028,8 @@ async function runServer() {
|
|
|
1867
2028
|
const trackingTools = ", track_changes";
|
|
1868
2029
|
const llmsTxtTools = ", generate_llms_txt";
|
|
1869
2030
|
const wave3Tools = ", stealth_mode, localization";
|
|
1870
|
-
|
|
2031
|
+
const phase1Tools = ", extract_structured";
|
|
2032
|
+
console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
|
|
1871
2033
|
|
|
1872
2034
|
|
|
1873
2035
|
// === MEMORY LEAK PREVENTION ===
|
|
@@ -1893,7 +2055,8 @@ async function gracefulShutdown(signal) {
|
|
|
1893
2055
|
trackChangesTool,
|
|
1894
2056
|
generateLLMsTxtTool,
|
|
1895
2057
|
stealthBrowserManager,
|
|
1896
|
-
localizationManager
|
|
2058
|
+
localizationManager,
|
|
2059
|
+
extractStructuredTool
|
|
1897
2060
|
].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
|
|
1898
2061
|
|
|
1899
2062
|
console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
|