crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
package/server.js
ADDED
|
@@ -0,0 +1,1963 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
4
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
import { load } from "cheerio";
|
|
7
|
+
import { SearchWebTool } from "./src/tools/search/searchWeb.js";
|
|
8
|
+
import { CrawlDeepTool } from "./src/tools/crawl/crawlDeep.js";
|
|
9
|
+
import { MapSiteTool } from "./src/tools/crawl/mapSite.js";
|
|
10
|
+
import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
|
|
11
|
+
import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
|
|
12
|
+
import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
|
|
13
|
+
import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
|
|
14
|
+
// Wave 2 Advanced Tools
|
|
15
|
+
import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
|
|
16
|
+
import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
|
|
17
|
+
// Deep Research Tool
|
|
18
|
+
import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
|
|
19
|
+
// Change Tracking Tool - commented out due to import issue
|
|
20
|
+
// import { TrackChangesTool } from "./src/tools/tracking/trackChanges.js";
|
|
21
|
+
// LLMs.txt Generator Tool (Phase 2.5)
|
|
22
|
+
import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
|
|
23
|
+
// Wave 3-4 Core Managers
|
|
24
|
+
import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
|
|
25
|
+
import { LocalizationManager } from "./src/core/LocalizationManager.js";
|
|
26
|
+
import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
|
|
27
|
+
import { config, validateConfig, isSearchConfigured, getToolConfig, getActiveSearchProvider } from "./src/constants/config.js";
|
|
28
|
+
// Authentication Manager
|
|
29
|
+
import AuthManager from "./src/core/AuthManager.js";
|
|
30
|
+
|
|
31
|
+
// Enable creator mode if BYPASS_API_KEY is set
|
|
32
|
+
if (process.env.BYPASS_API_KEY === 'true') {
|
|
33
|
+
process.env.CRAWLFORGE_CREATOR_MODE = 'true';
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Initialize Authentication Manager
|
|
37
|
+
await AuthManager.initialize();
|
|
38
|
+
|
|
39
|
+
// Check if first-time setup is needed (skipped in creator mode).
//
// All human-readable output in this block is written to stderr. This file
// imports StdioServerTransport, and on a stdio transport stdout is reserved
// for protocol messages, so banners and progress text printed there could be
// read by the client as malformed protocol data.
if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) {
  const apiKey = process.env.CRAWLFORGE_API_KEY;

  if (apiKey) {
    // An API key was supplied through the environment: configure automatically.
    console.error('🔧 Auto-configuring CrawlForge with provided API key...');
    const success = await AuthManager.runSetup(apiKey);
    if (!success) {
      console.error('❌ Failed to authenticate with provided API key');
      console.error('Please check your API key or run: npm run setup');
      process.exit(1);
    }
  } else {
    // No credentials at all: explain how to set up, then stop.
    const setupBanner = [
      '',
      '╔═══════════════════════════════════════════════════════╗',
      '║ CrawlForge MCP Server - Setup Required ║',
      '╚═══════════════════════════════════════════════════════╝',
      '',
      'Welcome! This appears to be your first time using CrawlForge.',
      '',
      'To get started, please run:',
      ' npm run setup',
      '',
      'Or set your API key via environment variable:',
      ' export CRAWLFORGE_API_KEY="your_api_key_here"',
      '',
      'Get your free API key at: https://crawlforge.com/signup',
      '(Includes 1,000 free credits!)',
      '',
    ];
    for (const line of setupBanner) {
      console.error(line);
    }
    // Exit status 0 is kept from the original behavior.
    process.exit(0);
  }
}
|
|
71
|
+
|
|
72
|
+
// Validate configuration
|
|
73
|
+
// Collect configuration problems once at startup.
const configErrors = validateConfig();
if (configErrors.length > 0) {
  if (config.server.nodeEnv === 'production') {
    // In production a bad configuration is fatal.
    console.error('Configuration errors:', configErrors);
    process.exit(1);
  }
  // Outside production the server still starts, but the problems are reported
  // on stderr instead of being silently discarded.
  console.error('Configuration warnings (ignored outside production):', configErrors);
}
|
|
78
|
+
|
|
79
|
+
// Create the server
|
|
80
|
+
const server = new McpServer({ name: "crawlforge", version: "3.0.0" });
|
|
81
|
+
|
|
82
|
+
// Helper function to wrap tool handlers with authentication and credit tracking
|
|
83
|
+
/**
 * Wrap a tool handler with credit checking and usage reporting.
 *
 * Behavior of the returned handler:
 * - In creator mode the wrapped handler is called directly; no credit check
 *   and no usage report is made.
 * - Otherwise the tool's credit cost is checked first. If credits are
 *   insufficient, an "Insufficient credits" payload is returned and the
 *   handler is not run.
 * - If the handler succeeds, usage is reported once with status 200 and the
 *   full cost. A failure of that report propagates to the caller, but is not
 *   reported a second time as a tool error.
 * - If the handler throws, usage is reported with status 500 at half cost
 *   (minimum 1 credit) and the handler's own error is rethrown. A failure
 *   while reporting is logged to stderr and never replaces the tool error.
 * - A failure of the credit check itself propagates without any usage report,
 *   because the tool never ran.
 *
 * @param {string} toolName - Tool identifier passed to AuthManager for cost
 *   lookup and usage reporting.
 * @param {(params: object) => Promise<object>} handler - The tool implementation.
 * @returns {(params: object) => Promise<object>} The wrapped handler.
 */
function withAuth(toolName, handler) {
  return async (params) => {
    // Creator mode bypasses both the credit check and usage reporting.
    if (AuthManager.isCreatorMode()) {
      return handler(params);
    }

    const startTime = Date.now();
    const creditCost = AuthManager.getToolCost(toolName);

    // Check credits before executing.
    const hasCredits = await AuthManager.checkCredits(creditCost);
    if (!hasCredits) {
      return {
        content: [{
          type: "text",
          text: JSON.stringify({
            error: "Insufficient credits",
            message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://crawlforge.com/pricing`,
            creditsRequired: creditCost
          }, null, 2)
        }]
      };
    }

    let result;
    try {
      result = await handler(params);
    } catch (error) {
      // Failed executions are billed at half the credits, never less than 1.
      const errorCost = Math.max(1, Math.floor(creditCost * 0.5));
      try {
        await AuthManager.reportUsage(
          toolName,
          errorCost,
          params,
          500,
          Date.now() - startTime
        );
      } catch (reportError) {
        // Keep the tool's own error as the one the caller sees.
        console.error(`Failed to report usage for ${toolName}:`, reportError);
      }
      throw error;
    }

    // Reported outside the try block so a reporting failure is not mistaken
    // for a tool failure and billed again at the error rate.
    await AuthManager.reportUsage(
      toolName,
      creditCost,
      params,
      200,
      Date.now() - startTime
    );

    return result;
  };
}
|
|
142
|
+
|
|
143
|
+
// Initialize tools
|
|
144
|
+
// The search tool is only constructed when search is configured; it stays
// null otherwise.
let searchWebTool = isSearchConfigured()
  ? new SearchWebTool(getToolConfig('search_web'))
  : null;

// Crawling tools take their per-tool configuration.
const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep'));
const mapSiteTool = new MapSiteTool(getToolConfig('map_site'));

// Phase 3 tools: content extraction and analysis.
const extractContentTool = new ExtractContentTool();
const processDocumentTool = new ProcessDocumentTool();
const summarizeContentTool = new SummarizeContentTool();
const analyzeContentTool = new AnalyzeContentTool();

// Wave 2 advanced tools.
const batchScrapeTool = new BatchScrapeTool();
const scrapeWithActionsTool = new ScrapeWithActionsTool();

// Deep research tool.
const deepResearchTool = new DeepResearchTool();

// Change tracking tool is temporarily disabled due to an import issue.
// const trackChangesTool = new TrackChangesTool();

// LLMs.txt generator tool (Phase 2.5).
const generateLLMsTxtTool = new GenerateLLMsTxtTool();

// Wave 3-4 core managers.
const stealthBrowserManager = new StealthBrowserManager();
const localizationManager = new LocalizationManager();
|
|
173
|
+
|
|
174
|
+
// Zod schemas for tool parameters and responses
|
|
175
|
+
const FetchUrlSchema = z.object({
|
|
176
|
+
url: z.string().url(),
|
|
177
|
+
headers: z.record(z.string()).optional(),
|
|
178
|
+
timeout: z.number().min(1000).max(30000).optional().default(10000)
|
|
179
|
+
});
|
|
180
|
+
|
|
181
|
+
const ExtractTextSchema = z.object({
|
|
182
|
+
url: z.string().url(),
|
|
183
|
+
remove_scripts: z.boolean().optional().default(true),
|
|
184
|
+
remove_styles: z.boolean().optional().default(true)
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
const ExtractLinksSchema = z.object({
|
|
188
|
+
url: z.string().url(),
|
|
189
|
+
filter_external: z.boolean().optional().default(false),
|
|
190
|
+
base_url: z.string().url().optional()
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
const ExtractMetadataSchema = z.object({
|
|
194
|
+
url: z.string().url()
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
const ScrapeStructuredSchema = z.object({
|
|
198
|
+
url: z.string().url(),
|
|
199
|
+
selectors: z.record(z.string())
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
const SearchWebSchema = z.object({
|
|
203
|
+
query: z.string(),
|
|
204
|
+
limit: z.number().min(1).max(100).optional(),
|
|
205
|
+
offset: z.number().min(0).optional(),
|
|
206
|
+
lang: z.string().optional(),
|
|
207
|
+
safe_search: z.boolean().optional(),
|
|
208
|
+
time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
|
|
209
|
+
site: z.string().optional(),
|
|
210
|
+
file_type: z.string().optional()
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
const CrawlDeepSchema = z.object({
|
|
214
|
+
url: z.string().url(),
|
|
215
|
+
max_depth: z.number().min(1).max(5).optional(),
|
|
216
|
+
max_pages: z.number().min(1).max(1000).optional(),
|
|
217
|
+
include_patterns: z.array(z.string()).optional(),
|
|
218
|
+
exclude_patterns: z.array(z.string()).optional(),
|
|
219
|
+
follow_external: z.boolean().optional(),
|
|
220
|
+
respect_robots: z.boolean().optional(),
|
|
221
|
+
extract_content: z.boolean().optional(),
|
|
222
|
+
concurrency: z.number().min(1).max(20).optional()
|
|
223
|
+
});
|
|
224
|
+
|
|
225
|
+
const MapSiteSchema = z.object({
|
|
226
|
+
url: z.string().url(),
|
|
227
|
+
include_sitemap: z.boolean().optional(),
|
|
228
|
+
max_urls: z.number().min(1).max(10000).optional(),
|
|
229
|
+
group_by_path: z.boolean().optional(),
|
|
230
|
+
include_metadata: z.boolean().optional()
|
|
231
|
+
});
|
|
232
|
+
|
|
233
|
+
const ExtractContentSchema = z.object({
|
|
234
|
+
url: z.string().url(),
|
|
235
|
+
options: z.object({}).optional()
|
|
236
|
+
});
|
|
237
|
+
|
|
238
|
+
const ProcessDocumentSchema = z.object({
|
|
239
|
+
source: z.string(),
|
|
240
|
+
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
|
|
241
|
+
options: z.object({}).optional()
|
|
242
|
+
});
|
|
243
|
+
|
|
244
|
+
const SummarizeContentSchema = z.object({
|
|
245
|
+
text: z.string(),
|
|
246
|
+
options: z.object({}).optional()
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
const AnalyzeContentSchema = z.object({
|
|
250
|
+
text: z.string(),
|
|
251
|
+
options: z.object({}).optional()
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
// Wave 2 Advanced Tools Schemas
|
|
255
|
+
const BatchScrapeSchema = z.object({
|
|
256
|
+
urls: z.array(z.union([
|
|
257
|
+
z.string().url(),
|
|
258
|
+
z.object({
|
|
259
|
+
url: z.string().url(),
|
|
260
|
+
selectors: z.record(z.string()).optional(),
|
|
261
|
+
headers: z.record(z.string()).optional(),
|
|
262
|
+
timeout: z.number().min(1000).max(30000).optional(),
|
|
263
|
+
metadata: z.record(z.any()).optional()
|
|
264
|
+
})
|
|
265
|
+
])).min(1).max(50),
|
|
266
|
+
|
|
267
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
|
|
268
|
+
mode: z.enum(['sync', 'async']).default('sync'),
|
|
269
|
+
|
|
270
|
+
webhook: z.object({
|
|
271
|
+
url: z.string().url(),
|
|
272
|
+
events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
|
|
273
|
+
headers: z.record(z.string()).optional(),
|
|
274
|
+
signingSecret: z.string().optional()
|
|
275
|
+
}).optional(),
|
|
276
|
+
|
|
277
|
+
extractionSchema: z.record(z.string()).optional(),
|
|
278
|
+
maxConcurrency: z.number().min(1).max(20).default(10),
|
|
279
|
+
delayBetweenRequests: z.number().min(0).max(10000).default(100),
|
|
280
|
+
includeMetadata: z.boolean().default(true),
|
|
281
|
+
includeFailed: z.boolean().default(true),
|
|
282
|
+
pageSize: z.number().min(1).max(100).default(25),
|
|
283
|
+
|
|
284
|
+
jobOptions: z.object({
|
|
285
|
+
priority: z.number().default(0),
|
|
286
|
+
ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
|
|
287
|
+
maxRetries: z.number().min(0).max(5).default(1),
|
|
288
|
+
tags: z.array(z.string()).default([])
|
|
289
|
+
}).optional()
|
|
290
|
+
});
|
|
291
|
+
|
|
292
|
+
const ScrapeWithActionsSchema = z.object({
|
|
293
|
+
url: z.string().url(),
|
|
294
|
+
actions: z.array(z.object({
|
|
295
|
+
type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
|
|
296
|
+
selector: z.string().optional(),
|
|
297
|
+
text: z.string().optional(),
|
|
298
|
+
key: z.string().optional(),
|
|
299
|
+
script: z.string().optional(),
|
|
300
|
+
timeout: z.number().optional(),
|
|
301
|
+
description: z.string().optional(),
|
|
302
|
+
continueOnError: z.boolean().default(false),
|
|
303
|
+
retries: z.number().min(0).max(5).default(0)
|
|
304
|
+
})).min(1).max(20),
|
|
305
|
+
|
|
306
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
|
|
307
|
+
captureIntermediateStates: z.boolean().default(false),
|
|
308
|
+
captureScreenshots: z.boolean().default(true),
|
|
309
|
+
|
|
310
|
+
formAutoFill: z.object({
|
|
311
|
+
fields: z.array(z.object({
|
|
312
|
+
selector: z.string(),
|
|
313
|
+
value: z.string(),
|
|
314
|
+
type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
|
|
315
|
+
waitAfter: z.number().min(0).max(5000).default(100)
|
|
316
|
+
})),
|
|
317
|
+
submitSelector: z.string().optional(),
|
|
318
|
+
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
319
|
+
}).optional(),
|
|
320
|
+
|
|
321
|
+
browserOptions: z.object({
|
|
322
|
+
headless: z.boolean().default(true),
|
|
323
|
+
userAgent: z.string().optional(),
|
|
324
|
+
viewportWidth: z.number().min(800).max(1920).default(1280),
|
|
325
|
+
viewportHeight: z.number().min(600).max(1080).default(720),
|
|
326
|
+
timeout: z.number().min(10000).max(120000).default(30000)
|
|
327
|
+
}).optional(),
|
|
328
|
+
|
|
329
|
+
extractionOptions: z.object({
|
|
330
|
+
selectors: z.record(z.string()).optional(),
|
|
331
|
+
includeMetadata: z.boolean().default(true),
|
|
332
|
+
includeLinks: z.boolean().default(true),
|
|
333
|
+
includeImages: z.boolean().default(true)
|
|
334
|
+
}).optional(),
|
|
335
|
+
|
|
336
|
+
continueOnActionError: z.boolean().default(false),
|
|
337
|
+
maxRetries: z.number().min(0).max(3).default(1),
|
|
338
|
+
screenshotOnError: z.boolean().default(true)
|
|
339
|
+
});
|
|
340
|
+
|
|
341
|
+
// Deep Research Tool Schema
|
|
342
|
+
const DeepResearchSchema = z.object({
|
|
343
|
+
topic: z.string().min(3).max(500),
|
|
344
|
+
maxDepth: z.number().min(1).max(10).optional().default(5),
|
|
345
|
+
maxUrls: z.number().min(1).max(1000).optional().default(50),
|
|
346
|
+
timeLimit: z.number().min(30000).max(300000).optional().default(120000),
|
|
347
|
+
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
|
|
348
|
+
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
|
|
349
|
+
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
|
|
350
|
+
includeRecentOnly: z.boolean().optional().default(false),
|
|
351
|
+
enableConflictDetection: z.boolean().optional().default(true),
|
|
352
|
+
enableSourceVerification: z.boolean().optional().default(true),
|
|
353
|
+
enableSynthesis: z.boolean().optional().default(true),
|
|
354
|
+
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
|
|
355
|
+
includeRawData: z.boolean().optional().default(false),
|
|
356
|
+
includeActivityLog: z.boolean().optional().default(false),
|
|
357
|
+
queryExpansion: z.object({
|
|
358
|
+
enableSynonyms: z.boolean().optional().default(true),
|
|
359
|
+
enableSpellCheck: z.boolean().optional().default(true),
|
|
360
|
+
enableContextual: z.boolean().optional().default(true),
|
|
361
|
+
maxVariations: z.number().min(1).max(20).optional().default(8)
|
|
362
|
+
}).optional(),
|
|
363
|
+
llmConfig: z.object({
|
|
364
|
+
provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
|
|
365
|
+
openai: z.object({
|
|
366
|
+
apiKey: z.string().optional(),
|
|
367
|
+
model: z.string().optional().default('gpt-3.5-turbo'),
|
|
368
|
+
embeddingModel: z.string().optional().default('text-embedding-ada-002')
|
|
369
|
+
}).optional(),
|
|
370
|
+
anthropic: z.object({
|
|
371
|
+
apiKey: z.string().optional(),
|
|
372
|
+
model: z.string().optional().default('claude-3-haiku-20240307')
|
|
373
|
+
}).optional(),
|
|
374
|
+
enableSemanticAnalysis: z.boolean().optional().default(true),
|
|
375
|
+
enableIntelligentSynthesis: z.boolean().optional().default(true)
|
|
376
|
+
}).optional(),
|
|
377
|
+
concurrency: z.number().min(1).max(20).optional().default(5),
|
|
378
|
+
cacheResults: z.boolean().optional().default(true),
|
|
379
|
+
webhook: z.object({
|
|
380
|
+
url: z.string().url(),
|
|
381
|
+
events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
|
|
382
|
+
headers: z.record(z.string()).optional()
|
|
383
|
+
}).optional()
|
|
384
|
+
});
|
|
385
|
+
|
|
386
|
+
// Change Tracking Tool Schema
|
|
387
|
+
const TrackChangesSchema = z.object({
|
|
388
|
+
url: z.string().url(),
|
|
389
|
+
operation: z.enum(['create_baseline', 'compare', 'monitor', 'get_history', 'get_stats']).default('compare'),
|
|
390
|
+
content: z.string().optional(),
|
|
391
|
+
html: z.string().optional(),
|
|
392
|
+
trackingOptions: z.object({
|
|
393
|
+
granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
|
|
394
|
+
trackText: z.boolean().default(true),
|
|
395
|
+
trackStructure: z.boolean().default(true),
|
|
396
|
+
trackAttributes: z.boolean().default(false),
|
|
397
|
+
trackImages: z.boolean().default(false),
|
|
398
|
+
trackLinks: z.boolean().default(true),
|
|
399
|
+
ignoreWhitespace: z.boolean().default(true),
|
|
400
|
+
ignoreCase: z.boolean().default(false),
|
|
401
|
+
customSelectors: z.array(z.string()).optional(),
|
|
402
|
+
excludeSelectors: z.array(z.string()).optional(),
|
|
403
|
+
significanceThresholds: z.object({
|
|
404
|
+
minor: z.number().min(0).max(1).default(0.1),
|
|
405
|
+
moderate: z.number().min(0).max(1).default(0.3),
|
|
406
|
+
major: z.number().min(0).max(1).default(0.7)
|
|
407
|
+
}).optional()
|
|
408
|
+
}).optional(),
|
|
409
|
+
monitoringOptions: z.object({
|
|
410
|
+
enabled: z.boolean().default(false),
|
|
411
|
+
interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
|
|
412
|
+
maxRetries: z.number().min(0).max(5).default(3),
|
|
413
|
+
retryDelay: z.number().min(1000).max(60000).default(5000),
|
|
414
|
+
notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
|
|
415
|
+
enableWebhook: z.boolean().default(false),
|
|
416
|
+
webhookUrl: z.string().url().optional(),
|
|
417
|
+
webhookSecret: z.string().optional()
|
|
418
|
+
}).optional(),
|
|
419
|
+
storageOptions: z.object({
|
|
420
|
+
enableSnapshots: z.boolean().default(true),
|
|
421
|
+
retainHistory: z.boolean().default(true),
|
|
422
|
+
maxHistoryEntries: z.number().min(1).max(1000).default(100),
|
|
423
|
+
compressionEnabled: z.boolean().default(true),
|
|
424
|
+
deltaStorageEnabled: z.boolean().default(true)
|
|
425
|
+
}).optional(),
|
|
426
|
+
queryOptions: z.object({
|
|
427
|
+
limit: z.number().min(1).max(500).default(50),
|
|
428
|
+
offset: z.number().min(0).default(0),
|
|
429
|
+
startTime: z.number().optional(),
|
|
430
|
+
endTime: z.number().optional(),
|
|
431
|
+
includeContent: z.boolean().default(false),
|
|
432
|
+
significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
|
|
433
|
+
}).optional(),
|
|
434
|
+
notificationOptions: z.object({
|
|
435
|
+
webhook: z.object({
|
|
436
|
+
enabled: z.boolean().default(false),
|
|
437
|
+
url: z.string().url().optional(),
|
|
438
|
+
method: z.enum(['POST', 'PUT']).default('POST'),
|
|
439
|
+
headers: z.record(z.string()).optional(),
|
|
440
|
+
signingSecret: z.string().optional(),
|
|
441
|
+
includeContent: z.boolean().default(false)
|
|
442
|
+
}).optional(),
|
|
443
|
+
slack: z.object({
|
|
444
|
+
enabled: z.boolean().default(false),
|
|
445
|
+
webhookUrl: z.string().url().optional(),
|
|
446
|
+
channel: z.string().optional(),
|
|
447
|
+
username: z.string().optional()
|
|
448
|
+
}).optional()
|
|
449
|
+
}).optional()
|
|
450
|
+
});
|
|
451
|
+
|
|
452
|
+
// LLMs.txt Generator Tool Schema (Phase 2.5)
|
|
453
|
+
const GenerateLLMsTxtSchema = z.object({
|
|
454
|
+
url: z.string().url(),
|
|
455
|
+
analysisOptions: z.object({
|
|
456
|
+
maxDepth: z.number().min(1).max(5).optional().default(3),
|
|
457
|
+
maxPages: z.number().min(10).max(500).optional().default(100),
|
|
458
|
+
detectAPIs: z.boolean().optional().default(true),
|
|
459
|
+
analyzeContent: z.boolean().optional().default(true),
|
|
460
|
+
checkSecurity: z.boolean().optional().default(true),
|
|
461
|
+
respectRobots: z.boolean().optional().default(true)
|
|
462
|
+
}).optional(),
|
|
463
|
+
outputOptions: z.object({
|
|
464
|
+
includeDetailed: z.boolean().optional().default(true),
|
|
465
|
+
includeAnalysis: z.boolean().optional().default(false),
|
|
466
|
+
contactEmail: z.string().email().optional(),
|
|
467
|
+
organizationName: z.string().optional(),
|
|
468
|
+
customGuidelines: z.array(z.string()).optional(),
|
|
469
|
+
customRestrictions: z.array(z.string()).optional()
|
|
470
|
+
}).optional(),
|
|
471
|
+
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
|
|
472
|
+
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
|
|
473
|
+
});
|
|
474
|
+
|
|
475
|
+
// Stealth Mode Tool Schema (Wave 3)
|
|
476
|
+
const StealthModeSchema = z.object({
|
|
477
|
+
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
|
|
478
|
+
stealthConfig: z.object({
|
|
479
|
+
level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
|
|
480
|
+
randomizeFingerprint: z.boolean().default(true),
|
|
481
|
+
hideWebDriver: z.boolean().default(true),
|
|
482
|
+
blockWebRTC: z.boolean().default(true),
|
|
483
|
+
spoofTimezone: z.boolean().default(true),
|
|
484
|
+
randomizeHeaders: z.boolean().default(true),
|
|
485
|
+
useRandomUserAgent: z.boolean().default(true),
|
|
486
|
+
simulateHumanBehavior: z.boolean().default(true),
|
|
487
|
+
customUserAgent: z.string().optional(),
|
|
488
|
+
customViewport: z.object({
|
|
489
|
+
width: z.number().min(800).max(1920),
|
|
490
|
+
height: z.number().min(600).max(1080)
|
|
491
|
+
}).optional(),
|
|
492
|
+
locale: z.string().default('en-US'),
|
|
493
|
+
timezone: z.string().optional(),
|
|
494
|
+
webRTCPublicIP: z.string().optional(),
|
|
495
|
+
webRTCLocalIPs: z.array(z.string()).optional(),
|
|
496
|
+
proxyRotation: z.object({
|
|
497
|
+
enabled: z.boolean().default(false),
|
|
498
|
+
proxies: z.array(z.string()).optional(),
|
|
499
|
+
rotationInterval: z.number().default(300000)
|
|
500
|
+
}).optional(),
|
|
501
|
+
antiDetection: z.object({
|
|
502
|
+
cloudflareBypass: z.boolean().default(true),
|
|
503
|
+
recaptchaHandling: z.boolean().default(true),
|
|
504
|
+
hideAutomation: z.boolean().default(true),
|
|
505
|
+
spoofMediaDevices: z.boolean().default(true),
|
|
506
|
+
spoofBatteryAPI: z.boolean().default(true)
|
|
507
|
+
}).optional(),
|
|
508
|
+
fingerprinting: z.object({
|
|
509
|
+
canvasNoise: z.boolean().default(true),
|
|
510
|
+
webglSpoofing: z.boolean().default(true),
|
|
511
|
+
audioContextSpoofing: z.boolean().default(true),
|
|
512
|
+
fontSpoofing: z.boolean().default(true),
|
|
513
|
+
hardwareSpoofing: z.boolean().default(true)
|
|
514
|
+
}).optional()
|
|
515
|
+
}).optional(),
|
|
516
|
+
contextId: z.string().optional(),
|
|
517
|
+
urlToTest: z.string().url().optional()
|
|
518
|
+
});
|
|
519
|
+
|
|
520
|
+
// Localization Tool Schema (Wave 3)
|
|
521
|
+
const LocalizationSchema = z.object({
|
|
522
|
+
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
|
|
523
|
+
countryCode: z.string().length(2).optional(),
|
|
524
|
+
language: z.string().optional(),
|
|
525
|
+
timezone: z.string().optional(),
|
|
526
|
+
currency: z.string().length(3).optional(),
|
|
527
|
+
customHeaders: z.record(z.string()).optional(),
|
|
528
|
+
userAgent: z.string().optional(),
|
|
529
|
+
acceptLanguage: z.string().optional(),
|
|
530
|
+
geoLocation: z.object({
|
|
531
|
+
latitude: z.number().min(-90).max(90),
|
|
532
|
+
longitude: z.number().min(-180).max(180),
|
|
533
|
+
accuracy: z.number().min(1).max(100).optional()
|
|
534
|
+
}).optional(),
|
|
535
|
+
proxySettings: z.object({
|
|
536
|
+
enabled: z.boolean().default(false),
|
|
537
|
+
region: z.string().optional(),
|
|
538
|
+
type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
|
|
539
|
+
server: z.string().optional(),
|
|
540
|
+
port: z.number().optional(),
|
|
541
|
+
username: z.string().optional(),
|
|
542
|
+
password: z.string().optional(),
|
|
543
|
+
rotation: z.object({
|
|
544
|
+
enabled: z.boolean().default(false),
|
|
545
|
+
interval: z.number().default(300000),
|
|
546
|
+
strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin')
|
|
547
|
+
}).optional(),
|
|
548
|
+
fallback: z.object({
|
|
549
|
+
enabled: z.boolean().default(true),
|
|
550
|
+
maxRetries: z.number().default(3),
|
|
551
|
+
timeout: z.number().default(10000)
|
|
552
|
+
}).optional()
|
|
553
|
+
}).optional(),
|
|
554
|
+
searchParams: z.object({
|
|
555
|
+
query: z.string().optional(),
|
|
556
|
+
limit: z.number().optional(),
|
|
557
|
+
offset: z.number().optional(),
|
|
558
|
+
headers: z.record(z.string()).optional()
|
|
559
|
+
}).optional(),
|
|
560
|
+
browserOptions: z.object({
|
|
561
|
+
locale: z.string().optional(),
|
|
562
|
+
timezoneId: z.string().optional(),
|
|
563
|
+
extraHTTPHeaders: z.record(z.string()).optional(),
|
|
564
|
+
userAgent: z.string().optional()
|
|
565
|
+
}).optional(),
|
|
566
|
+
content: z.string().optional(),
|
|
567
|
+
url: z.string().url().optional(),
|
|
568
|
+
response: z.object({
|
|
569
|
+
status: z.number(),
|
|
570
|
+
body: z.string().optional(),
|
|
571
|
+
statusText: z.string().optional()
|
|
572
|
+
}).optional()
|
|
573
|
+
});
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
// Utility function to fetch URL with error handling
|
|
577
|
+
/**
 * Fetch a URL, aborting the request if no response arrives within a time limit.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
 * the response object only; reading the body afterwards is not covered.
 *
 * @param {string} url - The URL to request.
 * @param {object} [options]
 * @param {number} [options.timeout=10000] - Milliseconds before the request is aborted.
 * @param {Object<string, string>} [options.headers={}] - Extra request headers;
 *   these override the default User-Agent when they set the same key.
 * @returns {Promise<Response>} The response returned by `fetch`.
 * @throws {Error} "Request timeout after <timeout>ms" when the request was
 *   aborted (the abort error is attached as `cause`); any other fetch error is
 *   rethrown unchanged.
 */
async function fetchWithTimeout(url, options = {}) {
  const { timeout = 10000, headers = {} } = options;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'CrawlForge/1.0.0',
        ...headers
      }
    });
  } catch (error) {
    if (error?.name === 'AbortError') {
      // Keep the underlying abort error available for debugging.
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId);
  }
}
|
|
602
|
+
|
|
603
|
+
// Tool: fetch_url - Basic URL fetching with headers and response handling
// Performs one request through fetchWithTimeout and returns status, headers and
// the full body as a pretty-printed JSON text item. There is no response.ok
// check here, so non-2xx responses are returned as ordinary results; only a
// thrown error produces an isError result.
server.registerTool("fetch_url", {
  description: "Fetch content from a URL with optional headers and timeout",
  inputSchema: {
    url: z.string().url(),
    headers: z.record(z.string()).optional(),
    timeout: z.number().min(1000).max(30000).optional().default(10000)
  }
}, withAuth("fetch_url", async ({ url, headers, timeout }) => {
  try {
    const requestOptions = {
      timeout: timeout || 10000,
      headers: headers || {}
    };
    const response = await fetchWithTimeout(url, requestOptions);
    const body = await response.text();

    // Flatten the Headers object into a plain record so it can be serialized.
    const responseHeaders = {};
    for (const [name, value] of response.headers) {
      responseHeaders[name] = value;
    }

    const payload = {
      status: response.status,
      statusText: response.statusText,
      headers: responseHeaders,
      body,
      contentType: response.headers.get('content-type') || 'unknown',
      // Length of the decoded string, not a byte count.
      size: body.length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to fetch URL: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
648
|
+
|
|
649
|
+
// Tool: extract_text - Extract clean text content from HTML
// Fetches the page, drops script/style elements (unless explicitly disabled)
// plus common navigation/advert containers, and returns the whitespace-
// collapsed body text together with word and character counts.
server.registerTool("extract_text", {
  description: "Extract clean text content from a webpage",
  inputSchema: {
    url: z.string().url(),
    remove_scripts: z.boolean().optional().default(true),
    remove_styles: z.boolean().optional().default(true)
  }
}, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // Selectors to strip before reading text. The two flags only opt out when
    // they are exactly `false`; any other value keeps the removal.
    const unwantedSelectors = [];
    if (remove_scripts !== false) {
      unwantedSelectors.push('script');
    }
    if (remove_styles !== false) {
      unwantedSelectors.push('style');
    }
    unwantedSelectors.push('nav, header, footer, aside, .advertisement, .ad, .sidebar');

    for (const selector of unwantedSelectors) {
      $(selector).remove();
    }

    const text = $('body').text().replace(/\s+/g, ' ').trim();
    const wordCount = text.split(/\s+/).filter(Boolean).length;

    const payload = {
      text,
      word_count: wordCount,
      char_count: text.length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to extract text: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
702
|
+
|
|
703
|
+
// Tool: extract_links - Extract all links from a webpage with optional filtering
// Fetches the page, resolves every <a href> to an absolute URL, classifies it
// as internal/external by comparing origins with the requested page, optionally
// drops external links, and returns the de-duplicated list with counts.
server.registerTool("extract_links", {
  description: "Extract all links from a webpage with optional filtering",
  inputSchema: {
    url: z.string().url(),
    filter_external: z.boolean().optional().default(false),
    base_url: z.string().url().optional()
  }
}, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    const pageUrl = new URL(url);
    // Value reported back as `base_url`; kept as before for existing consumers.
    const baseUrl = base_url || pageUrl.origin;
    // Relative references ("foo.html", "?q=1", "#top") must resolve against the
    // page's own URL, not just its origin, unless the caller supplied a base.
    const resolutionBase = base_url || pageUrl.href;

    const links = [];
    // First occurrence of each href wins; a Set keeps de-duplication linear.
    const seenHrefs = new Set();

    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (!href) return;

      const text = $(element).text().trim();

      try {
        const resolved = new URL(href, resolutionBase);

        // Only hrefs that name a host themselves (absolute http(s) or
        // protocol-relative "//host/...") can point off-site; path-, query-
        // and fragment-only references stay internal.
        const namesHost = /^(?:https?:)?\/\//i.test(href);
        const isExternal = namesHost && resolved.origin !== pageUrl.origin;

        if (filter_external && isExternal) {
          return;
        }

        // Absolute http(s) hrefs are reported verbatim, as before.
        const absoluteUrl = href.startsWith('http://') || href.startsWith('https://')
          ? href
          : resolved.toString();

        if (seenHrefs.has(absoluteUrl)) {
          return;
        }
        seenHrefs.add(absoluteUrl);

        links.push({
          href: absoluteUrl,
          text,
          is_external: isExternal,
          original_href: href
        });
      } catch {
        // Deliberate best-effort: hrefs that cannot be parsed as URLs are skipped.
      }
    });

    const externalCount = links.filter((link) => link.is_external).length;

    return {
      content: [{
        type: "text",
        text: JSON.stringify({
          links,
          total_count: links.length,
          internal_count: links.length - externalCount,
          external_count: externalCount,
          base_url: baseUrl
        }, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to extract links: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
786
|
+
|
|
787
|
+
// Tool: extract_metadata - Extract page metadata
// Fetches the page and reads the title, standard <meta> tags, the canonical
// link, and all Open Graph / Twitter Card tags, returning them as JSON.
server.registerTool("extract_metadata", {
  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
  inputSchema: {
    url: z.string().url()
  }
}, withAuth("extract_metadata", async ({ url }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // `content` attribute of the first element matching the selector, or ''.
    const metaContent = (selector) => $(selector).attr('content') || '';

    // Collects { shortName: content } for every tag whose `attrName` attribute
    // carries the given prefix; the prefix is removed from the key.
    const collectPrefixed = (selector, attrName, prefix) => {
      const tags = {};
      $(selector).each((_, element) => {
        const key = $(element).attr(attrName);
        const value = $(element).attr('content');
        if (key && value) {
          tags[key.replace(prefix, '')] = value;
        }
      });
      return tags;
    };

    // <title> wins; the first <h1> is the fallback.
    const title = $('title').text().trim() || $('h1').first().text().trim();
    const description = metaContent('meta[name="description"]') ||
      metaContent('meta[property="og:description"]');
    const keywords = metaContent('meta[name="keywords"]');
    const canonical = $('link[rel="canonical"]').attr('href') || '';

    const ogTags = collectPrefixed('meta[property^="og:"]', 'property', 'og:');
    const twitterTags = collectPrefixed('meta[name^="twitter:"]', 'name', 'twitter:');

    const author = metaContent('meta[name="author"]');
    const robots = metaContent('meta[name="robots"]');
    const viewport = metaContent('meta[name="viewport"]');
    const charset = $('meta[charset]').attr('charset') ||
      metaContent('meta[http-equiv="Content-Type"]');

    const payload = {
      title,
      description,
      keywords: keywords.split(',').map((keyword) => keyword.trim()).filter((keyword) => keyword),
      canonical_url: canonical,
      author,
      robots,
      viewport,
      charset,
      og_tags: ogTags,
      twitter_tags: twitterTags,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to extract metadata: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
865
|
+
|
|
866
|
+
// Tool: scrape_structured - Extract structured data using CSS selectors
// For each { fieldName: selector } pair: null when nothing matches, the trimmed
// text when exactly one element matches, an array of trimmed texts otherwise.
// A selector that throws yields an { error, message } object for that field.
server.registerTool("scrape_structured", {
  description: "Extract structured data from a webpage using CSS selectors",
  inputSchema: {
    url: z.string().url(),
    selectors: z.record(z.string())
  }
}, withAuth("scrape_structured", async ({ url, selectors }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // Maps one selector to null / string / string[] depending on match count.
    const extractField = (selector) => {
      const matches = $(selector);
      switch (matches.length) {
        case 0:
          return null;
        case 1:
          return matches.text().trim();
        default:
          return matches.map((_, node) => $(node).text().trim()).get();
      }
    };

    const results = {};
    for (const [fieldName, selector] of Object.entries(selectors)) {
      try {
        results[fieldName] = extractField(selector);
      } catch (selectorError) {
        results[fieldName] = {
          error: `Invalid selector: ${selector}`,
          message: selectorError.message
        };
      }
    }

    const payload = {
      data: results,
      selectors_used: selectors,
      // NOTE(review): this is the number of requested fields (including null
      // and error entries), not the number of matched elements - confirm intent.
      elements_found: Object.keys(results).length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to scrape structured data: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
927
|
+
|
|
928
|
+
// Tool: search_web - Web search with configurable providers
// The tool is registered only when searchWebTool is truthy; otherwise a
// provider-specific warning is written to stderr and no tool is registered.
if (!searchWebTool) {
  if (getActiveSearchProvider() === 'google') {
    console.error("Warning: search_web tool not configured. Set GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID to enable Google search.");
  } else {
    console.error("Warning: search_web tool initialization failed. Check your SEARCH_PROVIDER configuration.");
  }
} else {
  // Human-readable provider label used in the tool description.
  const providerLabels = new Map([
    ['google', 'Google Custom Search API'],
    ['duckduckgo', 'DuckDuckGo']
  ]);
  const providerName = providerLabels.get(getActiveSearchProvider()) ?? 'Auto-selected provider';

  server.registerTool("search_web", {
    description: `Search the web using ${providerName}`,
    inputSchema: {
      query: z.string(),
      limit: z.number().min(1).max(100).optional(),
      offset: z.number().min(0).optional(),
      lang: z.string().optional(),
      safe_search: z.boolean().optional(),
      time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
      site: z.string().optional(),
      file_type: z.string().optional()
    }
  }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
    // z.string() accepts '', so an empty query is rejected here.
    if (!query) {
      return {
        content: [{ type: "text", text: "Query parameter is required" }],
        isError: true
      };
    }

    try {
      const searchArgs = { query, limit, offset, lang, safe_search, time_range, site, file_type };
      const result = await searchWebTool.execute(searchArgs);
      return {
        content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
      };
    } catch (error) {
      return {
        content: [{ type: "text", text: `Search failed: ${error.message}` }],
        isError: true
      };
    }
  }));
}
|
|
983
|
+
|
|
984
|
+
// Tool: crawl_deep - Deep crawl websites with BFS algorithm
// Thin adapter: forwards the validated arguments to crawlDeepTool.execute and
// wraps the result (or the failure message) in a text content item.
server.registerTool("crawl_deep", {
  description: "Crawl websites deeply using breadth-first search",
  inputSchema: {
    url: z.string().url(),
    max_depth: z.number().min(1).max(5).optional(),
    max_pages: z.number().min(1).max(1000).optional(),
    include_patterns: z.array(z.string()).optional(),
    exclude_patterns: z.array(z.string()).optional(),
    follow_external: z.boolean().optional(),
    respect_robots: z.boolean().optional(),
    extract_content: z.boolean().optional(),
    concurrency: z.number().min(1).max(20).optional()
  }
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
  if (!url) {
    return {
      content: [{ type: "text", text: "URL parameter is required" }],
      isError: true
    };
  }

  try {
    const crawlArgs = {
      url,
      max_depth,
      max_pages,
      include_patterns,
      exclude_patterns,
      follow_external,
      respect_robots,
      extract_content,
      concurrency
    };
    const result = await crawlDeepTool.execute(crawlArgs);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Crawl failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1027
|
+
|
|
1028
|
+
// Tool: map_site - Discover and map website structure
// Thin adapter: forwards the validated arguments to mapSiteTool.execute and
// wraps the result (or the failure message) in a text content item.
server.registerTool("map_site", {
  description: "Discover and map website structure",
  inputSchema: {
    url: z.string().url(),
    include_sitemap: z.boolean().optional(),
    max_urls: z.number().min(1).max(10000).optional(),
    group_by_path: z.boolean().optional(),
    include_metadata: z.boolean().optional()
  }
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
  if (!url) {
    return {
      content: [{ type: "text", text: "URL parameter is required" }],
      isError: true
    };
  }

  try {
    const mapArgs = { url, include_sitemap, max_urls, group_by_path, include_metadata };
    const result = await mapSiteTool.execute(mapArgs);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Site mapping failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1067
|
+
|
|
1068
|
+
// Phase 3 Tools: Enhanced Content Processing
|
|
1069
|
+
|
|
1070
|
+
// Tool: extract_content - Enhanced content extraction with readability detection
// Thin adapter: forwards `url` and `options` to extractContentTool.execute and
// wraps the result (or the failure message) in a text content item.
server.registerTool("extract_content", {
  description: "Extract and analyze main content from web pages with enhanced readability detection",
  inputSchema: {
    url: z.string().url(),
    // A bare z.object({}) strips unknown keys when parsing (Zod's default), so
    // every option a caller sent would be discarded before reaching the tool.
    // passthrough() keeps them.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("extract_content", async ({ url, options }) => {
  if (!url) {
    return {
      content: [{ type: "text", text: "URL parameter is required" }],
      isError: true
    };
  }

  try {
    const result = await extractContentTool.execute({ url, options });
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Content extraction failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1106
|
+
|
|
1107
|
+
// Tool: process_document - Multi-format document processing
// Thin adapter: forwards `source`, `sourceType` and `options` to
// processDocumentTool.execute and wraps the result (or the failure message)
// in a text content item.
server.registerTool("process_document", {
  description: "Process documents from multiple sources and formats including PDFs and web pages",
  inputSchema: {
    source: z.string(),
    sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
    // A bare z.object({}) strips unknown keys when parsing (Zod's default), so
    // every option a caller sent would be discarded before reaching the tool.
    // passthrough() keeps them.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("process_document", async ({ source, sourceType, options }) => {
  // z.string() accepts '', so an empty source is rejected here.
  if (!source) {
    return {
      content: [{ type: "text", text: "Source parameter is required" }],
      isError: true
    };
  }

  try {
    const result = await processDocumentTool.execute({ source, sourceType, options });
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Document processing failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1144
|
+
|
|
1145
|
+
// Tool: summarize_content - Intelligent content summarization
// Thin adapter: forwards `text` and `options` to summarizeContentTool.execute
// and wraps the result (or the failure message) in a text content item.
server.registerTool("summarize_content", {
  description: "Generate intelligent summaries of text content with configurable options",
  inputSchema: {
    text: z.string(),
    // A bare z.object({}) strips unknown keys when parsing (Zod's default), so
    // every option a caller sent would be discarded before reaching the tool.
    // passthrough() keeps them.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("summarize_content", async ({ text, options }) => {
  // z.string() accepts '', so empty text is rejected here.
  if (!text) {
    return {
      content: [{ type: "text", text: "Text parameter is required" }],
      isError: true
    };
  }

  try {
    const result = await summarizeContentTool.execute({ text, options });
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Content summarization failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1181
|
+
|
|
1182
|
+
// Tool: analyze_content - Comprehensive content analysis
// Thin adapter: forwards `text` and `options` to analyzeContentTool.execute
// and wraps the result (or the failure message) in a text content item.
server.registerTool("analyze_content", {
  description: "Perform comprehensive content analysis including language detection and topic extraction",
  inputSchema: {
    text: z.string(),
    // A bare z.object({}) strips unknown keys when parsing (Zod's default), so
    // every option a caller sent would be discarded before reaching the tool.
    // passthrough() keeps them.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("analyze_content", async ({ text, options }) => {
  // z.string() accepts '', so empty text is rejected here.
  if (!text) {
    return {
      content: [{ type: "text", text: "Text parameter is required" }],
      isError: true
    };
  }

  try {
    const result = await analyzeContentTool.execute({ text, options });
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Content analysis failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
// Wave 2 Advanced Tools
|
|
1221
|
+
|
|
1222
|
+
// Tool: batch_scrape - Process multiple URLs simultaneously with job management
// Thin adapter: the whole validated parameter object is handed to
// batchScrapeTool.execute; the result (or the failure message) is wrapped in a
// text content item.
server.registerTool("batch_scrape", {
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
  inputSchema: {
    // 1-50 entries; each is either a bare URL or an object with per-URL settings.
    urls: z.array(z.union([
      z.string().url(),
      z.object({
        url: z.string().url(),
        selectors: z.record(z.string()).optional(),
        headers: z.record(z.string()).optional(),
        timeout: z.number().min(1000).max(30000).optional(),
        metadata: z.record(z.any()).optional()
      })
    ])).min(1).max(50),
    formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
    mode: z.enum(['sync', 'async']).default('sync'),
    webhook: z.object({
      url: z.string().url(),
      events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
      headers: z.record(z.string()).optional(),
      signingSecret: z.string().optional()
    }).optional(),
    extractionSchema: z.record(z.string()).optional(),
    maxConcurrency: z.number().min(1).max(20).default(10),
    delayBetweenRequests: z.number().min(0).max(10000).default(100),
    includeMetadata: z.boolean().default(true),
    includeFailed: z.boolean().default(true),
    pageSize: z.number().min(1).max(100).default(25),
    jobOptions: z.object({
      priority: z.number().default(0),
      ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
      maxRetries: z.number().min(0).max(5).default(1),
      tags: z.array(z.string()).default([])
    }).optional()
  }
}, withAuth("batch_scrape", async (params) => {
  try {
    const outcome = await batchScrapeTool.execute(params);
    const text = JSON.stringify(outcome, null, 2);
    return { content: [{ type: "text", text }] };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Batch scrape failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1276
|
+
|
|
1277
|
+
// Tool: scrape_with_actions - Execute action chains before scraping
// Thin adapter: the whole validated parameter object is handed to
// scrapeWithActionsTool.execute; the result (or the failure message) is
// wrapped in a text content item.
server.registerTool("scrape_with_actions", {
  description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
  inputSchema: {
    url: z.string().url(),
    // 1-20 actions; which optional fields apply depends on `type`.
    actions: z.array(z.object({
      type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
      selector: z.string().optional(),
      text: z.string().optional(),
      key: z.string().optional(),
      script: z.string().optional(),
      timeout: z.number().optional(),
      description: z.string().optional(),
      continueOnError: z.boolean().default(false),
      retries: z.number().min(0).max(5).default(0)
    })).min(1).max(20),
    formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
    captureIntermediateStates: z.boolean().default(false),
    captureScreenshots: z.boolean().default(true),
    formAutoFill: z.object({
      fields: z.array(z.object({
        selector: z.string(),
        value: z.string(),
        type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
        waitAfter: z.number().min(0).max(5000).default(100)
      })),
      submitSelector: z.string().optional(),
      waitAfterSubmit: z.number().min(0).max(30000).default(2000)
    }).optional(),
    browserOptions: z.object({
      headless: z.boolean().default(true),
      userAgent: z.string().optional(),
      viewportWidth: z.number().min(800).max(1920).default(1280),
      viewportHeight: z.number().min(600).max(1080).default(720),
      timeout: z.number().min(10000).max(120000).default(30000)
    }).optional(),
    extractionOptions: z.object({
      selectors: z.record(z.string()).optional(),
      includeMetadata: z.boolean().default(true),
      includeLinks: z.boolean().default(true),
      includeImages: z.boolean().default(true)
    }).optional(),
    continueOnActionError: z.boolean().default(false),
    maxRetries: z.number().min(0).max(3).default(1),
    screenshotOnError: z.boolean().default(true)
  }
}, withAuth("scrape_with_actions", async (params) => {
  try {
    const outcome = await scrapeWithActionsTool.execute(params);
    const text = JSON.stringify(outcome, null, 2);
    return { content: [{ type: "text", text }] };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1342
|
+
|
|
1343
|
+
// Tool: deep_research - Comprehensive multi-stage research with source verification
// Thin adapter: the whole validated parameter object is handed to
// deepResearchTool.execute; the result (or the failure message) is wrapped in
// a text content item.
server.registerTool("deep_research", {
  description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
  inputSchema: {
    topic: z.string().min(3).max(500),
    maxDepth: z.number().min(1).max(10).optional().default(5),
    maxUrls: z.number().min(1).max(1000).optional().default(50),
    timeLimit: z.number().min(30000).max(300000).optional().default(120000),
    researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
    sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
    credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
    includeRecentOnly: z.boolean().optional().default(false),
    enableConflictDetection: z.boolean().optional().default(true),
    enableSourceVerification: z.boolean().optional().default(true),
    enableSynthesis: z.boolean().optional().default(true),
    outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
    includeRawData: z.boolean().optional().default(false),
    includeActivityLog: z.boolean().optional().default(false),
    queryExpansion: z.object({
      enableSynonyms: z.boolean().optional().default(true),
      enableSpellCheck: z.boolean().optional().default(true),
      enableContextual: z.boolean().optional().default(true),
      maxVariations: z.number().min(1).max(20).optional().default(8)
    }).optional(),
    llmConfig: z.object({
      provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
      openai: z.object({
        apiKey: z.string().optional(),
        model: z.string().optional().default('gpt-3.5-turbo'),
        embeddingModel: z.string().optional().default('text-embedding-ada-002')
      }).optional(),
      anthropic: z.object({
        apiKey: z.string().optional(),
        model: z.string().optional().default('claude-3-haiku-20240307')
      }).optional(),
      enableSemanticAnalysis: z.boolean().optional().default(true),
      enableIntelligentSynthesis: z.boolean().optional().default(true)
    }).optional(),
    concurrency: z.number().min(1).max(20).optional().default(5),
    cacheResults: z.boolean().optional().default(true),
    webhook: z.object({
      url: z.string().url(),
      events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
      headers: z.record(z.string()).optional()
    }).optional()
  }
}, withAuth("deep_research", async (params) => {
  try {
    const outcome = await deepResearchTool.execute(params);
    const text = JSON.stringify(outcome, null, 2);
    return { content: [{ type: "text", text }] };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Deep research failed: ${error.message}` }],
      isError: true
    };
  }
}));
|
|
1408
|
+
|
|
1409
|
+
// Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
|
|
1410
|
+
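// Temporarily disabled due to an import issue.
// NOTE: the block comment opened below is only closed after the generate_llms_txt
// registration, so that tool is currently disabled as well. The disabled code also has
// unbalanced brackets (track_changes' inputSchema is closed with "})" and the
// generate_llms_txt withAuth(...) call with "});") that must be fixed before re-enabling.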
|
|
1411
|
+
/*
|
|
1412
|
+
server.registerTool("track_changes", {
|
|
1413
|
+
description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
|
|
1414
|
+
inputSchema: {
|
|
1415
|
+
url: z.string().url(),
|
|
1416
|
+
operation: z.enum([
|
|
1417
|
+
'create_baseline',
|
|
1418
|
+
'compare',
|
|
1419
|
+
'monitor',
|
|
1420
|
+
'get_history',
|
|
1421
|
+
'get_stats',
|
|
1422
|
+
'create_scheduled_monitor',
|
|
1423
|
+
'stop_scheduled_monitor',
|
|
1424
|
+
'get_dashboard',
|
|
1425
|
+
'export_history',
|
|
1426
|
+
'create_alert_rule',
|
|
1427
|
+
'generate_trend_report',
|
|
1428
|
+
'get_monitoring_templates'
|
|
1429
|
+
]).default('compare'),
|
|
1430
|
+
content: z.string().optional(),
|
|
1431
|
+
html: z.string().optional(),
|
|
1432
|
+
trackingOptions: z.object({
|
|
1433
|
+
granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
|
|
1434
|
+
trackText: z.boolean().default(true),
|
|
1435
|
+
trackStructure: z.boolean().default(true),
|
|
1436
|
+
trackAttributes: z.boolean().default(false),
|
|
1437
|
+
trackImages: z.boolean().default(false),
|
|
1438
|
+
trackLinks: z.boolean().default(true),
|
|
1439
|
+
ignoreWhitespace: z.boolean().default(true),
|
|
1440
|
+
ignoreCase: z.boolean().default(false),
|
|
1441
|
+
customSelectors: z.array(z.string()).optional(),
|
|
1442
|
+
excludeSelectors: z.array(z.string()).optional(),
|
|
1443
|
+
significanceThresholds: z.object({
|
|
1444
|
+
minor: z.number().min(0).max(1).default(0.1),
|
|
1445
|
+
moderate: z.number().min(0).max(1).default(0.3),
|
|
1446
|
+
major: z.number().min(0).max(1).default(0.7)
|
|
1447
|
+
}).optional()
|
|
1448
|
+
}).optional(),
|
|
1449
|
+
monitoringOptions: z.object({
|
|
1450
|
+
enabled: z.boolean().default(false),
|
|
1451
|
+
interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
|
|
1452
|
+
maxRetries: z.number().min(0).max(5).default(3),
|
|
1453
|
+
retryDelay: z.number().min(1000).max(60000).default(5000),
|
|
1454
|
+
notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
|
|
1455
|
+
enableWebhook: z.boolean().default(false),
|
|
1456
|
+
webhookUrl: z.string().url().optional(),
|
|
1457
|
+
webhookSecret: z.string().optional()
|
|
1458
|
+
}).optional(),
|
|
1459
|
+
storageOptions: z.object({
|
|
1460
|
+
enableSnapshots: z.boolean().default(true),
|
|
1461
|
+
retainHistory: z.boolean().default(true),
|
|
1462
|
+
maxHistoryEntries: z.number().min(1).max(1000).default(100),
|
|
1463
|
+
compressionEnabled: z.boolean().default(true),
|
|
1464
|
+
deltaStorageEnabled: z.boolean().default(true)
|
|
1465
|
+
}).optional(),
|
|
1466
|
+
queryOptions: z.object({
|
|
1467
|
+
limit: z.number().min(1).max(500).default(50),
|
|
1468
|
+
offset: z.number().min(0).default(0),
|
|
1469
|
+
startTime: z.number().optional(),
|
|
1470
|
+
endTime: z.number().optional(),
|
|
1471
|
+
includeContent: z.boolean().default(false),
|
|
1472
|
+
significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
|
|
1473
|
+
}).optional(),
|
|
1474
|
+
notificationOptions: z.object({
|
|
1475
|
+
webhook: z.object({
|
|
1476
|
+
enabled: z.boolean().default(false),
|
|
1477
|
+
url: z.string().url().optional(),
|
|
1478
|
+
method: z.enum(['POST', 'PUT']).default('POST'),
|
|
1479
|
+
headers: z.record(z.string()).optional(),
|
|
1480
|
+
signingSecret: z.string().optional(),
|
|
1481
|
+
includeContent: z.boolean().default(false)
|
|
1482
|
+
}).optional(),
|
|
1483
|
+
slack: z.object({
|
|
1484
|
+
enabled: z.boolean().default(false),
|
|
1485
|
+
webhookUrl: z.string().url().optional(),
|
|
1486
|
+
channel: z.string().optional(),
|
|
1487
|
+
username: z.string().optional()
|
|
1488
|
+
}).optional()
|
|
1489
|
+
}).optional(),
|
|
1490
|
+
// Enhanced Phase 2.4 options
|
|
1491
|
+
scheduledMonitorOptions: z.object({
|
|
1492
|
+
schedule: z.string().optional(), // Cron expression
|
|
1493
|
+
templateId: z.string().optional(), // Monitoring template ID
|
|
1494
|
+
enabled: z.boolean().default(true)
|
|
1495
|
+
}).optional(),
|
|
1496
|
+
alertRuleOptions: z.object({
|
|
1497
|
+
ruleId: z.string().optional(),
|
|
1498
|
+
condition: z.string().optional(), // Condition description
|
|
1499
|
+
actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
|
|
1500
|
+
throttle: z.number().min(0).optional(),
|
|
1501
|
+
priority: z.enum(['low', 'medium', 'high']).optional()
|
|
1502
|
+
}).optional(),
|
|
1503
|
+
exportOptions: z.object({
|
|
1504
|
+
format: z.enum(['json', 'csv']).default('json'),
|
|
1505
|
+
startTime: z.number().optional(),
|
|
1506
|
+
endTime: z.number().optional(),
|
|
1507
|
+
includeContent: z.boolean().default(false),
|
|
1508
|
+
includeSnapshots: z.boolean().default(false)
|
|
1509
|
+
}).optional(),
|
|
1510
|
+
dashboardOptions: z.object({
|
|
1511
|
+
includeRecentAlerts: z.boolean().default(true),
|
|
1512
|
+
includeTrends: z.boolean().default(true),
|
|
1513
|
+
includeMonitorStatus: z.boolean().default(true)
|
|
1514
|
+
}).optional()
|
|
1515
|
+
})
|
|
1516
|
+
}, async (params) => {
|
|
1517
|
+
try {
|
|
1518
|
+
const result = await trackChangesTool.execute(params);
|
|
1519
|
+
return {
|
|
1520
|
+
content: [{
|
|
1521
|
+
type: "text",
|
|
1522
|
+
text: JSON.stringify(result, null, 2)
|
|
1523
|
+
}]
|
|
1524
|
+
};
|
|
1525
|
+
} catch (error) {
|
|
1526
|
+
return {
|
|
1527
|
+
content: [{
|
|
1528
|
+
type: "text",
|
|
1529
|
+
text: `Change tracking failed: ${error.message}`
|
|
1530
|
+
}],
|
|
1531
|
+
isError: true
|
|
1532
|
+
};
|
|
1533
|
+
}
|
|
1534
|
+
});
|
|
1535
|
+
|
|
1536
|
+
// Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
|
|
1537
|
+
server.registerTool("generate_llms_txt", {
|
|
1538
|
+
description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
|
|
1539
|
+
inputSchema: {
|
|
1540
|
+
url: z.string().url(),
|
|
1541
|
+
analysisOptions: z.object({
|
|
1542
|
+
maxDepth: z.number().min(1).max(5).optional().default(3),
|
|
1543
|
+
maxPages: z.number().min(10).max(500).optional().default(100),
|
|
1544
|
+
detectAPIs: z.boolean().optional().default(true),
|
|
1545
|
+
analyzeContent: z.boolean().optional().default(true),
|
|
1546
|
+
checkSecurity: z.boolean().optional().default(true),
|
|
1547
|
+
respectRobots: z.boolean().optional().default(true)
|
|
1548
|
+
}).optional(),
|
|
1549
|
+
outputOptions: z.object({
|
|
1550
|
+
includeDetailed: z.boolean().optional().default(true),
|
|
1551
|
+
includeAnalysis: z.boolean().optional().default(false),
|
|
1552
|
+
contactEmail: z.string().email().optional(),
|
|
1553
|
+
organizationName: z.string().optional(),
|
|
1554
|
+
customGuidelines: z.array(z.string()).optional(),
|
|
1555
|
+
customRestrictions: z.array(z.string()).optional()
|
|
1556
|
+
}).optional(),
|
|
1557
|
+
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
|
|
1558
|
+
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
|
|
1559
|
+
}
|
|
1560
|
+
}, withAuth("generate_llms_txt", async (params) => {
|
|
1561
|
+
try {
|
|
1562
|
+
const result = await generateLLMsTxtTool.execute(params);
|
|
1563
|
+
return {
|
|
1564
|
+
content: [{
|
|
1565
|
+
type: "text",
|
|
1566
|
+
text: JSON.stringify(result, null, 2)
|
|
1567
|
+
}]
|
|
1568
|
+
};
|
|
1569
|
+
} catch (error) {
|
|
1570
|
+
return {
|
|
1571
|
+
content: [{
|
|
1572
|
+
type: "text",
|
|
1573
|
+
text: `LLMs.txt generation failed: ${error.message}`
|
|
1574
|
+
}],
|
|
1575
|
+
isError: true
|
|
1576
|
+
};
|
|
1577
|
+
}
|
|
1578
|
+
});
|
|
1579
|
+
*/
|
|
1580
|
+
|
|
1581
|
+
// Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
//
// Dispatches on `operation`:
//   configure      - validates `stealthConfig` via stealthBrowserManager.validateConfig
//   enable/disable - toggles stealth mode (level defaults to 'medium')
//   create_context - creates a stealth context and reports its id and fingerprint
//   create_page    - creates a page in `contextId`, optionally navigating to `urlToTest`
//   get_stats      - returns stealthBrowserManager.getStats()
//   cleanup        - awaits stealthBrowserManager.cleanup()
// Any thrown error is reported as an MCP error result (isError: true).
server.registerTool("stealth_mode", {
  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
  inputSchema: {
    operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
    stealthConfig: z.object({
      level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
      randomizeFingerprint: z.boolean().default(true),
      hideWebDriver: z.boolean().default(true),
      blockWebRTC: z.boolean().default(true),
      spoofTimezone: z.boolean().default(true),
      randomizeHeaders: z.boolean().default(true),
      useRandomUserAgent: z.boolean().default(true),
      simulateHumanBehavior: z.boolean().default(true),
      customUserAgent: z.string().optional(),
      customViewport: z.object({
        width: z.number().min(800).max(1920),
        height: z.number().min(600).max(1080)
      }).optional(),
      locale: z.string().default('en-US'),
      timezone: z.string().optional(),
      webRTCPublicIP: z.string().optional(),
      webRTCLocalIPs: z.array(z.string()).optional(),
      proxyRotation: z.object({
        enabled: z.boolean().default(false),
        proxies: z.array(z.string()).optional(),
        rotationInterval: z.number().default(300000)
      }).optional(),
      antiDetection: z.object({
        cloudflareBypass: z.boolean().default(true),
        recaptchaHandling: z.boolean().default(true),
        hideAutomation: z.boolean().default(true),
        spoofMediaDevices: z.boolean().default(true),
        spoofBatteryAPI: z.boolean().default(true)
      }).optional(),
      fingerprinting: z.object({
        canvasNoise: z.boolean().default(true),
        webglSpoofing: z.boolean().default(true),
        audioContextSpoofing: z.boolean().default(true),
        fontSpoofing: z.boolean().default(true),
        hardwareSpoofing: z.boolean().default(true)
      }).optional()
    }).optional(),
    contextId: z.string().optional(),
    urlToTest: z.string().url().optional()
  }
}, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
  try {
    let result;

    switch (operation) {
      case 'configure':
        if (stealthConfig) {
          const validated = stealthBrowserManager.validateConfig(stealthConfig);
          result = { configured: true, config: validated };
        } else {
          result = { error: 'stealthConfig is required for configure operation' };
        }
        break;

      case 'enable': {
        const level = stealthConfig?.level || 'medium';
        stealthBrowserManager.enableStealthMode(level);
        result = { enabled: true, level };
        break;
      }

      case 'disable':
        stealthBrowserManager.disableStealthMode();
        result = { disabled: true };
        break;

      case 'create_context': {
        const contextData = await stealthBrowserManager.createStealthContext(stealthConfig);
        result = {
          contextId: contextData.contextId,
          fingerprint: contextData.fingerprint,
          created: true
        };
        break;
      }

      case 'create_page': {
        if (!contextId) {
          throw new Error('contextId is required for create_page operation');
        }
        const page = await stealthBrowserManager.createStealthPage(contextId);

        // page.goto() resolves to a navigation response object, not a URL, so it
        // must not be placed in the JSON result. Navigate first, then report the
        // URL the page ended up on.
        let loadedUrl = null;
        if (urlToTest) {
          await page.goto(urlToTest);
          // NOTE(review): assumes the page exposes url() (as Playwright/Puppeteer
          // pages do); falls back to the requested URL otherwise.
          loadedUrl = typeof page.url === 'function' ? page.url() : urlToTest;
        }

        result = {
          pageCreated: true,
          contextId,
          url: loadedUrl
        };
        break;
      }

      case 'get_stats':
        result = stealthBrowserManager.getStats();
        break;

      case 'cleanup':
        await stealthBrowserManager.cleanup();
        result = { cleaned: true };
        break;

      default:
        result = { error: `Unknown operation: ${operation}` };
    }

    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Stealth mode operation failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
|
|
1701
|
+
|
|
1702
|
+
// Tool: localization - Multi-language and geo-location management (Wave 3)
//
// Routes each `operation` to the matching localizationManager call after checking
// the parameters that operation requires. The outcome is returned as pretty-printed
// JSON; any thrown error becomes an MCP error result (isError: true).
server.registerTool("localization", {
  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
  inputSchema: {
    operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
    countryCode: z.string().length(2).optional(),
    language: z.string().optional(),
    timezone: z.string().optional(),
    currency: z.string().length(3).optional(),
    customHeaders: z.record(z.string()).optional(),
    userAgent: z.string().optional(),
    acceptLanguage: z.string().optional(),
    geoLocation: z.object({
      latitude: z.number().min(-90).max(90),
      longitude: z.number().min(-180).max(180),
      accuracy: z.number().min(1).max(100).optional(),
    }).optional(),
    proxySettings: z.object({
      enabled: z.boolean().default(false),
      region: z.string().optional(),
      type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
      server: z.string().optional(),
      port: z.number().optional(),
      username: z.string().optional(),
      password: z.string().optional(),
      rotation: z.object({
        enabled: z.boolean().default(false),
        interval: z.number().default(300000),
        strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin'),
      }).optional(),
      fallback: z.object({
        enabled: z.boolean().default(true),
        maxRetries: z.number().default(3),
        timeout: z.number().default(10000),
      }).optional(),
    }).optional(),
    searchParams: z.object({
      query: z.string().optional(),
      limit: z.number().optional(),
      offset: z.number().optional(),
      headers: z.record(z.string()).optional(),
    }).optional(),
    browserOptions: z.object({
      locale: z.string().optional(),
      timezoneId: z.string().optional(),
      extraHTTPHeaders: z.record(z.string()).optional(),
      userAgent: z.string().optional(),
    }).optional(),
    content: z.string().optional(),
    url: z.string().url().optional(),
    response: z.object({
      status: z.number(),
      body: z.string().optional(),
      statusText: z.string().optional(),
    }).optional(),
  },
}, withAuth("localization", async (params) => {
  // Small guard helper so each branch can state its precondition on one line.
  const fail = (message) => {
    throw new Error(message);
  };

  try {
    const { operation } = params;
    let payload;

    if (operation === 'configure_country') {
      if (!params.countryCode) fail('countryCode is required for configure_country operation');
      payload = await localizationManager.configureCountry(params.countryCode, params);
    } else if (operation === 'localize_search') {
      if (!params.searchParams) fail('searchParams is required for localize_search operation');
      payload = await localizationManager.localizeSearchQuery(params.searchParams, params.countryCode);
    } else if (operation === 'localize_browser') {
      if (!params.browserOptions) fail('browserOptions is required for localize_browser operation');
      payload = await localizationManager.localizeBrowserContext(params.browserOptions, params.countryCode);
    } else if (operation === 'generate_timezone_spoof') {
      // The script is generated first; the fallback country is only looked up afterwards.
      const timezoneScript = await localizationManager.generateTimezoneSpoof(params.countryCode);
      const countryCode = params.countryCode || localizationManager.getCurrentSettings().countryCode;
      payload = { timezoneScript, countryCode };
    } else if (operation === 'handle_geo_blocking') {
      if (!(params.url && params.response)) fail('url and response are required for handle_geo_blocking operation');
      payload = await localizationManager.handleGeoBlocking(params.url, params.response);
    } else if (operation === 'auto_detect') {
      if (!(params.content && params.url)) fail('content and url are required for auto_detect operation');
      payload = await localizationManager.autoDetectLocalization(params.content, params.url);
    } else if (operation === 'get_stats') {
      payload = localizationManager.getStats();
    } else if (operation === 'get_supported_countries') {
      const supportedCountries = localizationManager.getSupportedCountries();
      const totalCount = localizationManager.getSupportedCountries().length;
      payload = { supportedCountries, totalCount };
    } else {
      payload = { error: `Unknown operation: ${operation}` };
    }

    const text = JSON.stringify(payload, null, 2);
    return { content: [{ type: "text", text }] };
  } catch (error) {
    const text = `Localization operation failed: ${error.message}`;
    return { content: [{ type: "text", text }], isError: true };
  }
}));
|
|
1837
|
+
|
|
1838
|
+
// Set up the stdio transport and start the server
/**
 * Connects the MCP server to a stdio transport and logs a startup summary to stderr
 * (stdout is left untouched; the summary goes through console.error).
 *
 * The summary lists the environment, whether search is configured (and the active
 * provider if so) and the registered tools. In development it also starts the
 * memory monitor.
 *
 * @returns {Promise<void>} Resolves once the server is connected and the summary is logged.
 */
async function runServer() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("CrawlForge MCP Server v3.0 running on stdio");
  console.error(`Environment: ${config.server.nodeEnv}`);

  // Evaluate once so the log line and the tool list agree.
  const searchEnabled = isSearchConfigured();
  if (searchEnabled) {
    const activeProvider = getActiveSearchProvider();
    console.error(`Search enabled: ${searchEnabled} (provider: ${activeProvider})`);
  } else {
    console.error(`Search enabled: ${searchEnabled}`);
  }

  const toolNames = [
    'fetch_url', 'extract_text', 'extract_links', 'extract_metadata',
    'scrape_structured', 'crawl_deep', 'map_site',
    ...(searchEnabled ? ['search_web'] : []),
    'extract_content', 'process_document', 'summarize_content', 'analyze_content',
    'batch_scrape', 'scrape_with_actions',
    'deep_research',
    // track_changes and generate_llms_txt are intentionally not listed: both
    // registrations sit inside the commented-out block in this file, so
    // advertising them here would be misleading.
    'stealth_mode', 'localization',
  ];
  console.error(`Tools available: ${toolNames.join(', ')}`);

  // Start memory monitoring in development
  if (config.server.nodeEnv === "development") {
    memoryMonitor.start();
    console.error("Memory monitoring started");
  }
}
|
|
1868
|
+
|
|
1869
|
+
// Launch the server. A failure while starting up is fatal: log it and exit non-zero.
runServer().catch((startupError) => {
  console.error("Server error:", startupError);
  process.exit(1);
});
|
|
1873
|
+
// === MEMORY LEAK PREVENTION ===
// Add graceful shutdown handling to prevent memory leaks

// Set on the first shutdown request; a second request forces an immediate exit.
let isShuttingDown = false;

/**
 * Releases tool resources and terminates the process.
 *
 * Every listed tool that exposes `destroy()` (preferred) or `cleanup()` is torn
 * down in parallel, with the whole teardown capped at 5 seconds. Per-tool failures
 * are logged and do not abort the shutdown. Afterwards the memory monitor is
 * stopped and, if exposed, a final garbage collection is run.
 *
 * Exit code: 0 for an ordinary signal, 1 when the shutdown was triggered by
 * 'uncaughtException' or 'unhandledRejection' (so a crash is never reported as
 * success), and 1 if the shutdown itself fails or is requested a second time.
 *
 * @param {string} signal - Name of the signal or process event that triggered the shutdown.
 * @returns {Promise<void>} In practice never settles, because the process exits.
 */
async function gracefulShutdown(signal) {
  if (isShuttingDown) {
    console.error("Force shutdown...");
    process.exit(1);
  }

  isShuttingDown = true;
  console.error(`Received ${signal}. Starting graceful shutdown...`);

  // A shutdown caused by a crash must not exit with a success status.
  const isFatalTrigger = signal === 'uncaughtException' || signal === 'unhandledRejection';
  const exitCode = isFatalTrigger ? 1 : 0;

  try {
    // Cleanup tools that have destroy methods
    const toolsToCleanup = [
      batchScrapeTool,
      scrapeWithActionsTool,
      deepResearchTool,
      // trackChangesTool, // temporarily disabled
      generateLLMsTxtTool,
      stealthBrowserManager,
      localizationManager
    ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));

    console.error(`Cleaning up ${toolsToCleanup.length} tools...`);

    const cleanupAll = Promise.all(toolsToCleanup.map(async (tool) => {
      try {
        if (typeof tool.destroy === 'function') {
          await tool.destroy();
        } else if (typeof tool.cleanup === 'function') {
          await tool.cleanup();
        }
        console.error(`Cleaned up ${tool.constructor.name}`);
      } catch (error) {
        console.error(`Error cleaning up ${tool.constructor.name}:`, error.message);
      }
    }));

    // Cleanup tools with timeout (5 seconds); the timer is cleared once the race
    // settles so it does not linger after cleanup has finished.
    let cleanupTimer;
    const cleanupDeadline = new Promise((resolve) => {
      cleanupTimer = setTimeout(resolve, 5000);
    });
    try {
      await Promise.race([cleanupAll, cleanupDeadline]);
    } finally {
      clearTimeout(cleanupTimer);
    }

    // Stop memory monitoring
    if (memoryMonitor.isMonitoring) {
      memoryMonitor.stop();
      console.error("Memory monitoring stopped");
    }

    // Force garbage collection if available
    if (global.gc) {
      console.error("Running final garbage collection...");
      global.gc();
    }

    console.error("Graceful shutdown completed");
    process.exit(exitCode);

  } catch (error) {
    console.error("Error during graceful shutdown:", error);
    process.exit(1);
  }
}
|
|
1938
|
+
|
|
1939
|
+
// Register signal handlers: both termination signals start a graceful shutdown.
for (const signalName of ['SIGINT', 'SIGTERM']) {
  process.on(signalName, () => gracefulShutdown(signalName));
}

// Handle uncaught exceptions and unhandled rejections: log the failure, then shut down.
process.on('uncaughtException', (uncaughtError) => {
  console.error('Uncaught Exception:', uncaughtError);
  gracefulShutdown('uncaughtException');
});

process.on('unhandledRejection', (rejectionReason, rejectedPromise) => {
  console.error('Unhandled Rejection at:', rejectedPromise, 'reason:', rejectionReason);
  gracefulShutdown('unhandledRejection');
});
|
|
1953
|
+
|
|
1954
|
+
// Memory monitoring (development only)
// Once a minute, logs a warning to stderr when the V8 heap in use exceeds 200MB.
if (config.server.nodeEnv === 'development') {
  const HIGH_HEAP_USAGE_MB = 200;
  const CHECK_INTERVAL_MS = 60000; // Check every minute

  const heapCheckTimer = setInterval(() => {
    // Compare as a number; only format to two decimals for display.
    const heapUsedMB = process.memoryUsage().heapUsed / 1024 / 1024;
    if (heapUsedMB > HIGH_HEAP_USAGE_MB) {
      console.error(`Memory usage: ${heapUsedMB.toFixed(2)}MB (high usage detected)`);
    }
  }, CHECK_INTERVAL_MS);

  // A diagnostic timer must not be the only thing keeping the process alive.
  heapCheckTimer.unref();
}
|