@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0
package/build/index.js
ADDED
@@ -0,0 +1,1007 @@
#!/usr/bin/env node
// IMPORTANT: Suppress ALL stdout logging for MCP compatibility
// MCP servers must only output JSON-RPC messages to stdout
// Set environment variables to suppress Crawlee/Apify logging
process.env.CRAWLEE_LOG_LEVEL = 'OFF';
process.env.APIFY_LOG_LEVEL = 'OFF';
// Import and suppress Crawlee logging
import { log, Configuration } from 'crawlee';
log.setLevel(log.LEVELS.OFF);
// Configure Crawlee to be silent
Configuration.getGlobalConfig().set('logLevel', 'OFF');
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { CallToolRequestSchema, ErrorCode, ListToolsRequestSchema, McpError } from '@modelcontextprotocol/sdk/types.js';
import { DocumentStore } from './storage/storage.js';
import { FastEmbeddings } from './embeddings/fastembed.js';
import { WebDocumentProcessor } from './processor/processor.js';
import { IndexingStatusTracker } from './indexing/status.js';
import { IndexingQueueManager } from './indexing/queue-manager.js';
import { loadConfig, isValidPublicUrl, normalizeUrl } from './config.js';
import { DocsCrawler } from './crawler/docs-crawler.js';
import { AuthManager } from './crawler/auth.js';
import { fetchFavicon } from './util/favicon.js';
import { generateDocId } from './util/docs.js';
import { logger } from './util/logger.js';
import { StorageStateSchema, safeJsonParse, validateToolArgs, sanitizeErrorMessage, detectPromptInjection, wrapExternalContent, addInjectionWarnings, SessionExpiredError, AddDocumentationArgsSchema, AuthenticateArgsSchema, ClearAuthArgsSchema, SearchDocumentationArgsSchema, ReindexDocumentationArgsSchema, DeleteDocumentationArgsSchema, } from './util/security.js';
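// Why the suppression above matters: over the stdio transport the client parses
// stdout as a stream of newline-delimited JSON-RPC messages, so any stray
// library logging would corrupt the protocol. As a minimal illustration (entry
// name and client config shape hypothetical), a client could launch this server
// over stdio with:
//
//   { "mcpServers": { "web-docs": { "command": "npx", "args": ["@cosmocoder/mcp-web-docs"] } } }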
class WebDocsServer {
    server;
    config;
    store;
    processor;
    statusTracker;
    indexingQueue;
    authManager;
    /** Maps operation ID to progress token for MCP notifications */
    progressTokens = new Map();
    /** Tracks last notified progress to throttle notifications */
    lastNotifiedProgress = new Map();
    constructor() {
        // Initialize basic components that don't need async initialization
        this.statusTracker = new IndexingStatusTracker();
        this.indexingQueue = new IndexingQueueManager();
        // Set up status change listener for MCP progress notifications
        this.statusTracker.addStatusListener((status) => {
            this.sendProgressNotification(status);
        });
        // Initialize MCP server
        this.server = new McpServer({
            name: 'mcp-web-docs',
            version: '1.0.0',
        }, {
            capabilities: {
                tools: {},
            },
        });
        // Set up tool handlers
        this.setupToolHandlers();
        // Handle errors
        this.server.server.onerror = (error) => logger.error('[MCP Error]', error);
    }
    /**
     * Send MCP progress notification to client.
     * Only sends if the client provided a progressToken in the original request.
     * Throttled to avoid flooding - sends on 5% increments or status changes.
     */
    async sendProgressNotification(status) {
        const progressToken = this.progressTokens.get(status.id);
        // Only send if we have a progress token from the client
        if (!progressToken) {
            logger.debug(`[Progress] No token for ${status.id}, skipping notification`);
            return;
        }
        const progressPercent = Math.round(status.progress * 100);
        const lastProgress = this.lastNotifiedProgress.get(status.id) ?? -1;
        // Only notify on significant progress (5% increments) or status changes
        const isStatusChange = status.status === 'complete' || status.status === 'failed' || status.status === 'cancelled';
        const isSignificantProgress = progressPercent - lastProgress >= 5;
        if (!isStatusChange && !isSignificantProgress) {
            return;
        }
        this.lastNotifiedProgress.set(status.id, progressPercent);
        // Build human-readable message
        let message = status.description;
        if (status.pagesProcessed !== undefined && status.pagesFound !== undefined) {
            message = `${status.description} (${status.pagesProcessed}/${status.pagesFound} pages)`;
        }
        try {
            // Send MCP progress notification per spec:
            // https://modelcontextprotocol.io/specification/2025-03-26/basic/utilities/progress
            await this.server.server.notification({
                method: 'notifications/progress',
                params: {
                    progressToken,
                    progress: progressPercent,
                    total: 100,
                    message,
                },
            });
            logger.info(`[Progress] Sent notification: ${progressPercent}% - ${message}`);
        }
        catch (error) {
            logger.debug(`[Progress] Failed to send notification:`, error);
        }
        // Clean up tracking for completed operations
        if (isStatusChange) {
            this.lastNotifiedProgress.delete(status.id);
            this.progressTokens.delete(status.id);
        }
    }
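    // For reference, a notification sent above reaches the client as a JSON-RPC
    // frame roughly like the following (values illustrative):
    //
    //   { "jsonrpc": "2.0", "method": "notifications/progress",
    //     "params": { "progressToken": "add-1", "progress": 45, "total": 100,
    //                 "message": "Creating embeddings (12/87 pages)" } }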
    async initialize() {
        // Load configuration
        this.config = await loadConfig();
        // Initialize components that need config
        const embeddings = new FastEmbeddings();
        this.store = new DocumentStore(this.config.dbPath, this.config.vectorDbPath, embeddings, this.config.cacheSize);
        this.processor = new WebDocumentProcessor(embeddings, this.config.maxChunkSize);
        // Initialize auth manager for handling authenticated crawls
        this.authManager = new AuthManager(this.config.dataDir);
        await this.authManager.initialize();
        // Initialize storage
        await this.store.initialize();
    }
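    // Note: run() awaits initialize() before connecting the stdio transport, so
    // by the time any tool handler below executes, this.store, this.processor,
    // and this.authManager are guaranteed to be ready.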
    setupToolHandlers() {
        // List available tools
        this.server.server.setRequestHandler(ListToolsRequestSchema, async () => ({
            tools: [
                {
                    name: 'add_documentation',
                    description: `Add new documentation site for indexing. Supports authenticated sites via the auth options.

IMPORTANT: Before calling this tool, ask the user if they want to restrict crawling to a specific path prefix. For example, if indexing https://docs.example.com/api/v2/overview, the user might want to restrict to '/api/v2' to avoid crawling unrelated sections of the site.`,
                    inputSchema: {
                        type: 'object',
                        properties: {
                            url: {
                                type: 'string',
                                description: 'URL of the documentation site',
                            },
                            title: {
                                type: 'string',
                                description: 'Optional title for the documentation',
                            },
                            id: {
                                type: 'string',
                                description: 'Optional custom ID for the documentation (used for storage and identification). If not provided, an ID is auto-generated from the URL.',
                            },
                            pathPrefix: {
                                type: 'string',
                                description: "Optional path prefix to restrict crawling. Only pages whose URL path starts with this prefix will be indexed. Must start with '/'. Example: '/api/v2' would only crawl pages under that path.",
                            },
                            auth: {
                                type: 'object',
                                description: 'Authentication options for protected documentation sites',
                                properties: {
                                    requiresAuth: {
                                        type: 'boolean',
                                        description: 'Set to true to open a browser for interactive login before crawling',
                                    },
                                    browser: {
                                        type: 'string',
                                        enum: ['chromium', 'chrome', 'firefox', 'webkit', 'edge'],
                                        description: "Optional. If omitted, the user's default browser is automatically detected from OS settings. Only specify to override.",
                                    },
                                    loginUrl: {
                                        type: 'string',
                                        description: 'Login page URL if different from main URL',
                                    },
                                    loginSuccessPattern: {
                                        type: 'string',
                                        description: 'URL regex pattern that indicates successful login',
                                    },
                                    loginSuccessSelector: {
                                        type: 'string',
                                        description: 'CSS selector that appears after successful login',
                                    },
                                    loginTimeoutSecs: {
                                        type: 'number',
                                        description: 'Timeout for login in seconds (default: 300)',
                                    },
                                },
                            },
                        },
                        required: ['url'],
                    },
                },
                {
                    name: 'authenticate',
                    description: "Open a browser window for interactive login to a protected site. The session will be saved and reused for future crawls. Use this before add_documentation for sites that require login. The user's default browser is automatically detected from OS settings - do NOT specify a browser unless the user explicitly requests a specific one.",
                    inputSchema: {
                        type: 'object',
                        properties: {
                            url: {
                                type: 'string',
                                description: 'URL of the site to authenticate to',
                            },
                            browser: {
                                type: 'string',
                                enum: ['chromium', 'chrome', 'firefox', 'webkit', 'edge'],
                                description: "Optional. If omitted, the user's default browser is automatically detected from OS settings. Only specify this to override auto-detection with a specific browser.",
                            },
                            loginUrl: {
                                type: 'string',
                                description: 'Login page URL if different from main URL',
                            },
                            loginTimeoutSecs: {
                                type: 'number',
                                description: 'Timeout for login in seconds (default: 300 = 5 minutes)',
                            },
                        },
                        required: ['url'],
                    },
                },
                {
                    name: 'clear_auth',
                    description: 'Clear saved authentication session for a domain',
                    inputSchema: {
                        type: 'object',
                        properties: {
                            url: {
                                type: 'string',
                                description: 'URL of the site to clear authentication for',
                            },
                        },
                        required: ['url'],
                    },
                },
                {
                    name: 'list_documentation',
                    description: 'List all indexed documentation sites',
                    inputSchema: {
                        type: 'object',
                        properties: {},
                    },
                },
                {
                    name: 'search_documentation',
                    description: `Search through indexed documentation using hybrid search (full-text + semantic).

## Query Tips for Best Results

1. **Be specific** - Include unique terms from what you're looking for
   - Instead of: "Button props"
   - Try: "Button props onClick disabled loading"

2. **Use exact phrases** - Wrap in quotes for exact matching
   - "authentication middleware" finds that exact phrase
   - authentication middleware finds pages with either word

3. **Include context** - Add related terms to narrow results
   - API docs: "GET /users endpoint authentication headers"
   - Config: "webpack config entry output plugins"
   - Functions: "parseJSON function parameters return type"

4. **Combine concepts** - More terms = more precise results
   - "Card component status primary negative props table"
   - "database connection pool maxConnections timeout"

## How Search Works
- Full-text search with stemming (run → runs, running)
- Fuzzy matching for typos (authetication → authentication)
- Semantic similarity for conceptual matches
- Results ranked by relevance combining all signals`,
                    inputSchema: {
                        type: 'object',
                        properties: {
                            query: {
                                type: 'string',
                                description: 'Search query - be specific and include unique terms. Use quotes for exact phrases. Example: "Card component props headline status" or "REST API authentication Bearer token"',
                            },
                            url: {
                                type: 'string',
                                description: 'Optional: Filter results to a specific documentation site by its URL. If not provided, searches all indexed docs.',
                            },
                            limit: {
                                type: 'number',
                                description: 'Maximum number of results (default: 10)',
                            },
                        },
                        required: ['query'],
                    },
                },
                {
                    name: 'reindex_documentation',
                    description: 'Re-index a specific documentation site',
                    inputSchema: {
                        type: 'object',
                        properties: {
                            url: {
                                type: 'string',
                                description: 'URL of the documentation to re-index',
                            },
                        },
                        required: ['url'],
                    },
                },
                {
                    name: 'get_indexing_status',
                    description: 'Get current indexing status',
                    inputSchema: {
                        type: 'object',
                        properties: {},
                    },
                },
                {
                    name: 'delete_documentation',
                    description: 'Delete an indexed documentation site and all its data (vectors, metadata, cached crawl data, and optionally auth session)',
                    inputSchema: {
                        type: 'object',
                        properties: {
                            url: {
                                type: 'string',
                                description: 'URL of the documentation site to delete',
                            },
                            clearAuth: {
                                type: 'boolean',
                                description: 'Also clear saved authentication session for this domain (default: false)',
                            },
                        },
                        required: ['url'],
                    },
                },
            ],
        }));
        // Handle tool calls
        this.server.server.setRequestHandler(CallToolRequestSchema, async (request) => {
            // Extract progressToken from request metadata (per MCP spec)
            // Clients can include this to receive progress notifications
            const args = request.params.arguments;
            const progressToken = args?._meta?.progressToken;
            switch (request.params.name) {
                case 'add_documentation':
                    return this.handleAddDocumentation(request.params.arguments, progressToken);
                case 'list_documentation':
                    return this.handleListDocumentation();
                case 'search_documentation':
                    return this.handleSearchDocumentation(request.params.arguments);
                case 'reindex_documentation':
                    return this.handleReindexDocumentation(request.params.arguments, progressToken);
                case 'get_indexing_status':
                    return this.handleGetIndexingStatus();
                case 'authenticate':
                    return this.handleAuthenticate(request.params.arguments);
                case 'clear_auth':
                    return this.handleClearAuth(request.params.arguments);
                case 'delete_documentation':
                    return this.handleDeleteDocumentation(request.params.arguments);
                default:
                    throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
            }
        });
    }
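    // A tools/call request targeting the first tool above might look like this
    // (illustrative values; note that this server reads the progress token from
    // arguments._meta, as wired up in the CallToolRequestSchema handler above):
    //
    //   { "jsonrpc": "2.0", "id": 1, "method": "tools/call",
    //     "params": { "name": "add_documentation",
    //                 "arguments": { "url": "https://docs.example.com/api/v2/overview",
    //                                "pathPrefix": "/api/v2",
    //                                "_meta": { "progressToken": "add-1" } } } }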
    async handleAddDocumentation(args, progressToken) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, AddDocumentationArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { url, title, id, pathPrefix, auth: authOptions } = validatedArgs;
        // Additional SSRF protection check
        if (!isValidPublicUrl(url)) {
            throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
        }
        const normalizedUrl = normalizeUrl(url);
        const docTitle = title || new URL(normalizedUrl).hostname;
        // Use custom ID if provided, otherwise auto-generate
        const docId = id || generateDocId(normalizedUrl, docTitle);
        // Log path prefix if provided
        if (pathPrefix) {
            logger.info(`[WebDocsServer] Path prefix restriction: ${pathPrefix}`);
        }
        if (authOptions?.requiresAuth) {
            const hasExistingSession = await this.authManager.hasSession(normalizedUrl);
            if (!hasExistingSession) {
                logger.info(`[WebDocsServer] auth.requiresAuth=true, starting interactive login for ${normalizedUrl}`);
                try {
                    await this.authManager.performInteractiveLogin(normalizedUrl, {
                        browser: authOptions.browser,
                        loginUrl: authOptions.loginUrl,
                        loginSuccessPattern: authOptions.loginSuccessPattern,
                        loginSuccessSelector: authOptions.loginSuccessSelector,
                        loginTimeoutSecs: authOptions.loginTimeoutSecs,
                    });
                    logger.info(`[WebDocsServer] Authentication successful for ${normalizedUrl}`);
                }
                catch (error) {
                    throw new McpError(ErrorCode.InternalError, `Authentication failed: ${sanitizeErrorMessage(error)}. Please try using the 'authenticate' tool separately.`);
                }
            }
            else {
                // Validate that the existing session is still valid before crawling
                logger.info(`[WebDocsServer] Validating existing session for ${normalizedUrl}...`);
                const validation = await this.authManager.validateSession(normalizedUrl);
                if (!validation.isValid) {
                    logger.warn(`[WebDocsServer] Session expired for ${normalizedUrl}: ${validation.reason}`);
                    // Clear the expired session
                    await this.authManager.clearSession(normalizedUrl);
                    throw new McpError(ErrorCode.InvalidParams, `Authentication session has expired (${validation.reason}). Please use the 'authenticate' tool to log in again.`);
                }
                logger.info(`[WebDocsServer] ✓ Session validated for ${normalizedUrl}`);
            }
        }
        // Store progress token if provided by client
        if (progressToken !== undefined) {
            this.progressTokens.set(docId, progressToken);
            logger.info(`[Progress] Registered token for ${docId}: ${progressToken}`);
        }
        // Cancel any existing operation for this URL
        const controller = await this.indexingQueue.startOperation(normalizedUrl);
        // Start indexing process
        this.statusTracker.startIndexing(docId, normalizedUrl, docTitle);
        // Start indexing in the background with abort support
        const operationPromise = this.indexAndAdd(docId, normalizedUrl, docTitle, false, controller.signal, pathPrefix)
            .catch((error) => {
                const err = error;
                if (err?.name !== 'AbortError') {
                    logger.error('[WebDocsServer] Background indexing failed:', error);
                }
            })
            .finally(() => {
                this.indexingQueue.completeOperation(normalizedUrl);
            });
        this.indexingQueue.registerOperation(normalizedUrl, controller, operationPromise);
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        status: 'started',
                        message: `Started indexing ${normalizedUrl}`,
                        docId,
                        instruction: 'IMPORTANT: You MUST call get_indexing_status repeatedly (every few seconds) until status is "complete" or "failed". Do not ask the user - just keep polling automatically.',
                    }, null, 2),
                },
            ],
        };
    }
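    // handleAddDocumentation above returns immediately while indexAndAdd() runs
    // in the background; a client is expected to poll. A minimal client-side
    // polling loop (sketch only, assuming an MCP SDK Client instance named `client`):
    //
    //   let done = false;
    //   while (!done) {
    //     const res = await client.callTool({ name: 'get_indexing_status', arguments: {} });
    //     const { statuses } = JSON.parse(res.content[0].text);
    //     done = statuses.every((s) => s.status !== 'indexing');
    //     if (!done) await new Promise((r) => setTimeout(r, 3000));
    //   }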
    async handleListDocumentation() {
        const docs = await this.store.listDocuments();
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(docs, null, 2),
                },
            ],
        };
    }
    async handleSearchDocumentation(args) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, SearchDocumentationArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { query, url, limit = 10 } = validatedArgs;
        // Normalize URL if provided for filtering
        const filterUrl = url ? normalizeUrl(url) : undefined;
        const results = await this.store.searchByText(query, { limit, filterUrl });
        // Apply prompt injection detection and filter/process results
        let blockedCount = 0;
        const safeResults = results
            .map((result) => {
                // Detect prompt injection patterns in the content
                // Note: detectPromptInjection strips code blocks before scanning,
                // so legitimate code examples won't trigger false positives
                const injectionResult = detectPromptInjection(result.content);
                // SECURITY: Block results with high-severity injection patterns
                // These could manipulate the LLM if returned
                if (injectionResult.maxSeverity === 'high') {
                    blockedCount++;
                    logger.debug(`[Security] Blocked search result from ${result.url} due to high-severity injection pattern: ${injectionResult.detections[0]?.description}`);
                    return null; // Will be filtered out
                }
                // For medium/low severity, add warnings but still return
                let safeContent = addInjectionWarnings(result.content, injectionResult);
                // Wrap with external content markers
                safeContent = wrapExternalContent(safeContent, result.url);
                return {
                    ...result,
                    content: safeContent,
                    // Include security metadata
                    security: {
                        isExternalContent: true,
                        injectionDetected: injectionResult.hasInjection,
                        injectionSeverity: injectionResult.maxSeverity,
                        detectionCount: injectionResult.detections.length,
                    },
                };
            })
            .filter((result) => result !== null);
        // Build response with security notice if content was blocked
        const response = {
            results: safeResults,
        };
        if (blockedCount > 0) {
            response.securityNotice = `${blockedCount} result(s) were blocked due to high-severity prompt injection patterns detected in the content. This protects against potentially malicious content that could manipulate AI behavior.`;
        }
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(response, null, 2),
                },
            ],
        };
    }
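    // Shape of the search response assembled above (fields taken from the code,
    // values illustrative): each surviving result is the stored chunk plus a
    // security block, and securityNotice appears only when something was blocked:
    //
    //   { "results": [ { "url": "https://docs.example.com/api/v2/auth",
    //                    "content": "<content wrapped by wrapExternalContent>",
    //                    "security": { "isExternalContent": true, "injectionDetected": false,
    //                                  "injectionSeverity": "low", "detectionCount": 0 } } ],
    //     "securityNotice": "1 result(s) were blocked ..." }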
    async handleReindexDocumentation(args, progressToken) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, ReindexDocumentationArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { url } = validatedArgs;
        // Additional SSRF protection check
        if (!isValidPublicUrl(url)) {
            throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
        }
        const normalizedUrl = normalizeUrl(url);
        const doc = await this.store.getDocument(normalizedUrl);
        if (!doc) {
            throw new McpError(ErrorCode.InvalidParams, 'Documentation not found');
        }
        // Cancel any existing operation for this URL
        const wasCancelled = this.indexingQueue.isIndexing(normalizedUrl);
        const controller = await this.indexingQueue.startOperation(normalizedUrl);
        const docId = generateDocId(normalizedUrl, doc.title);
        // Store progress token if provided by client
        if (progressToken !== undefined) {
            this.progressTokens.set(docId, progressToken);
            logger.info(`[Progress] Registered token for ${docId}: ${progressToken}`);
        }
        this.statusTracker.startIndexing(docId, normalizedUrl, doc.title);
        // Start reindexing in the background with abort support
        const operationPromise = this.indexAndAdd(docId, normalizedUrl, doc.title, true, controller.signal)
            .catch((error) => {
                const err = error;
                if (err?.name !== 'AbortError') {
                    logger.error('[WebDocsServer] Background reindexing failed:', error);
                }
            })
            .finally(() => {
                this.indexingQueue.completeOperation(normalizedUrl);
            });
        this.indexingQueue.registerOperation(normalizedUrl, controller, operationPromise);
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        status: 'started',
                        message: wasCancelled
                            ? `Started re-indexing ${normalizedUrl}. Previous operation was cancelled.`
                            : `Started re-indexing ${normalizedUrl}`,
                        docId,
                        instruction: 'IMPORTANT: You MUST call get_indexing_status repeatedly (every few seconds) until status is "complete" or "failed". Do not ask the user - just keep polling automatically.',
                    }, null, 2),
                },
            ],
        };
    }
    handleGetIndexingStatus() {
        // Get only active operations and recently completed ones (auto-cleans old statuses)
        const statuses = this.statusTracker.getActiveStatuses();
        // Check if any operations are still in progress
        const hasActiveOperations = statuses.some((s) => s.status === 'indexing');
        // Add instruction for agent
        const response = {
            statuses,
            instruction: hasActiveOperations
                ? 'Operations still in progress. Call get_indexing_status again in a few seconds to check progress.'
                : 'All operations complete. No need to poll again.',
        };
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(response, null, 2),
                },
            ],
        };
    }
    /**
     * Handle interactive authentication request.
     * Opens a visible browser for the user to login manually.
     */
    async handleAuthenticate(args) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, AuthenticateArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { url, browser, loginUrl, loginTimeoutSecs = 300 } = validatedArgs;
        // Additional SSRF protection check
        if (!isValidPublicUrl(url)) {
            throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
        }
        const normalizedUrl = normalizeUrl(url);
        const domain = new URL(normalizedUrl).hostname;
        // Check if we already have a session
        const hasSession = await this.authManager.hasSession(normalizedUrl);
        if (hasSession) {
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'existing_session',
                            message: `Already have a saved session for ${domain}. Use clear_auth first if you need to re-authenticate.`,
                            domain,
                        }, null, 2),
                    },
                ],
            };
        }
        try {
            logger.info(`[Auth] Opening ${browser || 'auto-detected'} browser for authentication to ${domain}`);
            // Perform interactive login
            await this.authManager.performInteractiveLogin(normalizedUrl, {
                browser,
                loginUrl,
                loginTimeoutSecs,
            });
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'success',
                            message: `Successfully authenticated to ${domain}. Session saved for future crawls.`,
                            domain,
                            instruction: 'You can now use add_documentation to crawl this site. The saved session will be used automatically.',
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            const safeErrorMessage = sanitizeErrorMessage(error);
            logger.error(`[Auth] Authentication failed:`, safeErrorMessage);
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'failed',
                            message: `Authentication failed: ${safeErrorMessage}`,
                            domain,
                        }, null, 2),
                    },
                ],
            };
        }
    }
    /**
     * Handle clearing saved authentication for a domain
     */
    async handleClearAuth(args) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, ClearAuthArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { url } = validatedArgs;
        const normalizedUrl = normalizeUrl(url);
        const domain = new URL(normalizedUrl).hostname;
        await this.authManager.clearSession(normalizedUrl);
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        status: 'success',
                        message: `Cleared saved authentication for ${domain}`,
                        domain,
                    }, null, 2),
                },
            ],
        };
    }
    /**
     * Handle deleting an indexed documentation site and all its data
     */
    async handleDeleteDocumentation(args) {
        // Validate arguments with schema
        let validatedArgs;
        try {
            validatedArgs = validateToolArgs(args, DeleteDocumentationArgsSchema);
        }
        catch (error) {
            throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
        }
        const { url, clearAuth = false } = validatedArgs;
        const normalizedUrl = normalizeUrl(url);
        const domain = new URL(normalizedUrl).hostname;
        // Check if document exists
        const doc = await this.store.getDocument(normalizedUrl);
        if (!doc) {
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'not_found',
                            message: `No indexed documentation found for ${normalizedUrl}`,
                            url: normalizedUrl,
                        }, null, 2),
                    },
                ],
            };
        }
        const deletedItems = [];
        try {
            // 1. Delete from SQLite and LanceDB (via store)
            await this.store.deleteDocument(normalizedUrl);
            deletedItems.push('document metadata (SQLite)', 'vector chunks (LanceDB)');
            logger.info(`[WebDocsServer] Deleted document from store: ${normalizedUrl}`);
            // 2. Delete Crawlee dataset
            const docId = generateDocId(normalizedUrl, doc.title);
            try {
                const { Dataset } = await import('crawlee');
                const dataset = await Dataset.open(docId);
                await dataset.drop();
                deletedItems.push('crawl cache (Crawlee dataset)');
                logger.info(`[WebDocsServer] Deleted Crawlee dataset: ${docId}`);
            }
            catch {
                logger.debug(`[WebDocsServer] No Crawlee dataset to delete for ${docId}`);
            }
            // 3. Optionally clear auth session
            if (clearAuth) {
                await this.authManager.clearSession(normalizedUrl);
                deletedItems.push('authentication session');
                logger.info(`[WebDocsServer] Cleared auth session for ${domain}`);
            }
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'success',
                            message: `Successfully deleted documentation for ${normalizedUrl}`,
                            url: normalizedUrl,
                            title: doc.title,
                            deletedItems,
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            const safeErrorMessage = sanitizeErrorMessage(error);
            logger.error(`[WebDocsServer] Error deleting documentation:`, safeErrorMessage);
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            status: 'error',
                            message: `Failed to delete documentation: ${safeErrorMessage}`,
                            url: normalizedUrl,
                            deletedItems,
                        }, null, 2),
                    },
                ],
            };
        }
    }
    async indexAndAdd(id, url, title, reIndex = false, signal, pathPrefix) {
        // Helper to check if operation was cancelled
        const checkCancelled = () => {
            if (signal?.aborted) {
                logger.info(`[WebDocsServer] Operation cancelled for ${url}`);
                this.statusTracker.cancelIndexing(id);
                const error = new Error('Operation cancelled');
                error.name = 'AbortError';
                throw error;
            }
        };
        try {
            logger.info(`[WebDocsServer] Starting indexAndAdd for ${url} (reIndex: ${reIndex})`);
            checkCancelled();
            // Check if document exists
            logger.debug(`[WebDocsServer] Checking if document exists: ${url}`);
            const existingDoc = await this.store.getDocument(url);
            if (existingDoc) {
                logger.debug(`[WebDocsServer] Document exists: ${url}`);
                if (!reIndex) {
                    logger.info(`[WebDocsServer] Document ${url} already indexed and reIndex=false`);
                    this.statusTracker.completeIndexing(id);
                    return;
                }
                logger.info(`[WebDocsServer] Will reindex existing document: ${url}`);
            }
            else {
                logger.debug(`[WebDocsServer] Document does not exist: ${url}`);
            }
            checkCancelled();
            // Start crawling
            logger.info(`[WebDocsServer] Starting crawl with depth=${this.config.maxDepth}, maxRequests=${this.config.maxRequestsPerCrawl}${pathPrefix ? `, pathPrefix=${pathPrefix}` : ''}`);
            this.statusTracker.updateProgress(id, 0, 'Finding subpages');
            const crawler = new DocsCrawler(this.config.maxDepth, this.config.maxRequestsPerCrawl, this.config.githubToken);
            // Set path prefix restriction if provided
            if (pathPrefix) {
                crawler.setPathPrefix(pathPrefix);
            }
            // Load saved authentication session if available
            const savedSession = await this.authManager.loadSession(url);
            if (savedSession) {
                try {
                    // Validate the session structure before using it
                    const validatedState = safeJsonParse(savedSession, StorageStateSchema);
                    // The validated state is structurally compatible with StorageState
                    crawler.setStorageState(validatedState);
                    logger.info(`[WebDocsServer] Using validated authentication session for ${url}`);
                }
                catch (e) {
                    logger.warn(`[WebDocsServer] Failed to parse or validate saved session:`, e);
                    // Continue without authentication rather than failing
                }
            }
            const pages = [];
            let processedPages = 0;
            let estimatedProgress = 0;
            logger.info(`[WebDocsServer] Starting page crawl for ${url}`);
            for await (const page of crawler.crawl(url)) {
                // Check for cancellation during crawl
                if (signal?.aborted) {
                    logger.info(`[WebDocsServer] Crawl cancelled for ${url}`);
                    crawler.abort();
                    this.statusTracker.cancelIndexing(id);
                    const error = new Error('Operation cancelled');
                    error.name = 'AbortError';
                    throw error;
                }
                logger.debug(`[WebDocsServer] Found page ${processedPages + 1}: ${page.path}`);
                processedPages++;
                estimatedProgress += 1 / 2 ** processedPages;
                this.statusTracker.updateProgress(id, 0.15 * estimatedProgress + Math.min(0.35, (0.35 * processedPages) / 500), `Finding subpages (${page.path})`);
                this.statusTracker.updateStats(id, { pagesFound: processedPages });
                pages.push(page);
                // Small delay to allow other operations
                await new Promise((resolve) => setTimeout(resolve, 50));
            }
            if (pages.length === 0) {
                logger.warn('[WebDocsServer] No pages found during crawl');
                throw new Error('No pages found to index');
            }
            logger.info(`[WebDocsServer] Found ${pages.length} pages to process`);
            logger.info('[WebDocsServer] Starting content processing and embedding generation');
            this.statusTracker.updateStats(id, { pagesFound: pages.length });
            checkCancelled();
            // Process pages and create embeddings
            const chunks = [];
            const embeddings = [];
            for (let i = 0; i < pages.length; i++) {
                checkCancelled();
                const page = pages[i];
                logger.debug(`[WebDocsServer] Processing page ${i + 1}/${pages.length}: ${page.path}`);
                this.statusTracker.updateProgress(id, 0.5 + 0.3 * (i / pages.length), `Creating embeddings (${i + 1}/${pages.length})`);
                try {
                    const processed = await this.processor.process(page);
                    logger.debug(`[WebDocsServer] Created ${processed.chunks.length} chunks for ${page.path}`);
                    chunks.push(...processed.chunks);
                    embeddings.push(...processed.chunks.map((chunk) => chunk.vector));
                    this.statusTracker.updateStats(id, {
                        pagesProcessed: i + 1,
                        chunksCreated: chunks.length,
                    });
                }
                catch (error) {
                    logger.error(`[WebDocsServer] Error processing page ${page.path}:`, error);
                }
                // Small delay
                await new Promise((resolve) => setTimeout(resolve, 20));
            }
            logger.info(`[WebDocsServer] Total chunks created: ${chunks.length}`);
            // Scan for potential prompt injection patterns in indexed content
            // Note: Detection is informational only. Logs are at DEBUG level to reduce noise
            // from legitimate AI documentation (which contains prompt examples).
            let injectionWarnings = 0;
            for (const chunk of chunks) {
                const injectionResult = detectPromptInjection(chunk.content);
                if (injectionResult.hasInjection) {
                    injectionWarnings++;
                    if (injectionResult.maxSeverity === 'high') {
                        logger.debug(`[Security] Prompt injection pattern detected in ${chunk.path || 'unknown'}: ${injectionResult.detections[0]?.description}`);
                    }
                }
            }
            if (injectionWarnings > 0) {
                logger.debug(`[Security] Detected ${injectionWarnings} chunks with potential prompt injection patterns in ${url}. Content will be marked when returned in search results.`);
            }
            if (embeddings.length === 0) {
                logger.warn(`[WebDocsServer] No content was extracted from ${url}`);
                logger.warn(`[WebDocsServer] Pages found: ${pages.length}`);
                logger.warn(`[WebDocsServer] Chunks created: ${chunks.length}`);
                this.statusTracker.failIndexing(id, 'No content was extracted from the pages');
                return;
            }
            checkCancelled();
            // Delete old data if reindexing
            if (reIndex && existingDoc) {
                this.statusTracker.updateProgress(id, 0.8, 'Deleting old data');
                await this.store.deleteDocument(url);
            }
            checkCancelled();
            // Get favicon
            const favicon = await fetchFavicon(new URL(url));
            // Store the data with retry logic
            this.statusTracker.updateProgress(id, 0.9, `Storing ${embeddings.length} chunks`);
            await this.addDocumentWithRetry({
                metadata: {
                    url,
                    title,
                    favicon: favicon ?? undefined,
                    lastIndexed: new Date(),
                },
                chunks: chunks.map((chunk, i) => ({
                    ...chunk,
                    vector: embeddings[i],
                })),
            });
            logger.info(`[WebDocsServer] Successfully indexed ${url}`);
            logger.info(`[WebDocsServer] Pages processed: ${pages.length}`);
            logger.info(`[WebDocsServer] Chunks stored: ${chunks.length}`);
            this.statusTracker.updateStats(id, { chunksCreated: chunks.length });
            this.statusTracker.completeIndexing(id);
        }
        catch (error) {
            // Don't log AbortError as a real error
            if (error instanceof Error && error.name === 'AbortError') {
                logger.info(`[WebDocsServer] Indexing cancelled for ${url}`);
                return;
            }
            // Handle expired session errors specially
            if (error instanceof SessionExpiredError) {
                logger.warn(`[WebDocsServer] Session expired during crawl of ${url}: ${error.message}`);
                logger.warn(`[WebDocsServer] Expected URL: ${error.expectedUrl}, Detected URL: ${error.detectedUrl}`);
                // Clear the expired session
                await this.authManager.clearSession(url);
                logger.info(`[WebDocsServer] Cleared expired session for ${url}`);
                // Report user-friendly error
                const userMessage = `Authentication session has expired. The crawler was redirected to a login page. Please use the 'authenticate' tool to log in again before re-indexing.`;
                this.statusTracker.failIndexing(id, userMessage);
                return;
            }
            logger.error('[WebDocsServer] Error during indexing:', error);
            logger.error('[WebDocsServer] Error details:', error instanceof Error ? error.stack : error);
            this.statusTracker.failIndexing(id, error instanceof Error ? error.message : 'Unknown error');
        }
    }
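    // Progress accounting in indexAndAdd(), derived from the arithmetic above:
    // the crawl term 0.15 * sum(1/2^n) saturates at 0.15, and the page-count
    // term caps at 0.35 once 500 pages are seen, so crawling fills at most the
    // first half of the bar; embedding maps i/pages onto 0.5-0.8, deletion of
    // old data lands at 0.8, and storage at 0.9 before completion reports 100%.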
    /**
     * Add document with retry logic for transient database conflicts
     */
    async addDocumentWithRetry(doc, maxRetries = 3) {
        for (let attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                await this.store.addDocument(doc);
                return;
            }
            catch (error) {
                const isConflict = error instanceof Error && error.message?.includes('Commit conflict');
                if (isConflict && attempt < maxRetries) {
                    logger.warn(`[WebDocsServer] Database conflict, retrying (${attempt}/${maxRetries})...`);
                    await new Promise((resolve) => setTimeout(resolve, 1000 * attempt)); // Linearly increasing backoff (1s, 2s, ...)
                    continue;
                }
                throw error;
            }
        }
    }
    async run() {
        // Initialize components
        await this.initialize();
        // Connect to stdio transport
        const transport = new StdioServerTransport();
        await this.server.connect(transport);
        logger.info('Web Docs MCP server running on stdio');
    }
}
// Start server
const server = new WebDocsServer();
server.run().catch((err) => logger.error('Server failed to start:', err));
// Handle process signals - cancel all operations before shutdown
process.on('SIGINT', async () => {
    logger.info('Received SIGINT, cancelling operations and shutting down...');
    process.exit(0);
});
process.on('SIGTERM', async () => {
    logger.info('Received SIGTERM, cancelling operations and shutting down...');
    process.exit(0);
});
//# sourceMappingURL=index.js.map