visus-mcp 0.3.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +22 -0
- package/LINKEDIN-STRATEGY.md +367 -0
- package/README.md +491 -16
- package/ROADMAP.md +167 -30
- package/SECURITY-AUDIT-v1.md +277 -0
- package/STATUS.md +801 -42
- package/TROUBLESHOOT-AUTH-20260322-2019.md +291 -0
- package/TROUBLESHOOT-JEST-20260323-1357.md +139 -0
- package/TROUBLESHOOT-LAMBDA-20260322-1945.md +183 -0
- package/VISUS-CLAUDE-CODE-PROMPT.md +1 -1
- package/VISUS-PROJECT-PLAN.md +7 -0
- package/dist/browser/playwright-renderer.d.ts.map +1 -1
- package/dist/browser/playwright-renderer.js +7 -0
- package/dist/browser/playwright-renderer.js.map +1 -1
- package/dist/browser/reader.d.ts +31 -0
- package/dist/browser/reader.d.ts.map +1 -0
- package/dist/browser/reader.js +98 -0
- package/dist/browser/reader.js.map +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -5
- package/dist/index.js.map +1 -1
- package/dist/lambda-handler.d.ts +0 -6
- package/dist/lambda-handler.d.ts.map +1 -1
- package/dist/lambda-handler.js +97 -25
- package/dist/lambda-handler.js.map +1 -1
- package/dist/sanitizer/framework-mapper.d.ts +22 -0
- package/dist/sanitizer/framework-mapper.d.ts.map +1 -0
- package/dist/sanitizer/framework-mapper.js +296 -0
- package/dist/sanitizer/framework-mapper.js.map +1 -0
- package/dist/sanitizer/index.d.ts +2 -0
- package/dist/sanitizer/index.d.ts.map +1 -1
- package/dist/sanitizer/index.js +14 -1
- package/dist/sanitizer/index.js.map +1 -1
- package/dist/sanitizer/patterns.js +1 -1
- package/dist/sanitizer/patterns.js.map +1 -1
- package/dist/sanitizer/severity-classifier.d.ts +33 -0
- package/dist/sanitizer/severity-classifier.d.ts.map +1 -0
- package/dist/sanitizer/severity-classifier.js +113 -0
- package/dist/sanitizer/severity-classifier.js.map +1 -0
- package/dist/sanitizer/threat-reporter.d.ts +65 -0
- package/dist/sanitizer/threat-reporter.d.ts.map +1 -0
- package/dist/sanitizer/threat-reporter.js +160 -0
- package/dist/sanitizer/threat-reporter.js.map +1 -0
- package/dist/tools/fetch-structured.d.ts +5 -0
- package/dist/tools/fetch-structured.d.ts.map +1 -1
- package/dist/tools/fetch-structured.js +54 -6
- package/dist/tools/fetch-structured.js.map +1 -1
- package/dist/tools/fetch.d.ts +5 -0
- package/dist/tools/fetch.d.ts.map +1 -1
- package/dist/tools/fetch.js +42 -9
- package/dist/tools/fetch.js.map +1 -1
- package/dist/tools/read.d.ts +51 -0
- package/dist/tools/read.d.ts.map +1 -0
- package/dist/tools/read.js +127 -0
- package/dist/tools/read.js.map +1 -0
- package/dist/tools/search.d.ts +45 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +220 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +64 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/dist/utils/format-converter.d.ts +39 -0
- package/dist/utils/format-converter.d.ts.map +1 -0
- package/dist/utils/format-converter.js +191 -0
- package/dist/utils/format-converter.js.map +1 -0
- package/dist/utils/truncate.d.ts +26 -0
- package/dist/utils/truncate.d.ts.map +1 -0
- package/dist/utils/truncate.js +54 -0
- package/dist/utils/truncate.js.map +1 -0
- package/infrastructure/stack.ts +55 -6
- package/jest.config.js +3 -0
- package/package.json +9 -2
- package/src/browser/playwright-renderer.ts +8 -0
- package/src/browser/reader.ts +129 -0
- package/src/index.ts +49 -5
- package/src/lambda-handler.ts +131 -26
- package/src/sanitizer/framework-mapper.ts +347 -0
- package/src/sanitizer/index.ts +18 -1
- package/src/sanitizer/patterns.ts +1 -1
- package/src/sanitizer/severity-classifier.ts +132 -0
- package/src/sanitizer/threat-reporter.ts +261 -0
- package/src/tools/fetch-structured.ts +58 -6
- package/src/tools/fetch.ts +44 -9
- package/src/tools/read.ts +143 -0
- package/src/tools/search.ts +263 -0
- package/src/types.ts +69 -0
- package/src/utils/format-converter.ts +236 -0
- package/src/utils/truncate.ts +64 -0
- package/tests/auth-smoke.test.ts +480 -0
- package/tests/fetch-tool.test.ts +595 -2
- package/tests/reader.test.ts +353 -0
- package/tests/sanitizer.test.ts +52 -0
- package/tests/search.test.ts +456 -0
- package/tests/threat-reporter.test.ts +266 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Visus Search Tool - Safe Web Search
|
|
3
|
+
*
|
|
4
|
+
* Queries DuckDuckGo's Instant Answer API and sanitizes all results
|
|
5
|
+
* before returning them to the LLM.
|
|
6
|
+
*
|
|
7
|
+
* SECURITY: Every search result snippet and title passes through the
|
|
8
|
+
* sanitization pipeline. This prevents prompt injection via search results.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { sanitize } from '../sanitizer/index.js';
|
|
12
|
+
import { generateThreatReport } from '../sanitizer/threat-reporter.js';
|
|
13
|
+
import type { VisusSearchInput, VisusSearchOutput, Result } from '../types.js';
|
|
14
|
+
import { Ok, Err } from '../types.js';
|
|
15
|
+
|
|
16
|
+
/**
 * One entry of the `RelatedTopics` array in a DuckDuckGo Instant Answer
 * payload. Both fields are optional; only entries carrying both are
 * turned into search results by this module.
 */
interface DuckDuckGoRelatedTopic {
  /** Human-readable text for the topic (used as snippet and title source). */
  Text?: string;
  /** URL associated with the topic. */
  FirstURL?: string;
}

/**
 * The subset of the DuckDuckGo Instant Answer response that this module reads.
 */
interface DuckDuckGoResponse {
  /** Abstract text for the query, if any. */
  AbstractText?: string;
  /** URL paired with `AbstractText`. */
  AbstractURL?: string;
  /** Either direct topics or groups that nest further topics under `Topics`. */
  RelatedTopics?: (DuckDuckGoRelatedTopic | { Topics: DuckDuckGoRelatedTopic[] })[];
}
|
|
29
|
+
|
|
30
|
+
/** Result count used when `max_results` is omitted or is not a finite number. */
const SEARCH_DEFAULT_MAX_RESULTS = 5;

/** Upper bound on results per call, regardless of what the caller requests. */
const SEARCH_MAX_RESULTS_CAP = 10;

/** The DuckDuckGo request (headers and body) is aborted after this many milliseconds. */
const SEARCH_TIMEOUT_MS = 8000;

/** Unsanitized result as extracted from the DuckDuckGo payload. */
interface RawSearchResult {
  title: string;
  url: string;
  snippet: string;
}

/** Outcome of the HTTP call: either a parsed body or the failing HTTP status. */
type DuckDuckGoFetchOutcome =
  | { ok: true; data: DuckDuckGoResponse }
  | { ok: false; status: number };

/** Normalize `max_results` to an integer in [1, SEARCH_MAX_RESULTS_CAP]. */
function clampSearchMaxResults(requested: number | undefined): number {
  // Number() keeps the original runtime coercion for untyped callers (e.g. "3").
  const parsed = Number(requested ?? SEARCH_DEFAULT_MAX_RESULTS);
  if (!Number.isFinite(parsed)) {
    return SEARCH_DEFAULT_MAX_RESULTS;
  }
  return Math.min(Math.max(Math.trunc(parsed), 1), SEARCH_MAX_RESULTS_CAP);
}

/** Build the zero-result output shared by every "nothing to return" path. */
function emptySearchOutput(query: string, message: string): VisusSearchOutput {
  return {
    query,
    result_count: 0,
    sanitized: true,
    results: [],
    total_injections_removed: 0,
    message
  };
}

/** Convert one topic-shaped value into a raw result, or undefined if it lacks text or URL. */
function toRawSearchResult(topic: unknown): RawSearchResult | undefined {
  if (topic === null || typeof topic !== 'object') {
    return undefined;
  }
  const { Text, FirstURL } = topic as DuckDuckGoRelatedTopic;
  if (typeof Text !== 'string' || typeof FirstURL !== 'string' || !Text || !FirstURL) {
    return undefined;
  }
  return { title: extractTitle(Text), url: FirstURL, snippet: Text };
}

/** Pull the abstract and related topics out of the payload, limited to `maxResults`. */
function collectRawSearchResults(data: DuckDuckGoResponse, maxResults: number): RawSearchResult[] {
  if (data === null || typeof data !== 'object') {
    return [];
  }

  const collected: RawSearchResult[] = [];

  // The abstract, when present with a URL, is always the first result.
  const abstractResult = toRawSearchResult({ Text: data.AbstractText, FirstURL: data.AbstractURL });
  if (abstractResult) {
    collected.push(abstractResult);
  }

  const topics: unknown[] = Array.isArray(data.RelatedTopics) ? data.RelatedTopics : [];
  for (const topic of topics) {
    const nested =
      topic !== null && typeof topic === 'object' && 'Topics' in topic
        ? (topic as { Topics: unknown }).Topics
        : undefined;

    if (Array.isArray(nested)) {
      // Topic group: flatten its nested topics.
      for (const nestedTopic of nested) {
        const nestedResult = toRawSearchResult(nestedTopic);
        if (nestedResult) {
          collected.push(nestedResult);
        }
      }
    } else {
      const directResult = toRawSearchResult(topic);
      if (directResult) {
        collected.push(directResult);
      }
    }

    // Stop scanning once enough candidates exist; the slice below enforces the exact limit.
    if (collected.length >= maxResults) {
      break;
    }
  }

  return collected
    .filter(r => r.url.trim().length > 0)
    .slice(0, maxResults);
}

/** Perform the HTTP request and JSON decode under a single abort timer. */
async function fetchDuckDuckGoAnswer(apiUrl: string): Promise<DuckDuckGoFetchOutcome> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), SEARCH_TIMEOUT_MS);

  try {
    const response = await fetch(apiUrl, {
      signal: controller.signal,
      headers: {
        // NOTE(review): the version here is hard-coded; confirm it is kept in sync with package.json.
        'User-Agent': 'visus-mcp/0.3.0 (https://github.com/lateos/visus-mcp)'
      }
    });

    if (!response.ok) {
      return { ok: false, status: response.status };
    }

    // Decode inside the timed section so a stalled body read is aborted too.
    const data = (await response.json()) as DuckDuckGoResponse;
    return { ok: true, data };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Search the web via DuckDuckGo's Instant Answer API and return sanitized results.
 *
 * Each result's title and snippet is passed through `sanitize` independently.
 * Per-result `injections_removed` / `pii_redacted` are the number of entries in
 * the sanitizer's `patterns_detected` / `pii_types_redacted` lists for title
 * plus snippet. A threat report is generated from the union of detected
 * patterns and attached only when `generateThreatReport` returns a truthy value.
 *
 * NOTE(review): result URLs are returned exactly as received; only title and
 * snippet go through `sanitize`.
 *
 * @param input Search query and options. `max_results` defaults to 5 and is
 *   clamped to an integer between 1 and 10.
 * @returns `Err` when `query` is missing, not a string, or blank. Every other
 *   outcome, including HTTP errors, timeouts and network failures, is an `Ok`
 *   whose `results` is empty and whose `message` explains why.
 */
export async function visusSearch(input: VisusSearchInput): Promise<Result<VisusSearchOutput, Error>> {
  if (!input.query || typeof input.query !== 'string' || input.query.trim().length === 0) {
    return Err(new Error('query must be a non-empty string'));
  }

  const maxResults = clampSearchMaxResults(input.max_results);

  try {
    const query = encodeURIComponent(input.query.trim());
    const apiUrl = `https://api.duckduckgo.com/?q=${query}&format=json&no_redirect=1&no_html=1`;

    const answer = await fetchDuckDuckGoAnswer(apiUrl);
    if (!answer.ok) {
      return Ok(emptySearchOutput(input.query, `Search unavailable (HTTP ${answer.status})`));
    }

    const validResults = collectRawSearchResults(answer.data, maxResults);
    if (validResults.length === 0) {
      return Ok(emptySearchOutput(input.query, 'No results found'));
    }

    // Sanitize each result independently and aggregate the findings.
    const sanitizedResults: VisusSearchOutput['results'] = [];
    const allPatternsDetected = new Set<string>();
    let totalInjectionsRemoved = 0;
    let totalPIIRedacted = 0;

    for (const result of validResults) {
      const titleSanitization = sanitize(result.title);
      const snippetSanitization = sanitize(result.snippet);

      const injectionsRemoved =
        titleSanitization.sanitization.patterns_detected.length +
        snippetSanitization.sanitization.patterns_detected.length;

      const piiRedacted =
        titleSanitization.sanitization.pii_types_redacted.length +
        snippetSanitization.sanitization.pii_types_redacted.length;

      totalInjectionsRemoved += injectionsRemoved;
      totalPIIRedacted += piiRedacted;

      for (const pattern of titleSanitization.sanitization.patterns_detected) {
        allPatternsDetected.add(pattern);
      }
      for (const pattern of snippetSanitization.sanitization.patterns_detected) {
        allPatternsDetected.add(pattern);
      }

      sanitizedResults.push({
        title: titleSanitization.content,
        url: result.url,
        snippet: snippetSanitization.content,
        injections_removed: injectionsRemoved,
        pii_redacted: piiRedacted
      });
    }

    // One aggregated threat report for the whole result set.
    const threatReport = generateThreatReport({
      patterns_detected: Array.from(allPatternsDetected),
      pii_redacted: totalPIIRedacted,
      source_url: `DuckDuckGo Search: ${input.query}`
    });

    return Ok({
      query: input.query,
      result_count: sanitizedResults.length,
      sanitized: true,
      results: sanitizedResults,
      total_injections_removed: totalInjectionsRemoved,
      // Include threat_report only if findings exist
      ...(threatReport && { threat_report: threatReport })
    });
  } catch (error: unknown) {
    // An aborted request (our timer fired) is reported as a timeout.
    if (error instanceof Error && error.name === 'AbortError') {
      return Ok(emptySearchOutput(input.query, 'Search unavailable (timeout)'));
    }

    return Ok(
      emptySearchOutput(
        input.query,
        `Search unavailable: ${error instanceof Error ? error.message : String(error)}`
      )
    );
  }
}
|
|
215
|
+
|
|
216
|
+
/**
 * Derive a short title from a block of text.
 *
 * Preference order:
 *  1. the leading run of characters up to and including the first `.`, `!`
 *     or `?`, trimmed, when that is at most 80 characters;
 *  2. the whole text, trimmed, when the untrimmed text is at most 80 characters;
 *  3. the first 77 characters, trimmed, followed by `...`.
 *
 * @param text Source text.
 * @returns The derived title.
 */
function extractTitle(text: string): string {
  const limit = 80;

  const leadingSentence = /^[^.!?]+[.!?]/.exec(text)?.[0].trim();
  if (leadingSentence !== undefined && leadingSentence.length <= limit) {
    return leadingSentence;
  }

  return text.length <= limit
    ? text.trim()
    : `${text.slice(0, 77).trim()}...`;
}
|
|
236
|
+
|
|
237
|
+
/**
 * JSON Schema describing the arguments of the `visus_search` tool:
 * a required `query` string and an optional `max_results` number.
 */
const visusSearchInputSchema = {
  type: 'object',
  properties: {
    query: {
      type: 'string',
      description: 'Search query'
    },
    max_results: {
      type: 'number',
      description: 'Maximum number of results to return (default: 5, max: 10)',
      default: 5
    }
  },
  required: ['query']
};

/**
 * Tool definition for MCP registration of `visus_search`.
 *
 * NOTE(review): the behavior hints are declared at the top level of this
 * object; confirm that the registration code reads them from here.
 */
export const visusSearchToolDefinition = {
  name: 'visus_search',
  title: 'Search the Web (Sanitized)',
  description: 'Searches the web via DuckDuckGo and returns sanitized results with prompt injection and PII removed before reaching the LLM. Use before visus_fetch or visus_read to safely discover and then read pages.',
  inputSchema: visusSearchInputSchema,
  readOnlyHint: true,
  destructiveHint: false,
  idempotentHint: true,
  openWorldHint: true
};
|
package/src/types.ts
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
* Shared TypeScript interfaces for Visus MCP tool
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import type { ThreatReport } from './sanitizer/threat-reporter.js';
|
|
6
|
+
|
|
5
7
|
/**
|
|
6
8
|
* Input options for visus_fetch tool
|
|
7
9
|
*/
|
|
@@ -28,7 +30,12 @@ export interface VisusFetchOutput {
|
|
|
28
30
|
fetched_at: string;
|
|
29
31
|
content_length_original: number;
|
|
30
32
|
content_length_sanitized: number;
|
|
33
|
+
format_detected?: 'html' | 'json' | 'xml' | 'rss';
|
|
34
|
+
content_type?: string;
|
|
35
|
+
truncated?: boolean;
|
|
36
|
+
truncated_at_chars?: number;
|
|
31
37
|
};
|
|
38
|
+
threat_report?: ThreatReport;
|
|
32
39
|
}
|
|
33
40
|
|
|
34
41
|
/**
|
|
@@ -40,6 +47,14 @@ export interface VisusFetchStructuredInput {
|
|
|
40
47
|
timeout_ms?: number;
|
|
41
48
|
}
|
|
42
49
|
|
|
50
|
+
/**
 * Arguments accepted by the visus_read tool.
 */
export interface VisusReadInput {
  /** Address of the page to read. */
  url: string;
  /** Optional timeout; milliseconds, going by the field name. */
  timeout_ms?: number;
}
|
|
57
|
+
|
|
43
58
|
/**
|
|
44
59
|
* Output from visus_fetch_structured tool
|
|
45
60
|
*/
|
|
@@ -57,7 +72,60 @@ export interface VisusFetchStructuredOutput {
|
|
|
57
72
|
fetched_at: string;
|
|
58
73
|
content_length_original: number;
|
|
59
74
|
content_length_sanitized: number;
|
|
75
|
+
format_detected?: 'html' | 'json' | 'xml' | 'rss';
|
|
76
|
+
content_type?: string;
|
|
77
|
+
truncated?: boolean;
|
|
78
|
+
truncated_at_chars?: number;
|
|
60
79
|
};
|
|
80
|
+
threat_report?: ThreatReport;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/**
 * Result returned by the visus_read tool.
 */
export interface VisusReadOutput {
  /** URL the content belongs to. */
  url: string;
  /** Page content returned to the caller. */
  content: string;
  /** Descriptive and processing metadata for `content`. */
  metadata: {
    title: string;
    /** `null` when no author is available. */
    author: string | null;
    /** `null` when no publication date is available. */
    published: string | null;
    word_count: number;
    reader_mode_available: boolean;
    /** Always the literal `true`. */
    sanitized: true;
    injections_removed: number;
    pii_redacted: number;
    truncated: boolean;
    /** Optional fetch timestamp. NOTE(review): format is not visible here; confirm. */
    fetched_at?: string;
  };
  /** Optional threat report attached to the result. */
  threat_report?: ThreatReport;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Arguments accepted by the visus_search tool.
 */
export interface VisusSearchInput {
  /** Search query text. */
  query: string;
  /** Optional limit on the number of results returned. */
  max_results?: number;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Result returned by the visus_search tool.
 */
export interface VisusSearchOutput {
  /** The query this output answers. */
  query: string;
  /** Number of entries in `results`. */
  result_count: number;
  /** Always the literal `true`. */
  sanitized: true;
  /** Individual search hits with per-hit sanitization counters. */
  results: {
    title: string;
    url: string;
    snippet: string;
    injections_removed: number;
    pii_redacted: number;
  }[];
  /** Sum of `injections_removed` across all results. */
  total_injections_removed: number;
  /** Optional human-readable status, e.g. why no results are present. */
  message?: string;
  /** Optional threat report covering the result set. */
  threat_report?: ThreatReport;
}
|
|
62
130
|
|
|
63
131
|
/**
|
|
@@ -67,6 +135,7 @@ export interface BrowserRenderResult {
|
|
|
67
135
|
html: string;
|
|
68
136
|
title: string;
|
|
69
137
|
url: string;
|
|
138
|
+
contentType?: string;
|
|
70
139
|
text?: string;
|
|
71
140
|
error?: string;
|
|
72
141
|
}
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Format Converter - Content-Type based format detection and conversion
|
|
3
|
+
*
|
|
4
|
+
* Handles format-appropriate conversion based on detected Content-Type.
|
|
5
|
+
* Supports HTML, JSON, XML, and RSS/Atom feeds.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { XMLParser } from 'fast-xml-parser';
|
|
9
|
+
|
|
10
|
+
/**
 * Detected format type
 */
export type FormatType = 'html' | 'json' | 'xml' | 'rss';

/**
 * Detect the content format from a Content-Type header value.
 *
 * Parameters after the first `;` (such as `charset`) are ignored and the
 * comparison is case-insensitive. Explicit mappings are checked first:
 *  - `text/html`, `application/xhtml+xml`                      -> `'html'`
 *  - `application/json`, `text/json`                           -> `'json'`
 *  - `application/rss+xml`, `application/atom+xml`,
 *    `application/feed+json`                                   -> `'rss'`
 *  - `application/xml`, `text/xml`                             -> `'xml'`
 *
 * Any other type ending in the structured-syntax suffix `+json` or `+xml`
 * (for example `application/ld+json` or `image/svg+xml`) is classified by
 * that suffix. Everything else, including an empty or non-string value,
 * defaults to `'html'`.
 *
 * @param contentType - Content-Type header value (e.g., "application/json", "text/html; charset=utf-8")
 * @returns Detected format type
 */
export function detectFormat(contentType: string): FormatType {
  // Guard against a missing header reaching us at runtime despite the type.
  if (typeof contentType !== 'string') {
    return 'html';
  }

  // Normalize: lowercase and keep only the MIME type before any parameters.
  const mimeType = contentType.toLowerCase().split(';')[0].trim();

  switch (mimeType) {
    case 'text/html':
    case 'application/xhtml+xml':
      return 'html';

    case 'application/json':
    case 'text/json':
      return 'json';

    // Feed types must be matched before the generic +xml / +json suffix rule.
    case 'application/rss+xml':
    case 'application/atom+xml':
    case 'application/feed+json':
      return 'rss';

    case 'application/xml':
    case 'text/xml':
      return 'xml';

    default:
      break;
  }

  // Structured-syntax suffixes for types not listed explicitly above.
  if (mimeType.endsWith('+json')) {
    return 'json';
  }
  if (mimeType.endsWith('+xml')) {
    return 'xml';
  }

  // Default to HTML for unknown types
  return 'html';
}
|
|
50
|
+
|
|
51
|
+
/**
 * Pretty-print a JSON document for display.
 *
 * @param raw - Raw JSON string
 * @returns `JSON Response:` followed by a blank line and the document
 *   re-serialized with 2-space indentation; when `raw` is not valid JSON it
 *   is returned unchanged, without the prefix
 */
export function convertJson(raw: string): string {
  let value: unknown;
  try {
    value = JSON.parse(raw);
  } catch {
    // Not parseable: hand the input back untouched.
    return raw;
  }

  const pretty = JSON.stringify(value, null, 2);
  return `JSON Response:\n\n${pretty}`;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Render an XML document as text for display.
 *
 * The document is parsed with `XMLParser` (attributes kept under an `@_`
 * prefix, text nodes under `#text`, XML declaration and processing
 * instructions ignored, namespace prefixes removed) and the resulting object
 * is serialized as 2-space-indented JSON.
 *
 * @param raw - Raw XML string
 * @returns `XML Response:` followed by a blank line and the JSON rendering;
 *   if parsing or serialization throws, the same prefix followed by `raw`
 *   with everything matching `<...>` stripped and the result trimmed
 */
export function convertXml(raw: string): string {
  const header = 'XML Response:\n\n';

  try {
    const tree = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      textNodeName: '#text',
      ignoreDeclaration: true,
      ignorePiTags: true,
      removeNSPrefix: true,
    }).parse(raw);

    return header + JSON.stringify(tree, null, 2);
  } catch {
    // Fallback: drop anything that looks like a tag and keep the text.
    return header + raw.replace(/<[^>]+>/g, '').trim();
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Convert RSS/Atom feed content to clean Markdown.
 *
 * Handles, in order:
 *  - JSON bodies (first non-whitespace character is `{`), delegated to
 *    `convertJson`. `detectFormat` classifies `application/feed+json` as
 *    `'rss'`, so a JSON Feed can presumably reach this converter, and the XML
 *    parser cannot make sense of it.
 *  - RSS 2.0 (`<rss><channel>`), formatted by `formatRss2`.
 *  - Atom (`<feed>`), formatted by `formatAtom`.
 *  - RSS 1.0 (`<rdf:RDF>`), formatted by `formatRss2`.
 *
 * Anything else, or any parse failure, falls back to `convertXml`.
 *
 * @param raw - Raw RSS/Atom XML string
 * @returns Formatted Markdown representation, or the `convertXml` /
 *   `convertJson` rendering as described above
 */
export function convertRss(raw: string): string {
  try {
    // XML never starts with '{', so this cannot misroute a real feed.
    if (raw.trimStart().startsWith('{')) {
      return convertJson(raw);
    }

    const parser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      textNodeName: '#text',
      removeNSPrefix: true,
    });

    const parsed = parser.parse(raw);

    // Handle RSS 2.0 format
    if (parsed.rss && parsed.rss.channel) {
      return formatRss2(parsed.rss.channel);
    }

    // Handle Atom format
    if (parsed.feed) {
      return formatAtom(parsed.feed);
    }

    // Handle RSS 1.0 (RDF) format. With removeNSPrefix the root element
    // <rdf:RDF> is exposed as `RDF`; the lowercase key is kept for safety.
    const rdfRoot = parsed.RDF ?? parsed.rdf;
    if (rdfRoot && rdfRoot.channel) {
      const channel = typeof rdfRoot.channel === 'object' ? rdfRoot.channel : {};

      // In RSS 1.0 the <item> elements are siblings of <channel>, not children.
      const rawItems = rdfRoot.item ?? channel.item;
      const itemList = Array.isArray(rawItems) ? rawItems : (rawItems ? [rawItems] : []);

      // RSS 1.0 items usually carry <dc:date> (parsed as `date`) instead of <pubDate>.
      const items = itemList.map((item: any) =>
        item !== null && typeof item === 'object'
          ? { ...item, pubDate: item.pubDate ?? item.date }
          : item
      );

      return formatRss2({ ...channel, item: items });
    }

    // Unknown feed format, fall back to XML
    return convertXml(raw);
  } catch {
    // Parse failed, fall back to XML converter
    return convertXml(raw);
  }
}
|
|
137
|
+
|
|
138
|
+
/**
 * Render a parsed RSS channel object as Markdown.
 *
 * Output layout: an `RSS Feed:` banner, the channel title as a level-1
 * heading (`Untitled Feed` when missing), the channel description when
 * present, then an `## Items` section with at most the first 10 items.
 * Each item shows its title (`Untitled` when missing), its description cut
 * to 200 characters plus `...` when longer, and optional `Link:` and
 * `Published:` lines, followed by a `---` separator.
 *
 * @param channel Parsed channel; `item` may be a single object or an array.
 * @returns The Markdown text.
 */
function formatRss2(channel: any): string {
  const feedTitle = channel.title || 'Untitled Feed';
  const feedDescription = channel.description || '';

  let allItems: any[];
  if (Array.isArray(channel.item)) {
    allItems = channel.item;
  } else {
    allItems = channel.item ? [channel.item] : [];
  }

  const parts: string[] = [`RSS Feed:\n\n# ${feedTitle}\n`];
  if (feedDescription) {
    parts.push(`${feedDescription}\n`);
  }
  parts.push('\n## Items\n\n');

  // Only the first 10 items are rendered.
  for (const entry of allItems.slice(0, 10)) {
    const heading = entry.title || 'Untitled';
    const href = entry.link || '';
    const body = entry.description || '';
    const publishedOn = entry.pubDate || '';

    // Descriptions longer than 200 characters are cut and marked with an ellipsis.
    const shortBody = body.length > 200 ? body.substring(0, 200) + '...' : body;

    parts.push(`### ${heading}\n\n`);
    if (shortBody) {
      parts.push(`${shortBody}\n\n`);
    }
    if (href) {
      parts.push(`Link: ${href}\n`);
    }
    if (publishedOn) {
      parts.push(`Published: ${publishedOn}\n`);
    }
    parts.push('\n---\n\n');
  }

  return parts.join('');
}
|
|
187
|
+
|
|
188
|
+
/**
 * Flatten a parsed Atom text construct to a plain string.
 *
 * With attributes enabled, an element such as `<title type="html">x</title>`
 * is parsed into an object holding the text under `#text`; plain elements
 * stay strings. Numbers and booleans are stringified; for an array the first
 * element is used; anything else yields an empty string.
 */
function atomTextOf(node: unknown): string {
  if (typeof node === 'string') {
    return node;
  }
  if (typeof node === 'number' || typeof node === 'boolean') {
    return String(node);
  }
  if (Array.isArray(node)) {
    return node.length > 0 ? atomTextOf(node[0]) : '';
  }
  if (node !== null && typeof node === 'object') {
    const inner = (node as Record<string, unknown>)['#text'];
    return inner === undefined || inner === null ? '' : atomTextOf(inner);
  }
  return '';
}

/**
 * Pick the URL to show for an Atom entry.
 *
 * Accepts a bare string, a single parsed `<link>` object, or an array of
 * them. Prefers a link whose `rel` is absent or `alternate`; otherwise the
 * first link that has an `href`. Returns an empty string when none is usable.
 */
function atomLinkOf(link: unknown): string {
  const candidates: unknown[] = Array.isArray(link) ? link : [link];
  let fallback = '';

  for (const candidate of candidates) {
    if (typeof candidate === 'string') {
      if (!fallback) {
        fallback = candidate;
      }
      continue;
    }
    if (candidate === null || typeof candidate !== 'object') {
      continue;
    }

    const attrs = candidate as Record<string, unknown>;
    const href = attrs['@_href'];
    if (typeof href !== 'string' || !href) {
      continue;
    }

    const rel = attrs['@_rel'];
    if (rel === undefined || rel === 'alternate') {
      return href;
    }
    if (!fallback) {
      fallback = href;
    }
  }

  return fallback;
}

/**
 * Format Atom feed data as Markdown.
 *
 * Output layout matches the RSS formatter: an `RSS Feed:` banner, the feed
 * title as a level-1 heading (`Untitled Feed` when missing), the subtitle
 * when present, then an `## Items` section with at most the first 10
 * entries. Each entry shows its title (`Untitled` when missing), its
 * summary (or content) cut to 200 characters plus `...` when longer, and
 * optional `Link:` and `Published:` lines (published, else updated),
 * followed by a `---` separator.
 *
 * Titles, summaries and dates may arrive as objects and `link` as an array
 * once the XML parser keeps attributes, so every field is flattened to a
 * string first instead of being interpolated as `[object Object]`.
 *
 * @param feed Parsed `<feed>` element; `entry` may be a single object or an array.
 * @returns The Markdown text.
 */
function formatAtom(feed: any): string {
  const title = atomTextOf(feed.title) || 'Untitled Feed';
  const subtitle = atomTextOf(feed.subtitle);
  const entries: any[] = Array.isArray(feed.entry) ? feed.entry : (feed.entry ? [feed.entry] : []);

  let markdown = `RSS Feed:\n\n# ${title}\n`;

  if (subtitle) {
    markdown += `${subtitle}\n`;
  }

  markdown += '\n## Items\n\n';

  // Extract up to 10 entries
  for (const entry of entries.slice(0, 10)) {
    const entryTitle = atomTextOf(entry.title) || 'Untitled';
    const entryLink = atomLinkOf(entry.link);
    const entrySummary = atomTextOf(entry.summary) || atomTextOf(entry.content);
    const entryPublished = atomTextOf(entry.published) || atomTextOf(entry.updated);

    // Truncate summary to 200 chars
    const truncatedSummary = entrySummary.length > 200
      ? entrySummary.substring(0, 200) + '...'
      : entrySummary;

    markdown += `### ${entryTitle}\n\n`;

    if (truncatedSummary) {
      markdown += `${truncatedSummary}\n\n`;
    }

    if (entryLink) {
      markdown += `Link: ${entryLink}\n`;
    }

    if (entryPublished) {
      markdown += `Published: ${entryPublished}\n`;
    }

    markdown += '\n---\n\n';
  }

  return markdown;
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token-aware content truncation utility
|
|
3
|
+
*
|
|
4
|
+
* Anthropic MCP Directory enforces a 25,000 token response limit.
|
|
5
|
+
* This utility provides safe truncation with token estimation.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Target token ceiling for returned content.
 * Set to 24,000, below the 25,000-token response limit, to leave headroom
 * for metadata/JSON structure and the truncation notice.
 */
const MAX_TOKENS = 24000;

/**
 * Token estimation heuristic: 1 token ≈ 4 characters.
 * This is an approximation only; the real token count of a given text can
 * be higher or lower than the estimate.
 */
const CHARS_PER_TOKEN = 4;

/**
 * Maximum characters based on token limit
 */
const MAX_CHARS = MAX_TOKENS * CHARS_PER_TOKEN; // 96,000 characters

/**
 * Truncate content if it exceeds the character ceiling derived from the
 * token limit.
 *
 * Content of at most `MAX_CHARS` UTF-16 code units is returned unchanged.
 * Longer content is cut at `MAX_CHARS` and a plain-text notice describing
 * the original and truncated sizes is appended. If the cut would fall
 * between the two halves of a surrogate pair, it is moved back one code unit
 * so the kept text never ends in a lone high surrogate.
 *
 * @param content Content to potentially truncate
 * @returns The (possibly truncated) content, a `truncated` flag, and, only
 *   when truncated, `truncated_at_chars`: the number of characters of the
 *   original content that were kept
 */
export function truncateContent(content: string): {
  content: string;
  truncated: boolean;
  truncated_at_chars?: number;
} {
  if (content.length <= MAX_CHARS) {
    // Content is within limits
    return {
      content,
      truncated: false
    };
  }

  // Avoid splitting a surrogate pair at the cut point.
  let cutAt = MAX_CHARS;
  const lastKept = content.charCodeAt(cutAt - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    cutAt -= 1;
  }

  // Content exceeds limit - truncate with warning message
  const truncatedContent = content.substring(0, cutAt);
  const warningMessage = `\n\n--- CONTENT TRUNCATED ---\nOriginal length: ${content.length} characters (~${Math.ceil(content.length / CHARS_PER_TOKEN)} tokens)\nTruncated to: ${cutAt} characters (~${MAX_TOKENS} tokens)\nReason: Anthropic MCP Directory enforces a 25,000 token response limit\n`;

  return {
    content: truncatedContent + warningMessage,
    truncated: true,
    truncated_at_chars: cutAt
  };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Approximate the number of tokens in a string.
 *
 * The estimate is the string's length divided by `CHARS_PER_TOKEN`,
 * rounded up.
 *
 * @param text Text to estimate
 * @returns Estimated token count
 */
export function estimateTokens(text: string): number {
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|