mcp-safe-fetch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +88 -0
- package/dist/config.d.ts +8 -0
- package/dist/config.js +28 -0
- package/dist/fetch.d.ts +7 -0
- package/dist/fetch.js +21 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +14 -0
- package/dist/logger.d.ts +11 -0
- package/dist/logger.js +11 -0
- package/dist/sanitize/delimiters.d.ts +7 -0
- package/dist/sanitize/delimiters.js +30 -0
- package/dist/sanitize/html.d.ts +13 -0
- package/dist/sanitize/html.js +48 -0
- package/dist/sanitize/pipeline.d.ts +21 -0
- package/dist/sanitize/pipeline.js +39 -0
- package/dist/sanitize/unicode.d.ts +11 -0
- package/dist/sanitize/unicode.js +35 -0
- package/dist/server.d.ts +1 -0
- package/dist/server.js +104 -0
- package/package.json +45 -0
package/README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# mcp-safe-fetch
|
|
2
|
+
|
|
3
|
+
Deterministic content sanitization MCP server for agentic coding tools. Strips prompt injection vectors from web-fetched content before it enters the LLM context.
|
|
4
|
+
|
|
5
|
+
Drop-in replacement for Claude Code's built-in `WebFetch` — exposes a `safe_fetch` tool that fetches URLs, sanitizes the HTML, and returns clean markdown.
|
|
6
|
+
|
|
7
|
+
## What it strips
|
|
8
|
+
|
|
9
|
+
- **Hidden HTML** — `display:none`, `visibility:hidden`, `opacity:0`, `[hidden]` attribute
|
|
10
|
+
- **Dangerous tags** — `<script>`, `<style>`, `<noscript>`, `<meta>`, `<link>`
|
|
11
|
+
- **HTML comments** — often used to inject instructions invisible to readers
|
|
12
|
+
- **Invisible unicode** — zero-width chars, soft hyphens, BOM, bidi overrides, variation selectors, tag characters
|
|
13
|
+
- **Control characters** — preserves `\n`, `\t`, `\r`, strips everything else
|
|
14
|
+
- **Fake LLM delimiters** — `<|im_start|>`, `[INST]`, `<<SYS>>`, `\n\nHuman:`, etc.
|
|
15
|
+
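- **NFKC normalization** — folds fullwidth and other Unicode compatibility characters (ligatures, styled or circled letters) into their standard forms; look-alike letters from other scripts (e.g. Cyrillic "а") are not unified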
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npx -y mcp-safe-fetch init
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
This configures Claude Code to use `safe_fetch` and deny the built-in `WebFetch`. Restart Claude Code after running.
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
### As MCP server (automatic)
|
|
28
|
+
|
|
29
|
+
After `init`, Claude Code uses `safe_fetch` whenever it needs to read a URL. The sanitization header shows what was stripped:
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
[safe-fetch] Stripped: 5 hidden elements, 68 script tags, 3 style tags | 284127 → 12720 bytes (219ms)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### CLI
|
|
36
|
+
|
|
37
|
+
Test sanitization on any URL:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
npx -y mcp-safe-fetch test <url>
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Stats print to stderr, sanitized markdown to stdout.
|
|
44
|
+
|
|
45
|
+
### MCP tools
|
|
46
|
+
|
|
47
|
+
| Tool | Description |
|
|
48
|
+
|------|-------------|
|
|
49
|
+
| `safe_fetch` | Fetch a URL and return sanitized markdown |
|
|
50
|
+
| `sanitize_stats` | Show session sanitization statistics |
|
|
51
|
+
|
|
52
|
+
## Configuration
|
|
53
|
+
|
|
54
|
+
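Optional. Create `.mcp-safe-fetch.json` in the directory the server is started from (normally your project root) or in your home directory. The project file is checked first; the first file that parses successfully is used: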
|
|
55
|
+
|
|
56
|
+
```json
|
|
57
|
+
{
|
|
58
|
+
"logStripped": true,
|
|
59
|
+
"logFile": ".claude/sanitize.log"
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
| Option | Default | Description |
|
|
64
|
+
|--------|---------|-------------|
|
|
65
|
+
| `logStripped` | `false` | Log sanitization stats to file |
|
|
66
|
+
| `logFile` | `.claude/sanitize.log` | Log file path |
|
|
67
|
+
|
|
68
|
+
## How it works
|
|
69
|
+
|
|
70
|
+
1. Fetch URL with native `fetch` (from your machine, not Anthropic's servers)
|
|
71
|
+
2. Parse HTML with [cheerio](https://cheerio.js.org/) (htmlparser2 backend)
|
|
72
|
+
3. Strip hidden elements, dangerous tags, and comments
|
|
73
|
+
4. Convert to markdown with [turndown](https://github.com/mixmark-io/turndown)
|
|
74
|
+
5. Strip invisible unicode characters and normalize with NFKC
|
|
75
|
+
6. Strip fake LLM delimiter tokens
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Dispatch a CLI subcommand.
 *
 * `init` registers the MCP server and tool permissions in the user's Claude
 * configuration files; `test` fetches and sanitizes the URL given as the first
 * argument. Any other command value is ignored.
 *
 * @param command - Subcommand name (`'init'` or `'test'`).
 * @param args - Remaining command-line arguments after the subcommand.
 */
export declare function runCli(command: string, args: string[]): void;
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { fetchUrl } from './fetch.js';
|
|
4
|
+
import { sanitize } from './sanitize/pipeline.js';
|
|
5
|
+
/**
 * Home directory taken from the environment.
 *
 * `HOME` is normally unset on Windows, where `USERPROFILE` carries the same
 * information; without the fallback both paths below would silently become
 * relative to the current working directory.
 */
const HOME_DIR = process.env.HOME || process.env.USERPROFILE || '';
/** Claude Code's per-user file that lists registered MCP servers. */
const CLAUDE_JSON_PATH = join(HOME_DIR, '.claude.json');
/** Claude Code's per-user settings file that holds tool permissions. */
const SETTINGS_PATH = join(HOME_DIR, '.claude', 'settings.json');
/** MCP server entry written by `init`: launch this package over stdio via npx. */
const MCP_CONFIG = {
    type: 'stdio',
    command: 'npx',
    args: ['-y', 'mcp-safe-fetch'],
};
|
|
12
|
+
/**
 * Read a JSON object from disk.
 *
 * A missing or blank file yields `{}` so callers can build the document from
 * scratch. A file that exists but is not a JSON object is treated as an error
 * rather than as `{}`: the caller writes its result straight back to the same
 * path, so pretending the file was empty would destroy the user's existing
 * configuration.
 *
 * @param path - File to read.
 * @returns The parsed top-level object, or `{}` when there is nothing to parse.
 * @throws Error when the file content is not valid JSON or its top-level value
 *   is not an object.
 */
function readJson(path) {
    if (!existsSync(path)) {
        return {};
    }
    const raw = readFileSync(path, 'utf-8');
    if (raw.trim() === '') {
        return {};
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Refusing to overwrite ${path}: not valid JSON (${reason})`);
    }
    // Arrays and primitives cannot take the properties callers assign.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new Error(`Refusing to overwrite ${path}: expected a JSON object at the top level`);
    }
    return parsed;
}
|
|
23
|
+
/**
 * Write `data` to `path` as UTF-8 JSON, indented with two spaces and ending in
 * a newline. Any existing file at `path` is replaced.
 */
function writeJson(path, data) {
    const serialized = `${JSON.stringify(data, null, 2)}\n`;
    writeFileSync(path, serialized, 'utf-8');
}
|
|
26
|
+
/**
 * Implement `mcp-safe-fetch init`.
 *
 * Registers this package as the `safe-fetch` MCP server in `~/.claude.json`
 * and records tool permissions (deny `WebFetch`, allow the `safe_fetch` tool)
 * in `~/.claude/settings.json`. With `--dry-run` nothing is written; the
 * entries that would be added are printed instead.
 *
 * @param args - CLI arguments following `init`.
 */
function runInit(args) {
    if (args.includes('--dry-run')) {
        const serverPreview = { mcpServers: { 'safe-fetch': MCP_CONFIG } };
        const permissionPreview = {
            allowedTools: { WebFetch: 'deny', 'mcp__safe-fetch__safe_fetch': 'allow' },
        };
        console.log('Would add to ~/.claude.json:');
        console.log(JSON.stringify(serverPreview, null, 2));
        console.log('\nWould add to ~/.claude/settings.json:');
        console.log(JSON.stringify(permissionPreview, null, 2));
        return;
    }

    // Register the MCP server, keeping whatever else the file already holds.
    const claudeConfig = readJson(CLAUDE_JSON_PATH);
    claudeConfig.mcpServers = claudeConfig.mcpServers || {};
    claudeConfig.mcpServers['safe-fetch'] = MCP_CONFIG;
    writeJson(CLAUDE_JSON_PATH, claudeConfig);

    // Record the tool permissions next to any existing settings.
    const userSettings = readJson(SETTINGS_PATH);
    userSettings.allowedTools = userSettings.allowedTools || {};
    userSettings.allowedTools.WebFetch = 'deny';
    userSettings.allowedTools['mcp__safe-fetch__safe_fetch'] = 'allow';
    writeJson(SETTINGS_PATH, userSettings);

    const report = [
        'Updated ~/.claude.json:',
        ' + mcpServers.safe-fetch (mcp-safe-fetch MCP server)',
        '\nUpdated ~/.claude/settings.json:',
        ' + allowedTools.WebFetch: "deny"',
        ' + allowedTools.mcp__safe-fetch__safe_fetch: "allow"',
        '\nRestart Claude Code to activate.',
    ];
    for (const line of report) {
        console.log(line);
    }
}
|
|
55
|
+
/**
 * Implement `mcp-safe-fetch test <url>`.
 *
 * Fetches the URL, runs the sanitization pipeline, prints a statistics report
 * to stderr and the sanitized content to stdout. Exits with status 1 when no
 * URL is supplied.
 *
 * @param args - CLI arguments following `test`; the first is the URL.
 */
async function runTest(args) {
    const [url] = args;
    if (!url) {
        console.error('Usage: mcp-safe-fetch test <url>');
        process.exit(1);
    }
    console.error(`Fetching ${url}...`);

    // The reported duration covers both the download and the sanitization.
    const startedAt = Date.now();
    const page = await fetchUrl(url);
    const result = sanitize(page.html);
    const durationMs = Date.now() - startedAt;

    // Diagnostics go to stderr so stdout carries only the sanitized document.
    const { stats } = result;
    const report = [
        `\nSanitization complete (${durationMs}ms):`,
        ` Input: ${result.inputSize} bytes`,
        ` Output: ${result.outputSize} bytes`,
        ` Hidden elements: ${stats.hiddenElements}`,
        ` Script tags: ${stats.scriptTags}`,
        ` Style tags: ${stats.styleTags}`,
        ` Zero-width chars: ${stats.zeroWidthChars}`,
        ` LLM delimiters: ${stats.llmDelimiters}`,
    ];
    for (const line of report) {
        console.error(line);
    }
    process.stdout.write(result.content);
}
|
|
78
|
+
/**
 * Route a CLI subcommand to its implementation.
 *
 * `init` runs synchronously. `test` runs asynchronously; if it rejects, the
 * error message is printed to stderr and the process exits with status 1.
 * Unrecognized commands do nothing.
 *
 * @param command - Subcommand name.
 * @param args - Arguments following the subcommand.
 */
export function runCli(command, args) {
    switch (command) {
        case 'init':
            runInit(args);
            break;
        case 'test':
            runTest(args).catch((error) => {
                const detail = error instanceof Error ? error.message : error;
                console.error(`Error: ${detail}`);
                process.exit(1);
            });
            break;
        default:
            break;
    }
}
|
package/dist/config.d.ts
ADDED
package/dist/config.js
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { readFileSync, existsSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
const DEFAULT_CONFIG = {
|
|
4
|
+
logStripped: false,
|
|
5
|
+
logFile: '.claude/sanitize.log',
|
|
6
|
+
allowDataUris: false,
|
|
7
|
+
maxBase64DecodeLength: 500,
|
|
8
|
+
customPatterns: [],
|
|
9
|
+
};
|
|
10
|
+
export function loadConfig() {
|
|
11
|
+
const paths = [
|
|
12
|
+
join(process.cwd(), '.mcp-safe-fetch.json'),
|
|
13
|
+
join(process.env.HOME || '', '.mcp-safe-fetch.json'),
|
|
14
|
+
];
|
|
15
|
+
for (const configPath of paths) {
|
|
16
|
+
if (existsSync(configPath)) {
|
|
17
|
+
try {
|
|
18
|
+
const raw = readFileSync(configPath, 'utf-8');
|
|
19
|
+
const parsed = JSON.parse(raw);
|
|
20
|
+
return { ...DEFAULT_CONFIG, ...parsed };
|
|
21
|
+
}
|
|
22
|
+
catch {
|
|
23
|
+
// Invalid config, use defaults
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
return DEFAULT_CONFIG;
|
|
28
|
+
}
|
package/dist/fetch.d.ts
ADDED
package/dist/fetch.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Download a URL with the runtime's built-in `fetch`.
 *
 * Redirects are followed and the request is aborted after 10 seconds. The
 * body is read as text whatever the content type.
 *
 * @param url - Address to request.
 * @returns The response body (`html`), the final URL reported by the response,
 *   the HTTP status, and the `content-type` header (empty string when absent).
 * @throws Error with the message `HTTP <status>: <statusText>` for non-2xx
 *   responses; rejections from `fetch` itself propagate unchanged.
 */
export async function fetchUrl(url) {
    const response = await fetch(url, {
        headers: {
            'User-Agent': 'mcp-safe-fetch/0.1',
            'Accept': 'text/html,application/xhtml+xml,*/*',
        },
        redirect: 'follow',
        signal: AbortSignal.timeout(10000),
    });

    if (!response.ok) {
        const { status, statusText } = response;
        throw new Error(`HTTP ${status}: ${statusText}`);
    }

    const body = await response.text();
    return {
        html: body,
        url: response.url,
        status: response.status,
        contentType: response.headers.get('content-type') || '',
    };
}
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { startServer } from './server.js';
|
|
3
|
+
import { runCli } from './cli.js';
|
|
4
|
+
// Entry point: `init` and `test` are CLI subcommands; anything else
// (including no argument at all) starts the MCP server.
const [, , command, ...cliArgs] = process.argv;
const CLI_COMMANDS = ['init', 'test'];

if (CLI_COMMANDS.includes(command)) {
    runCli(command, cliArgs);
}
else {
    // Default path: this is what the registered `npx -y mcp-safe-fetch` launch hits.
    startServer().catch((error) => {
        console.error('[safe-fetch] Fatal error:', error);
        process.exit(1);
    });
}
|
package/dist/logger.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { PipelineStats } from './sanitize/pipeline.js';
|
|
2
|
+
/** One record appended to the sanitization log for each fetched URL. */
export interface LogEntry {
    /** When the entry was created, as an ISO 8601 string. */
    timestamp: string;
    /** URL that was requested. */
    url: string;
    /** Per-category counts of what the pipeline removed from this page. */
    stripped: PipelineStats;
    /** Size of the fetched document as reported by the pipeline. */
    inputSize: number;
    /** Size of the sanitized output as reported by the pipeline. */
    outputSize: number;
    /** Relative size reduction in percent, rounded to one decimal place. */
    reductionPercent: number;
    /** Wall-clock time for fetch plus sanitization, in milliseconds. */
    durationMs: number;
}
|
|
11
|
+
/**
 * Append `entry` to `logFile` as a single line of JSON, creating the parent
 * directory if needed. Failures are reported on stderr and never thrown.
 *
 * @param logFile - Path of the log file to append to.
 * @param entry - Record to serialize.
 */
export declare function logSanitization(logFile: string, entry: LogEntry): void;
|
package/dist/logger.js
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { appendFileSync, mkdirSync } from 'node:fs';
|
|
2
|
+
import { dirname } from 'node:path';
|
|
3
|
+
/**
 * Append one JSON-lines record to the sanitization log.
 *
 * The log's parent directory is created on demand. Logging is best-effort:
 * any failure is reported on stderr and otherwise ignored, so a bad log path
 * never breaks a fetch.
 *
 * @param logFile - Path of the log file.
 * @param entry - Record to serialize as a single line of JSON.
 */
export function logSanitization(logFile, entry) {
    try {
        const parentDir = dirname(logFile);
        mkdirSync(parentDir, { recursive: true });
        const line = `${JSON.stringify(entry)}\n`;
        appendFileSync(logFile, line, 'utf-8');
    }
    catch {
        console.error(`[safe-fetch] Failed to write log to ${logFile}`);
    }
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Chat-template control tokens that a page could use to impersonate a turn
 * boundary. The optional backslashes cover the escaped forms produced when
 * the text has been through a markdown escaper (`\[`, `\]`, `\_`).
 */
const LLM_DELIMITER_PATTERNS = [
    /<\|im\\?_start\|>/gi,
    /<\|im\\?_end\|>/gi,
    /<\|system\|>/gi,
    /<\|user\|>/gi,
    /<\|assistant\|>/gi,
    /<\|endoftext\|>/gi,
    /<\|pad\|>/gi,
    /\\?\[INST\\?\]/gi,
    /\\?\[\\?\/INST\\?\]/gi,
    /<<SYS>>/gi,
    /<<\\?\/SYS>>/gi,
    /\n\nHuman:/g,
    /\n\nAssistant:/g,
];

/**
 * Remove fake LLM delimiter tokens from `text`.
 *
 * Removal is repeated until a full pass finds nothing. A single pass is not
 * enough, because deleting one token can splice its neighbours into another:
 * `<|im_<|im_end|>start|>` collapses to `<|im_start|>`.
 *
 * @param text - Text to clean.
 * @returns The cleaned text and the total number of tokens removed.
 */
export function sanitizeDelimiters(text) {
    let count = 0;
    let result = text;
    let removedInPass;
    do {
        removedInPass = 0;
        for (const pattern of LLM_DELIMITER_PATTERNS) {
            result = result.replace(pattern, () => {
                removedInPass += 1;
                return '';
            });
        }
        count += removedInPass;
        // Every match removes at least one character, so this terminates.
    } while (removedInPass > 0);
    return {
        text: result,
        stats: { llmDelimiters: count },
    };
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { CheerioAPI } from 'cheerio';
|
|
2
|
+
/** Outcome of the HTML-level sanitization pass. */
export interface HtmlSanitizeResult {
    /** Serialized document after all removals. */
    html: string;
    /** How many nodes of each category were removed. */
    stats: {
        /** Elements hidden through an inline style or the `hidden` attribute. */
        hiddenElements: number;
        /** HTML comment nodes. */
        htmlComments: number;
        /** `<script>` elements. */
        scriptTags: number;
        /** `<style>` elements. */
        styleTags: number;
        /** `<noscript>` elements. */
        noscriptTags: number;
        /** `<meta>` and `<link>` elements combined. */
        metaTags: number;
    };
}
|
|
13
|
+
/**
 * Strip hidden elements, script/style/noscript/meta/link tags and HTML
 * comments from a loaded document. The document behind `$` is modified in
 * place.
 *
 * @param $ - Cheerio instance holding the parsed document.
 * @returns The serialized result and per-category removal counts.
 */
export declare function sanitizeHtml($: CheerioAPI): HtmlSanitizeResult;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/** Elements that can possibly be hidden inline: anything styled or carrying `hidden`. */
const HIDDEN_CANDIDATE_SELECTOR = '[style], [hidden]';

/**
 * Declarations that hide an element, written without whitespace and in lower
 * case. They are matched as substrings of the normalized style text, so
 * `opacity:0` also catches fractional values such as `opacity:0.5`, exactly
 * as the former `[style*="opacity:0"]` selector did.
 */
const HIDDEN_DECLARATIONS = ['display:none', 'visibility:hidden', 'opacity:0'];

const STRIP_TAGS = ['script', 'style', 'noscript', 'meta', 'link'];

/** Stats counter each stripped tag contributes to (`link` is counted with `meta`). */
const TAG_STAT_KEYS = {
    script: 'scriptTags',
    style: 'styleTags',
    noscript: 'noscriptTags',
    meta: 'metaTags',
    link: 'metaTags',
};

/**
 * Decide whether an inline `style` value hides its element.
 *
 * CSS ignores case in property names and keywords and allows whitespace
 * around the colon, so the value is lower-cased and stripped of whitespace
 * before matching. Otherwise `DISPLAY:NONE` or `display :  none` would slip
 * past.
 */
function isHiddenByInlineStyle(style) {
    if (!style) {
        return false;
    }
    const compact = style.toLowerCase().replace(/\s+/g, '');
    return HIDDEN_DECLARATIONS.some((declaration) => compact.includes(declaration));
}

/**
 * Remove content a human reader would never see from a parsed document.
 *
 * Three passes, in order: elements hidden via inline style or the `hidden`
 * attribute; `<script>`, `<style>`, `<noscript>`, `<meta>` and `<link>`
 * elements; HTML comments. The document behind `$` is mutated in place.
 *
 * @param $ - Cheerio instance holding the parsed document.
 * @returns The serialized document and the number of nodes removed per category.
 */
export function sanitizeHtml($) {
    const stats = {
        hiddenElements: 0,
        htmlComments: 0,
        scriptTags: 0,
        styleTags: 0,
        noscriptTags: 0,
        metaTags: 0,
    };

    // Pass 1: inline-hidden elements.
    const hidden = $(HIDDEN_CANDIDATE_SELECTOR).filter((_, element) => {
        const node = $(element);
        return node.attr('hidden') !== undefined || isHiddenByInlineStyle(node.attr('style'));
    });
    stats.hiddenElements = hidden.length;
    hidden.remove();

    // Pass 2: tags whose content is never meant to be read as page text.
    for (const tag of STRIP_TAGS) {
        const elements = $(tag);
        stats[TAG_STAT_KEYS[tag]] += elements.length;
        elements.remove();
    }

    // Pass 3: comments. The root is included because comments that sit
    // directly under the document are not children of any element.
    const comments = $.root()
        .find('*')
        .addBack()
        .contents()
        .filter((_, node) => node.type === 'comment');
    stats.htmlComments = comments.length;
    comments.remove();

    return {
        html: $.html(),
        stats,
    };
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/** Removal counts from every stage of the sanitization pipeline. */
export interface PipelineStats {
    /** Elements hidden through an inline style or the `hidden` attribute. */
    hiddenElements: number;
    /** HTML comment nodes. */
    htmlComments: number;
    /** `<script>` elements. */
    scriptTags: number;
    /** `<style>` elements. */
    styleTags: number;
    /** `<noscript>` elements. */
    noscriptTags: number;
    /** `<meta>` and `<link>` elements combined. */
    metaTags: number;
    /** Zero-width and other invisible characters. */
    zeroWidthChars: number;
    /** Control characters other than `\n`, `\t` and `\r`. */
    controlChars: number;
    /** Bidirectional override and isolate characters. */
    bidiOverrides: number;
    /** Unicode tag characters. */
    unicodeTags: number;
    /** Variation selectors. */
    variationSelectors: number;
    /** Fake LLM delimiter tokens. */
    llmDelimiters: number;
}
|
|
15
|
+
/** What the sanitization pipeline returns for one document. */
export interface PipelineResult {
    /** Sanitized markdown. */
    content: string;
    /** Combined removal counts from the HTML, unicode and delimiter stages. */
    stats: PipelineStats;
    /** Size of the HTML handed to the pipeline. */
    inputSize: number;
    /** Size of the sanitized markdown. */
    outputSize: number;
}
|
|
21
|
+
/**
 * Run the full sanitization pipeline on an HTML document: parse it, strip
 * hidden and dangerous markup, convert the remainder to markdown, then remove
 * invisible unicode and fake LLM delimiters from the text.
 *
 * @param html - Raw HTML source.
 * @returns Sanitized markdown together with removal statistics and sizes.
 */
export declare function sanitize(html: string): PipelineResult;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio/slim';
|
|
2
|
+
import TurndownService from 'turndown';
|
|
3
|
+
import { sanitizeHtml } from './html.js';
|
|
4
|
+
import { sanitizeUnicode } from './unicode.js';
|
|
5
|
+
import { sanitizeDelimiters } from './delimiters.js';
|
|
6
|
+
/** Shared HTML-to-markdown converter: ATX headings, fenced code blocks, `-` bullets. */
const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    fence: '```',
    hr: '---',
    bulletListMarker: '-',
    preformattedCode: true,
});

/**
 * Sanitize an HTML document and return it as markdown.
 *
 * Stages, in order:
 *   1. parse the HTML with cheerio;
 *   2. strip hidden elements, dangerous tags and comments;
 *   3. convert the cleaned HTML to markdown;
 *   4. strip invisible unicode and apply NFKC normalization;
 *   5. strip fake LLM delimiter tokens.
 *
 * Delimiters are removed last so that tokens disguised with compatibility
 * characters have already been folded to plain text by stage 4.
 *
 * @param html - Raw HTML source.
 * @returns Sanitized markdown, merged per-stage statistics, and the UTF-8 byte
 *   sizes of the input and output.
 */
export function sanitize(html) {
    // Callers print these sizes as "bytes", so measure encoded bytes rather
    // than UTF-16 code units (`String.length`).
    const inputSize = Buffer.byteLength(html, 'utf8');

    const $ = cheerio.load(html);
    const htmlResult = sanitizeHtml($);
    const markdown = turndown.turndown(htmlResult.html);
    const unicodeResult = sanitizeUnicode(markdown);
    const delimiterResult = sanitizeDelimiters(unicodeResult.text);

    const content = delimiterResult.text;
    const outputSize = Buffer.byteLength(content, 'utf8');
    return {
        content,
        stats: {
            ...htmlResult.stats,
            ...unicodeResult.stats,
            ...delimiterResult.stats,
        },
        inputSize,
        outputSize,
    };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Outcome of the unicode sanitization pass. */
export interface UnicodeSanitizeResult {
    /** Text after stripping and NFKC normalization. */
    text: string;
    /** How many characters of each category were removed. */
    stats: {
        /** Zero-width and other invisible characters. */
        zeroWidthChars: number;
        /** Control characters other than `\n`, `\t` and `\r`. */
        controlChars: number;
        /** Bidirectional override and isolate characters. */
        bidiOverrides: number;
        /** Unicode tag characters. */
        unicodeTags: number;
        /** Variation selectors. */
        variationSelectors: number;
    };
}
|
|
11
|
+
/**
 * Remove invisible, bidirectional-control, variation-selector, tag and control
 * characters from `text`, then apply NFKC normalization.
 *
 * @param text - Text to clean.
 * @returns The cleaned text and per-category removal counts.
 */
export declare function sanitizeUnicode(text: string): UnicodeSanitizeResult;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// Zero-width and invisible characters: soft hyphen, ZWSP/ZWNJ/ZWJ, LRM/RLM,
// word joiner and the invisible operators (U+2060-U+2064), BOM.
const INVISIBLE_CHARS = /[\u00AD\u200B-\u200F\u2060-\u2064\uFEFF]/g;
// Bidirectional controls: Arabic letter mark, embeddings/overrides, isolates.
const BIDI_CHARS = /[\u061C\u202A-\u202E\u2066-\u2069]/g;
// Variation selectors, including the supplement block (U+E0100-U+E01EF).
const VARIATION_SELECTORS = /[\uFE00-\uFE0F\u{E0100}-\u{E01EF}]/gu;
// Unicode tag block (U+E0000-U+E007F)
const UNICODE_TAGS = /[\u{E0000}-\u{E007F}]/gu;
// Control characters except \n \t \r: C0, DEL and C1.
const CONTROL_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g;

/** Delete every match of `pattern` from `text` and report how many were deleted. */
function stripMatches(text, pattern) {
    let removed = 0;
    const stripped = text.replace(pattern, () => {
        removed += 1;
        return '';
    });
    return { stripped, removed };
}

/**
 * Remove characters that are invisible to a reader but visible to a model,
 * then fold compatibility forms with NFKC.
 *
 * The character classes are disjoint, so counting while stripping gives the
 * same totals as counting everything up front.
 *
 * @param text - Text to clean.
 * @returns The cleaned, NFKC-normalized text and per-category removal counts.
 */
export function sanitizeUnicode(text) {
    const invisible = stripMatches(text, INVISIBLE_CHARS);
    const bidi = stripMatches(invisible.stripped, BIDI_CHARS);
    const selectors = stripMatches(bidi.stripped, VARIATION_SELECTORS);
    const tags = stripMatches(selectors.stripped, UNICODE_TAGS);
    const controls = stripMatches(tags.stripped, CONTROL_CHARS);

    const stats = {
        zeroWidthChars: invisible.removed,
        controlChars: controls.removed,
        bidiOverrides: bidi.removed,
        unicodeTags: tags.removed,
        variationSelectors: selectors.removed,
    };
    // NFKC folds fullwidth and other compatibility characters to standard forms.
    return { text: controls.stripped.normalize('NFKC'), stats };
}
|
package/dist/server.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Create the `safe-fetch` MCP server, register the `safe_fetch` and
 * `sanitize_stats` tools, and connect it to a stdio transport.
 *
 * @returns A promise that resolves once the transport is connected.
 */
export declare function startServer(): Promise<void>;
|
package/dist/server.js
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
import { fetchUrl } from './fetch.js';
|
|
5
|
+
import { sanitize } from './sanitize/pipeline.js';
|
|
6
|
+
import { loadConfig } from './config.js';
|
|
7
|
+
import { logSanitization } from './logger.js';
|
|
8
|
+
/**
 * Relative size reduction in percent, rounded to one decimal place.
 * An empty input yields 0 rather than NaN, which would be logged as `null`.
 */
function computeReductionPercent(inputSize, outputSize) {
    if (inputSize <= 0) {
        return 0;
    }
    return Math.round((1 - outputSize / inputSize) * 1000) / 10;
}

/** Build the one-line summary that is prepended to the sanitized content. */
function buildHeader(result, durationMs) {
    const { stats } = result;
    const strippedItems = [];
    if (stats.hiddenElements > 0)
        strippedItems.push(`${stats.hiddenElements} hidden elements`);
    if (stats.scriptTags > 0)
        strippedItems.push(`${stats.scriptTags} script tags`);
    if (stats.styleTags > 0)
        strippedItems.push(`${stats.styleTags} style tags`);
    if (stats.zeroWidthChars > 0)
        strippedItems.push(`${stats.zeroWidthChars} zero-width chars`);
    if (stats.llmDelimiters > 0)
        strippedItems.push(`${stats.llmDelimiters} LLM delimiters`);
    return strippedItems.length > 0
        ? `[safe-fetch] Stripped: ${strippedItems.join(', ')} | ${result.inputSize} → ${result.outputSize} bytes (${durationMs}ms)\n\n`
        : `[safe-fetch] Clean page | ${result.inputSize} → ${result.outputSize} bytes (${durationMs}ms)\n\n`;
}

/** Render the session totals shown by the `sanitize_stats` tool. */
function formatSessionStats(session) {
    const totals = session.totalStripped;
    return [
        `Session stats (${session.totalRequests} requests):`,
        ` Hidden elements stripped: ${totals.hiddenElements}`,
        ` Script tags stripped: ${totals.scriptTags}`,
        ` Style tags stripped: ${totals.styleTags}`,
        ` Zero-width chars stripped: ${totals.zeroWidthChars}`,
        ` LLM delimiters stripped: ${totals.llmDelimiters}`,
        ` Bidi overrides stripped: ${totals.bidiOverrides}`,
        '',
        `URLs fetched:`,
        ...session.urls.map(u => ` - ${u}`),
    ].join('\n');
}

/**
 * Start the MCP server on stdio.
 *
 * Registers two tools:
 *   - `safe_fetch`: fetch a URL, sanitize it, and return the markdown prefixed
 *     with a summary header. Failures are returned as an error result instead
 *     of being thrown.
 *   - `sanitize_stats`: report the totals accumulated since the server started.
 *
 * Configuration is loaded once at start-up. When `logStripped` is enabled,
 * every successful fetch is also appended to the configured log file.
 *
 * @returns A promise that resolves once the stdio transport is connected.
 */
export async function startServer() {
    const config = loadConfig();
    // In-memory totals for the lifetime of this process.
    const session = {
        totalRequests: 0,
        totalStripped: {
            hiddenElements: 0, htmlComments: 0, scriptTags: 0,
            styleTags: 0, noscriptTags: 0, metaTags: 0,
            zeroWidthChars: 0, controlChars: 0, bidiOverrides: 0,
            unicodeTags: 0, variationSelectors: 0, llmDelimiters: 0,
        },
        urls: [],
    };
    const server = new McpServer({
        name: 'safe-fetch',
        version: '0.1.0',
    });

    server.registerTool('safe_fetch', {
        description: 'Fetch a URL and return sanitized content with prompt injection vectors removed. Strips hidden HTML elements, invisible unicode characters, and fake LLM delimiters.',
        inputSchema: {
            url: z.string().url().describe('URL to fetch'),
        },
    }, async ({ url }) => {
        try {
            const startTime = Date.now();
            const fetched = await fetchUrl(url);
            const result = sanitize(fetched.html);
            const durationMs = Date.now() - startTime;

            // Update session stats
            session.totalRequests++;
            session.urls.push(url);
            for (const key of Object.keys(session.totalStripped)) {
                session.totalStripped[key] += result.stats[key];
            }

            // Log if configured
            if (config.logStripped) {
                logSanitization(config.logFile, {
                    timestamp: new Date().toISOString(),
                    url,
                    stripped: result.stats,
                    inputSize: result.inputSize,
                    outputSize: result.outputSize,
                    reductionPercent: computeReductionPercent(result.inputSize, result.outputSize),
                    durationMs,
                });
            }

            return {
                content: [{ type: 'text', text: buildHeader(result, durationMs) + result.content }],
            };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            return {
                content: [{ type: 'text', text: `[safe-fetch] Error fetching ${url}: ${message}` }],
                isError: true,
            };
        }
    });

    server.registerTool('sanitize_stats', {
        description: 'Show sanitization statistics for the current session',
        inputSchema: {},
    }, async () => {
        return {
            content: [{ type: 'text', text: formatSessionStats(session) }],
        };
    });

    const transport = new StdioServerTransport();
    await server.connect(transport);
    // stdout carries the MCP protocol, so status messages go to stderr.
    console.error('[safe-fetch] MCP server running on stdio');
}
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mcp-safe-fetch",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Deterministic content sanitization MCP server for agentic coding tools",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"mcp-safe-fetch": "./dist/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist"
|
|
12
|
+
],
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "tsc",
|
|
15
|
+
"test": "vitest run",
|
|
16
|
+
"test:watch": "vitest",
|
|
17
|
+
"prepublishOnly": "npm run build"
|
|
18
|
+
},
|
|
19
|
+
"keywords": [
|
|
20
|
+
"mcp",
|
|
21
|
+
"sanitize",
|
|
22
|
+
"prompt-injection",
|
|
23
|
+
"claude",
|
|
24
|
+
"llm",
|
|
25
|
+
"security"
|
|
26
|
+
],
|
|
27
|
+
"author": "Tim Stark <tim@timstark.dev>",
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"repository": {
|
|
30
|
+
"type": "git",
|
|
31
|
+
"url": "https://github.com/timstarkk/mcp-safe-fetch"
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"@modelcontextprotocol/sdk": "^1.27.0",
|
|
35
|
+
"cheerio": "^1.2.0",
|
|
36
|
+
"turndown": "^7.2.2",
|
|
37
|
+
"zod": "^3.23.0"
|
|
38
|
+
},
|
|
39
|
+
"devDependencies": {
|
|
40
|
+
"@types/node": "^22.0.0",
|
|
41
|
+
"@types/turndown": "^5.0.5",
|
|
42
|
+
"typescript": "^5.7.0",
|
|
43
|
+
"vitest": "^3.0.0"
|
|
44
|
+
}
|
|
45
|
+
}
|