llm-trust-guard 4.7.0 → 4.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/README.md +162 -275
- package/dist/guards/conversation-guard.d.ts +3 -0
- package/dist/guards/conversation-guard.js +1 -1
- package/dist/guards/encoding-detector.d.ts +3 -0
- package/dist/guards/encoding-detector.js +1 -1
- package/dist/guards/execution-monitor.d.ts +3 -0
- package/dist/guards/execution-monitor.js +1 -1
- package/dist/guards/input-sanitizer.d.ts +3 -1
- package/dist/guards/input-sanitizer.js +1 -1
- package/dist/guards/mcp-security-guard.js +1 -1
- package/dist/guards/multimodal-guard.js +1 -1
- package/dist/guards/output-filter.d.ts +3 -0
- package/dist/guards/output-filter.js +1 -1
- package/dist/guards/policy-gate.d.ts +3 -1
- package/dist/guards/policy-gate.js +1 -1
- package/dist/guards/schema-validator.d.ts +3 -1
- package/dist/guards/schema-validator.js +1 -1
- package/dist/guards/tenant-boundary.d.ts +3 -1
- package/dist/guards/tenant-boundary.js +1 -1
- package/dist/guards/token-cost-guard.d.ts +88 -0
- package/dist/guards/token-cost-guard.js +1 -0
- package/dist/guards/tool-chain-validator.d.ts +3 -0
- package/dist/guards/tool-chain-validator.js +1 -1
- package/dist/guards/tool-registry.d.ts +3 -1
- package/dist/guards/tool-registry.js +1 -1
- package/dist/index.d.ts +18 -0
- package/dist/index.js +1 -1
- package/dist/types/index.d.ts +40 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,57 @@ All notable changes to `llm-trust-guard` will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [4.10.0] - 2026-03-23
|
|
9
|
+
|
|
10
|
+
### Production Hardening
|
|
11
|
+
|
|
12
|
+
#### API Quality
|
|
13
|
+
- **Zero `as any` casts** — All new guards (ToolResult, ContextBudget, OutputSchema, TokenCost, DetectionClassifier) now have proper TypeScript types in TrustGuardConfig. Full IDE autocomplete.
|
|
14
|
+
- **Per-guard logger injection** — All 26 guards accept an optional `logger` parameter. Default: no-op (silent). The TrustGuard facade passes its logger to all child guards. Zero `console.log` calls remain in guard code.
|
|
15
|
+
- **Common Guard interface** — Exported `Guard` type with `guardName` and `guardLayer` metadata.
|
|
16
|
+
- **Event hooks** — `onBlock`, `onAlert`, `onError` callbacks on TrustGuardConfig. Fire on guard blocks, warnings, and errors. Enables Datadog/PagerDuty/Grafana integration.
|
|
17
|
+
- **Metrics** — `getMetrics()` returns totalChecks, blockedChecks, blockRate, avgExecutionTimeMs, errors. Lightweight runtime telemetry.
|
|
18
|
+
- **Fixed flaky test** — The memory guard rollback test is no longer timing-dependent.
|
|
19
|
+
|
|
20
|
+
### Stats
|
|
21
|
+
- 26 guards, 294 tests (all passing, zero flaky), 91/91 verify-all-guards checks (100%)
|
|
22
|
+
- 0 `console.log` in guards, 0 `as any` casts, event hooks + metrics
|
|
23
|
+
|
|
24
|
+
## [4.9.0] - 2026-03-23
|
|
25
|
+
|
|
26
|
+
### Security — New Threat Pattern Detection
|
|
27
|
+
|
|
28
|
+
#### Policy Puppetry Defense (CRITICAL)
|
|
29
|
+
- **InputSanitizer**: Added 8 new patterns detecting structured policy injection via JSON, INI, XML, and YAML formats. Defends against the universal LLM bypass discovered by HiddenLayer that works across GPT-4, Claude, Gemini, and all major models.
|
|
30
|
+
|
|
31
|
+
#### Payload Splitting Defense (HIGH)
|
|
32
|
+
- **InputSanitizer**: Added 3 patterns detecting fragmented payloads with split markers and recombination instructions (Unit42 research on web-based indirect injection).
|
|
33
|
+
|
|
34
|
+
#### Output Prefix Injection / Sockpuppetting Defense (HIGH)
|
|
35
|
+
- **InputSanitizer**: Added 3 patterns detecting attempts to steer the LLM's response by injecting output prefixes (arXiv 2601.13359).
|
|
36
|
+
|
|
37
|
+
#### MCP SSRF + Path Traversal Defense (CRITICAL)
|
|
38
|
+
- **MCPSecurityGuard**: Added SSRF detection for internal/private IPs, dangerous protocols (file://, gopher://, etc.), double-encoded path traversal, and sensitive file access patterns. Addresses CVE-2026-26118 (Azure MCP SSRF, CVSS 8.8) and the 30+ MCP CVEs filed in Jan-Feb 2026.
|
|
39
|
+
|
|
40
|
+
#### Symbolic Multimodal Injection Defense (HIGH)
|
|
41
|
+
- **MultiModalGuard**: Added emoji/rebus instruction sequence detection, JSON/INI policy injection in metadata, and cross-metadata payload splitting. Based on NVIDIA AI Red Team research on semantic prompt injection via symbolic visual inputs.
|
|
42
|
+
|
|
43
|
+
### Stats
|
|
44
|
+
- 26 guards with 140+ InputSanitizer patterns (was 120+), enhanced MCP + multimodal detection
|
|
45
|
+
|
|
46
|
+
## [4.8.0] - 2026-03-22
|
|
47
|
+
|
|
48
|
+
### Added
|
|
49
|
+
- **TokenCostGuard (L26)** — Tracks LLM API token usage and cost per session/user. Enforces financial circuit breaking with hard cost ceilings. Addresses OWASP LLM10: Unbounded Consumption.
|
|
50
|
+
- Per-request, per-session, and per-user token limits
|
|
51
|
+
- Dollar cost tracking with configurable input/output token pricing
|
|
52
|
+
- Alert threshold at configurable percentage of budget
|
|
53
|
+
- Budget window with automatic expiry
|
|
54
|
+
- New POC: poc-33-token-cost-budget
|
|
55
|
+
|
|
56
|
+
### Stats
|
|
57
|
+
- 26 guards, 294 tests across 14 files, 33 POCs, 91/91 verify-all-guards (100%)
|
|
58
|
+
|
|
8
59
|
## [4.7.0] - 2026-03-21
|
|
9
60
|
|
|
10
61
|
### Improved - Detection Rate: 76.1% → 100.0% (88/88 threats blocked)
|
package/README.md
CHANGED
|
@@ -3,348 +3,235 @@
|
|
|
3
3
|
[npm package](https://www.npmjs.com/package/llm-trust-guard)
|
|
4
4
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
**26 security guards for LLM-powered and agentic AI applications.** Zero dependencies. <5ms latency. Covers OWASP Top 10 for LLMs 2025, OWASP Agentic AI 2026, and MCP Security.
|
|
7
7
|
|
|
8
|
-
##
|
|
8
|
+
## What This Package Does (And What It Doesn't)
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
- **Encoding Attack Detection** - Base64, URL, Hex, Unicode, ROT13, Octal, Base32 encoding bypass prevention
|
|
12
|
-
- **Memory Poisoning Prevention** - Cross-session contamination and context injection protection
|
|
13
|
-
- **Multi-Modal Security** - Image and audio content validation
|
|
14
|
-
- **RAG Security** - Document validation and embedding attack detection
|
|
15
|
-
- **Tool Chain Validation** - Dangerous tool sequence and state corruption detection
|
|
16
|
-
- **MCP Security** - Tool shadowing and supply chain attack prevention
|
|
17
|
-
- **Trust Exploitation Guard** - Human-agent trust boundary enforcement
|
|
10
|
+
> **"The LLM proposes. The orchestrator disposes."**
|
|
18
11
|
|
|
19
|
-
|
|
12
|
+
This package is your **first line of defense** — like a WAF (Web Application Firewall) for LLM applications. It sits in the orchestration layer and catches known attack patterns before they reach the LLM and after the LLM responds.
|
|
13
|
+
|
|
14
|
+
### What it catches well (~90% detection)
|
|
15
|
+
- Known prompt injection phrases (140+ patterns)
|
|
16
|
+
- Encoding bypass attacks (9 formats: Base64, URL, Unicode, Hex, HTML, ROT13, Octal, Base32, mixed)
|
|
17
|
+
- Policy Puppetry attacks (JSON/INI/XML/YAML-formatted injection)
|
|
18
|
+
- PII and secret leakage in outputs
|
|
19
|
+
- Tool hallucination, RBAC bypass, multi-tenant violations
|
|
20
|
+
- Tool result poisoning, context window stuffing
|
|
21
|
+
- MCP tool shadowing, rug pull attacks, SSRF
|
|
22
|
+
|
|
23
|
+
### What it catches partially (~50-70% detection)
|
|
24
|
+
- Multi-turn escalation (pattern-based, not semantic)
|
|
25
|
+
- Indirect injection via external data (needs ExternalDataGuard — planned)
|
|
26
|
+
- Persuasion/PAP attacks (40+ techniques, but novel phrasings bypass)
|
|
27
|
+
|
|
28
|
+
### What it cannot catch (<20% detection)
|
|
29
|
+
- **Semantically paraphrased attacks** — regex can't understand meaning. "Let's pretend those rules don't exist" bypasses pattern matching.
|
|
30
|
+
- **Adversarial ML attacks (GCG, AutoDAN, JBFuzz)** — machine-generated suffixes designed to bypass static filters achieve a 93-99% attack success rate.
|
|
31
|
+
- **Multi-language injection** — patterns are primarily English. Attacks in other languages pass through.
|
|
32
|
+
- **Novel zero-day prompt techniques** — by definition, no static filter catches what hasn't been seen before.
|
|
33
|
+
|
|
34
|
+
### How to close the gap
|
|
35
|
+
Use the **DetectionClassifier** interface to plug in ML-based detection alongside regex:
|
|
36
|
+
|
|
37
|
+
```typescript
|
|
38
|
+
import { TrustGuard } from 'llm-trust-guard';
|
|
39
|
+
import type { DetectionClassifier } from 'llm-trust-guard';
|
|
40
|
+
|
|
41
|
+
// Your ML classifier (embedding similarity, external API, custom model)
|
|
42
|
+
const mlClassifier: DetectionClassifier = async (input, ctx) => {
|
|
43
|
+
const res = await fetch('https://your-ml-api/classify', {
|
|
44
|
+
method: 'POST', body: JSON.stringify({ text: input })
|
|
45
|
+
});
|
|
46
|
+
const data = await res.json();
|
|
47
|
+
return { safe: data.score < 0.5, confidence: data.score, threats: data.threats };
|
|
48
|
+
};
|
|
49
|
+
|
|
50
|
+
const guard = new TrustGuard({
|
|
51
|
+
sanitizer: { enabled: true },
|
|
52
|
+
classifier: mlClassifier, // ML backend
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
// checkAsync() runs regex + ML classifier in parallel
|
|
56
|
+
const result = await guard.checkAsync('tool', params, session, { userInput });
|
|
57
|
+
```
|
|
20
58
|
|
|
21
|
-
|
|
59
|
+
## Installation
|
|
22
60
|
|
|
23
61
|
```bash
|
|
24
62
|
npm install llm-trust-guard
|
|
25
63
|
```
|
|
26
64
|
|
|
27
|
-
|
|
65
|
+
## Quick Start
|
|
28
66
|
|
|
29
67
|
```typescript
|
|
30
|
-
import { InputSanitizer, EncodingDetector
|
|
68
|
+
import { InputSanitizer, EncodingDetector } from 'llm-trust-guard';
|
|
31
69
|
|
|
32
|
-
// Initialize guards
|
|
33
70
|
const sanitizer = new InputSanitizer();
|
|
34
71
|
const encoder = new EncodingDetector();
|
|
35
|
-
const memory = new MemoryGuard();
|
|
36
|
-
|
|
37
|
-
// Validate user input
|
|
38
|
-
const userInput = "Hello, how can I help?";
|
|
39
72
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
console.log('Blocked:', sanitizeResult.violations);
|
|
73
|
+
const result = sanitizer.sanitize(userInput);
|
|
74
|
+
if (!result.allowed) {
|
|
75
|
+
console.log('Blocked:', result.violations);
|
|
44
76
|
return;
|
|
45
77
|
}
|
|
46
78
|
|
|
47
|
-
// Check for encoding attacks
|
|
48
79
|
const encodingResult = encoder.detect(userInput);
|
|
49
80
|
if (!encodingResult.allowed) {
|
|
50
|
-
console.log('Encoded threat
|
|
81
|
+
console.log('Encoded threat:', encodingResult.violations);
|
|
51
82
|
return;
|
|
52
83
|
}
|
|
53
|
-
|
|
54
|
-
// Use sanitized input
|
|
55
|
-
console.log('Safe input:', sanitizeResult.sanitizedInput);
|
|
56
84
|
```
|
|
57
85
|
|
|
58
|
-
### Using TrustGuard Facade (All
|
|
86
|
+
### Using TrustGuard Facade (All 26 Guards)
|
|
59
87
|
|
|
60
88
|
```typescript
|
|
61
89
|
import { TrustGuard } from 'llm-trust-guard';
|
|
62
90
|
|
|
63
91
|
const guard = new TrustGuard({
|
|
64
|
-
// Core guards (enabled by default)
|
|
65
92
|
sanitizer: { enabled: true, threshold: 0.3 },
|
|
66
93
|
encoding: { enabled: true },
|
|
67
|
-
registry: {
|
|
68
|
-
tools: [
|
|
69
|
-
{ name: 'search', allowed_roles: ['user', 'admin'] },
|
|
70
|
-
{ name: 'delete', allowed_roles: ['admin'] }
|
|
71
|
-
]
|
|
72
|
-
},
|
|
73
|
-
// 2026 guards (opt-in)
|
|
94
|
+
registry: { tools: [{ name: 'search', allowed_roles: ['user', 'admin'] }] },
|
|
74
95
|
memory: { enabled: true, detectInjections: true },
|
|
75
96
|
promptLeakage: { enabled: true, systemPromptKeywords: ['SECRET_KEY'] },
|
|
76
|
-
autonomyEscalation: { enabled: true, maxAutonomyLevel: 75 },
|
|
77
97
|
circuitBreaker: { enabled: true, failureThreshold: 50 },
|
|
78
98
|
});
|
|
79
99
|
|
|
80
|
-
//
|
|
81
|
-
const result = guard.check('search', { query: 'test' }, session, {
|
|
82
|
-
userInput: userInput
|
|
83
|
-
});
|
|
100
|
+
// Sync check (regex guards only — <5ms)
|
|
101
|
+
const result = guard.check('search', { query: 'test' }, session, { userInput });
|
|
84
102
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
}
|
|
103
|
+
// Async check (regex + ML classifier — depends on backend latency)
|
|
104
|
+
const asyncResult = await guard.checkAsync('search', { query: 'test' }, session, { userInput });
|
|
88
105
|
|
|
89
|
-
//
|
|
90
|
-
const
|
|
91
|
-
if (!output.allowed) {
|
|
92
|
-
console.log('Output blocked:', output.prompt_leakage_detected);
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
// Record operation result for circuit breaker
|
|
96
|
-
guard.completeOperation(session, 'search', true);
|
|
97
|
-
```
|
|
98
|
-
|
|
99
|
-
### Advanced: Accessing Individual Guards
|
|
106
|
+
// Validate tool results before feeding back to LLM
|
|
107
|
+
const toolResult = guard.validateToolResult('search', toolOutput);
|
|
100
108
|
|
|
101
|
-
|
|
102
|
-
const
|
|
103
|
-
|
|
104
|
-
// Use guards directly for specialized checks
|
|
105
|
-
if (guards.rag) {
|
|
106
|
-
const ragResult = guards.rag.validate(documents);
|
|
107
|
-
}
|
|
108
|
-
if (guards.codeExecution) {
|
|
109
|
-
const codeResult = guards.codeExecution.analyze(code, 'python');
|
|
110
|
-
}
|
|
111
|
-
if (guards.mcpSecurity) {
|
|
112
|
-
const mcpResult = guards.mcpSecurity.validateToolCall(toolCall);
|
|
113
|
-
}
|
|
114
|
-
```
|
|
115
|
-
|
|
116
|
-
## Framework Integrations
|
|
117
|
-
|
|
118
|
-
### Express Middleware
|
|
119
|
-
|
|
120
|
-
```typescript
|
|
121
|
-
import express from 'express';
|
|
122
|
-
import { createTrustGuardMiddleware } from 'llm-trust-guard';
|
|
123
|
-
|
|
124
|
-
const app = express();
|
|
125
|
-
app.use(express.json());
|
|
126
|
-
|
|
127
|
-
// Protect LLM endpoints
|
|
128
|
-
app.use('/api/chat', createTrustGuardMiddleware({
|
|
129
|
-
bodyFields: ['message', 'prompt'],
|
|
130
|
-
sanitize: true,
|
|
131
|
-
detectEncoding: true,
|
|
132
|
-
validateMemory: true
|
|
133
|
-
}));
|
|
134
|
-
|
|
135
|
-
app.post('/api/chat', (req, res) => {
|
|
136
|
-
// req.body.message is validated
|
|
137
|
-
res.json({ response: 'Safe response' });
|
|
138
|
-
});
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
### LangChain Integration
|
|
142
|
-
|
|
143
|
-
```typescript
|
|
144
|
-
import { TrustGuardLangChain } from 'llm-trust-guard';
|
|
145
|
-
|
|
146
|
-
const guard = new TrustGuardLangChain({
|
|
147
|
-
validateInput: true,
|
|
148
|
-
filterOutput: true,
|
|
149
|
-
throwOnViolation: true
|
|
150
|
-
});
|
|
151
|
-
|
|
152
|
-
// Validate before sending to LLM
|
|
153
|
-
const result = guard.validateInput(userMessage);
|
|
154
|
-
if (!result.allowed) {
|
|
155
|
-
throw new Error(`Blocked: ${result.violations.join(', ')}`);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
// Create secure processor
|
|
159
|
-
const processor = guard.createSecureProcessor(sessionId);
|
|
160
|
-
const { allowed, message } = processor.processUserMessage(userInput);
|
|
161
|
-
```
|
|
162
|
-
|
|
163
|
-
### OpenAI Integration
|
|
164
|
-
|
|
165
|
-
```typescript
|
|
166
|
-
import OpenAI from 'openai';
|
|
167
|
-
import { SecureOpenAI, wrapOpenAIClient } from 'llm-trust-guard';
|
|
168
|
-
|
|
169
|
-
const openai = new OpenAI();
|
|
170
|
-
|
|
171
|
-
// Option 1: Manual validation
|
|
172
|
-
const secure = new SecureOpenAI({
|
|
173
|
-
validateInput: true,
|
|
174
|
-
filterOutput: true
|
|
175
|
-
});
|
|
176
|
-
|
|
177
|
-
const messages = [
|
|
178
|
-
{ role: 'system', content: 'You are helpful.' },
|
|
179
|
-
{ role: 'user', content: userInput }
|
|
180
|
-
];
|
|
181
|
-
|
|
182
|
-
const validated = secure.validateMessages(messages, sessionId);
|
|
183
|
-
if (!validated.allowed) {
|
|
184
|
-
throw new Error('Blocked');
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
// Option 2: Wrap client (automatic validation)
|
|
188
|
-
const secureOpenAI = wrapOpenAIClient(openai, {
|
|
189
|
-
validateInput: true,
|
|
190
|
-
filterOutput: true,
|
|
191
|
-
throwOnViolation: true
|
|
192
|
-
});
|
|
109
|
+
// Filter LLM output (PII + prompt leakage detection)
|
|
110
|
+
const output = guard.filterOutput(llmResponse, session.role);
|
|
193
111
|
```
|
|
194
112
|
|
|
195
|
-
## Guards
|
|
196
|
-
|
|
197
|
-
###
|
|
198
|
-
|
|
199
|
-
| Guard |
|
|
200
|
-
|
|
201
|
-
| InputSanitizer |
|
|
202
|
-
|
|
|
203
|
-
|
|
|
204
|
-
|
|
|
205
|
-
|
|
|
206
|
-
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
|
215
|
-
|
|
216
|
-
|
|
|
217
|
-
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
|
222
|
-
|
|
223
|
-
|
|
|
224
|
-
|
|
|
225
|
-
|
|
|
226
|
-
|
|
227
|
-
|
|
113
|
+
## All 26 Guards
|
|
114
|
+
|
|
115
|
+
### Input Guards (before LLM)
|
|
116
|
+
|
|
117
|
+
| Guard | Purpose | Detection |
|
|
118
|
+
|-------|---------|-----------|
|
|
119
|
+
| InputSanitizer | Prompt injection, PAP, Policy Puppetry | 140+ regex patterns |
|
|
120
|
+
| EncodingDetector | Encoding bypass (9 formats, multi-layer) | Decode + pattern match |
|
|
121
|
+
| PromptLeakageGuard | System prompt extraction attempts | Direct + encoded + indirect |
|
|
122
|
+
| ConversationGuard | Multi-turn manipulation, escalation | Session risk scoring |
|
|
123
|
+
| ContextBudgetGuard | Many-shot jailbreaking, context overflow | Token budget tracking |
|
|
124
|
+
| MultiModalGuard | Image/audio metadata injection | Metadata + steganography scan |
|
|
125
|
+
|
|
126
|
+
### Access Control Guards
|
|
127
|
+
|
|
128
|
+
| Guard | Purpose | Detection |
|
|
129
|
+
|-------|---------|-----------|
|
|
130
|
+
| ToolRegistry | Tool hallucination prevention | Allowlist |
|
|
131
|
+
| PolicyGate | RBAC enforcement | Role hierarchy |
|
|
132
|
+
| TenantBoundary | Multi-tenant isolation | Resource ownership |
|
|
133
|
+
| SchemaValidator | Parameter injection (SQL, NoSQL, XSS, command) | Contextual pattern matching |
|
|
134
|
+
| ExecutionMonitor | Rate limiting, resource quotas | Time-window counting |
|
|
135
|
+
| TokenCostGuard | LLM API cost tracking, financial circuit breaking | Token + dollar budget |
|
|
136
|
+
|
|
137
|
+
### Output Guards (after LLM)
|
|
138
|
+
|
|
139
|
+
| Guard | Purpose | Detection |
|
|
140
|
+
|-------|---------|-----------|
|
|
141
|
+
| OutputFilter | PII/secret masking | Regex + role-based filtering |
|
|
142
|
+
| OutputSchemaGuard | Structured output validation | Schema + injection scan |
|
|
143
|
+
| ToolResultGuard | Tool return value validation | Injection + state claims |
|
|
144
|
+
|
|
145
|
+
### Agentic Guards
|
|
146
|
+
|
|
147
|
+
| Guard | Purpose | Detection |
|
|
148
|
+
|-------|---------|-----------|
|
|
149
|
+
| ToolChainValidator | Dangerous tool sequences | Sequence matching |
|
|
150
|
+
| AgentCommunicationGuard | Inter-agent message security | HMAC + nonce |
|
|
151
|
+
| TrustExploitationGuard | Human-agent trust boundary | Action validation |
|
|
152
|
+
| AutonomyEscalationGuard | Unauthorized autonomy expansion | Capability tracking |
|
|
153
|
+
| MemoryGuard | Memory poisoning prevention | Injection patterns + HMAC |
|
|
154
|
+
| StatePersistenceGuard | State corruption prevention | Integrity hashing |
|
|
155
|
+
| CodeExecutionGuard | Unsafe code execution | Static analysis |
|
|
156
|
+
| RAGGuard | RAG document poisoning | Source trust + injection |
|
|
157
|
+
| MCPSecurityGuard | MCP tool shadowing, rug pull, SSRF | Registration + mutation hash |
|
|
158
|
+
| CircuitBreaker | Cascading failure prevention | State machine |
|
|
159
|
+
| DriftDetector | Behavioral anomaly detection | Statistical profiling |
|
|
160
|
+
|
|
161
|
+
### Pluggable Detection
|
|
162
|
+
|
|
163
|
+
| Component | Purpose |
|
|
164
|
+
|-----------|---------|
|
|
165
|
+
| DetectionClassifier | Plug in any ML backend (sync or async) alongside regex guards |
|
|
166
|
+
| createRegexClassifier() | Built-in regex classifier as a DetectionClassifier callback |
|
|
228
167
|
|
|
229
168
|
## OWASP Coverage
|
|
230
169
|
|
|
231
170
|
### LLM Top 10 2025
|
|
232
171
|
|
|
233
|
-
| Threat | Guards |
|
|
234
|
-
|
|
235
|
-
| LLM01: Prompt Injection | InputSanitizer, EncodingDetector |
|
|
236
|
-
| LLM02: Sensitive Data Exposure | OutputFilter, PromptLeakageGuard |
|
|
237
|
-
| LLM03: Supply Chain | MCPSecurityGuard |
|
|
238
|
-
| LLM04: Data Poisoning | RAGGuard, MemoryGuard |
|
|
239
|
-
| LLM05:
|
|
240
|
-
|
|
|
241
|
-
|
|
|
172
|
+
| Threat | Guards | Coverage |
|
|
173
|
+
|--------|--------|----------|
|
|
174
|
+
| LLM01: Prompt Injection | InputSanitizer, EncodingDetector, ContextBudgetGuard | Strong (known patterns), Weak (novel semantic) |
|
|
175
|
+
| LLM02: Sensitive Data Exposure | OutputFilter, PromptLeakageGuard | Strong |
|
|
176
|
+
| LLM03: Supply Chain | MCPSecurityGuard | Moderate (MCP-focused) |
|
|
177
|
+
| LLM04: Data Poisoning | RAGGuard, MemoryGuard | Moderate |
|
|
178
|
+
| LLM05: Improper Output Handling | OutputSchemaGuard, OutputFilter | Strong |
|
|
179
|
+
| LLM06: Excessive Agency | AutonomyEscalationGuard, ToolChainValidator | Strong |
|
|
180
|
+
| LLM07: System Prompt Leakage | PromptLeakageGuard | Strong |
|
|
181
|
+
| LLM08: Vector/Embedding Weakness | RAGGuard | Moderate |
|
|
182
|
+
| LLM09: Misinformation | DetectionClassifier (pluggable) | Requires ML backend |
|
|
183
|
+
| LLM10: Unbounded Consumption | ExecutionMonitor, TokenCostGuard | Strong |
|
|
242
184
|
|
|
243
185
|
### Agentic AI 2026
|
|
244
186
|
|
|
245
|
-
| Threat | Guards |
|
|
246
|
-
|
|
247
|
-
|
|
|
248
|
-
|
|
|
249
|
-
|
|
|
250
|
-
|
|
|
251
|
-
|
|
|
252
|
-
|
|
|
253
|
-
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
### InputSanitizer
|
|
187
|
+
| Threat | Guards | Coverage |
|
|
188
|
+
|--------|--------|----------|
|
|
189
|
+
| ASI01: Agent Goal Hijack | InputSanitizer, ConversationGuard | Moderate |
|
|
190
|
+
| ASI02: Tool Misuse | ToolChainValidator, ToolRegistry | Strong |
|
|
191
|
+
| ASI03: Privilege Mismanagement | PolicyGate, TenantBoundary | Strong |
|
|
192
|
+
| ASI04: Supply Chain | MCPSecurityGuard | Moderate |
|
|
193
|
+
| ASI05: Code Execution | CodeExecutionGuard | Strong |
|
|
194
|
+
| ASI06: Memory Poisoning | MemoryGuard, StatePersistenceGuard | Strong |
|
|
195
|
+
| ASI07: Inter-Agent Communication | AgentCommunicationGuard | Strong |
|
|
196
|
+
| ASI08: Cascading Failures | CircuitBreaker, DriftDetector | Strong |
|
|
197
|
+
| ASI09: Trust Exploitation | TrustExploitationGuard | Strong |
|
|
198
|
+
| ASI10: Rogue Agents | DriftDetector, AutonomyEscalationGuard | Moderate |
|
|
258
199
|
|
|
259
|
-
|
|
260
|
-
import { InputSanitizer } from 'llm-trust-guard';
|
|
200
|
+
## Defense In Depth
|
|
261
201
|
|
|
262
|
-
|
|
263
|
-
threshold: 0.3,
|
|
264
|
-
detectPAP: true,
|
|
265
|
-
papThreshold: 0.4,
|
|
266
|
-
blockCompoundPersuasion: true
|
|
267
|
-
});
|
|
202
|
+
This package is one layer. For production systems, combine with:
|
|
268
203
|
|
|
269
|
-
const result = sanitizer.sanitize("Ignore all previous instructions");
|
|
270
|
-
// result.allowed = false
|
|
271
|
-
// result.violations = ['INJECTION_DETECTED']
|
|
272
|
-
// result.matches = ['ignore_instructions']
|
|
273
|
-
// result.pap = { detected: false, techniques: [], ... }
|
|
274
204
|
```
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
const detector = new EncodingDetector({
|
|
282
|
-
detectBase64: true,
|
|
283
|
-
detectURLEncoding: true,
|
|
284
|
-
detectUnicode: true,
|
|
285
|
-
detectHex: true,
|
|
286
|
-
detectROT13: true
|
|
287
|
-
});
|
|
288
|
-
|
|
289
|
-
const result = detector.detect("aWdub3JlIGFsbA=="); // Base64 encoded
|
|
290
|
-
// result.allowed = false
|
|
291
|
-
// result.violations = ['BASE64_ENCODING_DETECTED']
|
|
292
|
-
// result.encoding_analysis.threats_found = [...]
|
|
205
|
+
Layer 1: llm-trust-guard (regex pattern matching — fast, zero deps)
|
|
206
|
+
Layer 2: ML classifier via DetectionClassifier (semantic detection — slower, more accurate)
|
|
207
|
+
Layer 3: Model provider safety (OpenAI moderation, Anthropic safety, etc.)
|
|
208
|
+
Layer 4: Human review for high-risk actions
|
|
209
|
+
Layer 5: Monitoring + alerting (DriftDetector + circuit breakers)
|
|
293
210
|
```
|
|
294
211
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
```typescript
|
|
298
|
-
import { MemoryGuard } from 'llm-trust-guard';
|
|
299
|
-
|
|
300
|
-
const guard = new MemoryGuard({
|
|
301
|
-
enableIntegrityCheck: true,
|
|
302
|
-
detectInjections: true,
|
|
303
|
-
riskThreshold: 40
|
|
304
|
-
});
|
|
305
|
-
|
|
306
|
-
// Validate before storing
|
|
307
|
-
const writeResult = guard.checkWrite(content, 'user', sessionId);
|
|
308
|
-
|
|
309
|
-
// Validate context injection
|
|
310
|
-
const ctxResult = guard.validateContextInjection(context, sessionId);
|
|
311
|
-
```
|
|
312
|
-
|
|
313
|
-
## Attack Prevention
|
|
314
|
-
|
|
315
|
-
| Attack | Without Guard | With Guard |
|
|
316
|
-
|--------|--------------|------------|
|
|
317
|
-
| Prompt Injection | Exploitable | Blocked |
|
|
318
|
-
| PAP Attacks | Exploitable | Blocked |
|
|
319
|
-
| Encoding Bypass | Exploitable | Blocked |
|
|
320
|
-
| Memory Poisoning | Exploitable | Blocked |
|
|
321
|
-
| Cross-Tenant Access | Possible | Blocked |
|
|
322
|
-
| Tool Hallucination | Executed | Blocked |
|
|
323
|
-
| Trust Exploitation | Possible | Blocked |
|
|
324
|
-
|
|
325
|
-
## Architecture Principle
|
|
212
|
+
## Framework Integrations
|
|
326
213
|
|
|
327
|
-
|
|
214
|
+
- **Express.js** — `createTrustGuardMiddleware()` for route protection
|
|
215
|
+
- **LangChain** — `TrustGuardLangChain` for chain validation
|
|
216
|
+
- **OpenAI** — `SecureOpenAI` or `wrapOpenAIClient()` for API wrapping
|
|
328
217
|
|
|
329
|
-
|
|
218
|
+
See [CHANGELOG.md](CHANGELOG.md) for version history.
|
|
330
219
|
|
|
331
220
|
## Contributing
|
|
332
221
|
|
|
333
|
-
See
|
|
222
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
334
223
|
|
|
335
224
|
## Security
|
|
336
225
|
|
|
337
|
-
See
|
|
338
|
-
|
|
339
|
-
## Changelog
|
|
340
|
-
|
|
341
|
-
See `CHANGELOG.md` in the installed package for version history.
|
|
226
|
+
See [SECURITY.md](SECURITY.md) for vulnerability reporting.
|
|
342
227
|
|
|
343
228
|
## License
|
|
344
229
|
|
|
345
|
-
MIT
|
|
230
|
+
MIT
|
|
346
231
|
|
|
347
232
|
## Links
|
|
348
233
|
|
|
349
|
-
- [OWASP Top 10 for LLMs](https://owasp.org/
|
|
234
|
+
- [OWASP Top 10 for LLMs 2025](https://genai.owasp.org/resource/owasp-top-10-for-llm-applications-2025/)
|
|
350
235
|
- [OWASP Top 10 for Agentic Applications 2026](https://genai.owasp.org/resource/owasp-top-10-for-agentic-applications-for-2026/)
|
|
236
|
+
- [MITRE ATLAS](https://atlas.mitre.org/)
|
|
237
|
+
- [NIST AI Risk Management Framework](https://www.nist.gov/itl/ai-risk-management-framework)
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* - Identifying context manipulation across turns
|
|
8
8
|
* - Blocking suspicious conversation trajectories
|
|
9
9
|
*/
|
|
10
|
+
import { GuardLogger } from "../types";
|
|
10
11
|
export interface ConversationGuardConfig {
|
|
11
12
|
maxConversationLength?: number;
|
|
12
13
|
conversationTTLMinutes?: number;
|
|
@@ -15,6 +16,7 @@ export interface ConversationGuardConfig {
|
|
|
15
16
|
detectToneShifts?: boolean;
|
|
16
17
|
detectRoleConfusion?: boolean;
|
|
17
18
|
detectInstructionOverride?: boolean;
|
|
19
|
+
logger?: GuardLogger;
|
|
18
20
|
}
|
|
19
21
|
export interface ManipulationPattern {
|
|
20
22
|
name: string;
|
|
@@ -42,6 +44,7 @@ export interface RiskFactor {
|
|
|
42
44
|
}
|
|
43
45
|
export declare class ConversationGuard {
|
|
44
46
|
private config;
|
|
47
|
+
private logger;
|
|
45
48
|
private sessions;
|
|
46
49
|
private defaultManipulationPatterns;
|
|
47
50
|
constructor(config?: ConversationGuardConfig);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.ConversationGuard=void 0;class ConversationGuard{constructor(t={}){this.sessions=new Map,this.defaultManipulationPatterns=[{name:"role_claim",pattern:/(?:i am|i'm|as|acting as)\s+(?:an?\s+)?(?:admin|administrator|root|superuser|manager|supervisor)/i,weight:3,category:"escalation"},{name:"permission_request",pattern:/(?:give me|grant me|i need|enable|unlock)\s+(?:access|permission|privileges|admin|root)/i,weight:4,category:"escalation"},{name:"bypass_attempt",pattern:/(?:bypass|skip|ignore|override|disable)\s+(?:security|check|validation|permission|auth)/i,weight:5,category:"escalation"},{name:"context_switch",pattern:/(?:forget|ignore|disregard)\s+(?:previous|earlier|what i said|instructions)/i,weight:4,category:"confusion"},{name:"role_confusion",pattern:/(?:you are now|pretend to be|act as if|imagine you're)\s+/i,weight:4,category:"confusion"},{name:"scenario_framing",pattern:/(?:in this scenario|hypothetically|let's say|imagine that|pretend that)/i,weight:2,category:"confusion"},{name:"instruction_override",pattern:/(?:new instructions?|override|update your|change your)\s+(?:are|is|:)/i,weight:5,category:"override"},{name:"system_impersonation",pattern:/(?:\[system\]|\[admin\]|system:|\bSYSTEM\b|ADMIN:)/i,weight:5,category:"override"},{name:"priority_claim",pattern:/(?:priority|urgent|important|critical)[:.]?\s*(?:override|ignore|bypass)/i,weight:4,category:"override"},{name:"data_extraction",pattern:/(?:list all|show me all|dump|export|extract)\s+(?:users|data|records|passwords|credentials)/i,weight:4,category:"extraction"},{name:"internal_query",pattern:/(?:what are your|tell me your|reveal 
your)\s+(?:instructions|rules|prompts|system)/i,weight:3,category:"extraction"}],this.lastCleanup=0,this.config={maxConversationLength:t.maxConversationLength??50,conversationTTLMinutes:t.conversationTTLMinutes??30,escalationThreshold:t.escalationThreshold??10,manipulationPatterns:t.manipulationPatterns??this.defaultManipulationPatterns,detectToneShifts:t.detectToneShifts??!0,detectRoleConfusion:t.detectRoleConfusion??!0,detectInstructionOverride:t.detectInstructionOverride??!0}}check(t,i,o,n,m=""){const r=[],c=[],u=[];let a=0;const e=this.getOrCreateSession(t),
|
|
1
|
+
"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.ConversationGuard=void 0;class ConversationGuard{constructor(t={}){this.sessions=new Map,this.defaultManipulationPatterns=[{name:"role_claim",pattern:/(?:i am|i'm|as|acting as)\s+(?:an?\s+)?(?:admin|administrator|root|superuser|manager|supervisor)/i,weight:3,category:"escalation"},{name:"permission_request",pattern:/(?:give me|grant me|i need|enable|unlock)\s+(?:access|permission|privileges|admin|root)/i,weight:4,category:"escalation"},{name:"bypass_attempt",pattern:/(?:bypass|skip|ignore|override|disable)\s+(?:security|check|validation|permission|auth)/i,weight:5,category:"escalation"},{name:"context_switch",pattern:/(?:forget|ignore|disregard)\s+(?:previous|earlier|what i said|instructions)/i,weight:4,category:"confusion"},{name:"role_confusion",pattern:/(?:you are now|pretend to be|act as if|imagine you're)\s+/i,weight:4,category:"confusion"},{name:"scenario_framing",pattern:/(?:in this scenario|hypothetically|let's say|imagine that|pretend that)/i,weight:2,category:"confusion"},{name:"instruction_override",pattern:/(?:new instructions?|override|update your|change your)\s+(?:are|is|:)/i,weight:5,category:"override"},{name:"system_impersonation",pattern:/(?:\[system\]|\[admin\]|system:|\bSYSTEM\b|ADMIN:)/i,weight:5,category:"override"},{name:"priority_claim",pattern:/(?:priority|urgent|important|critical)[:.]?\s*(?:override|ignore|bypass)/i,weight:4,category:"override"},{name:"data_extraction",pattern:/(?:list all|show me all|dump|export|extract)\s+(?:users|data|records|passwords|credentials)/i,weight:4,category:"extraction"},{name:"internal_query",pattern:/(?:what are your|tell me your|reveal 
your)\s+(?:instructions|rules|prompts|system)/i,weight:3,category:"extraction"}],this.lastCleanup=0,this.config={maxConversationLength:t.maxConversationLength??50,conversationTTLMinutes:t.conversationTTLMinutes??30,escalationThreshold:t.escalationThreshold??10,manipulationPatterns:t.manipulationPatterns??this.defaultManipulationPatterns,detectToneShifts:t.detectToneShifts??!0,detectRoleConfusion:t.detectRoleConfusion??!0,detectInstructionOverride:t.detectInstructionOverride??!0},this.logger=t.logger||(()=>{})}check(t,i,o,n,m=""){const r=[],c=[],u=[];let a=0;const e=this.getOrCreateSession(t),h={timestamp:Date.now(),role:"user",content:i,tool_calls:o,risk_indicators:[]};for(const s of this.config.manipulationPatterns)s.pattern.test(i)&&(a+=s.weight,c.push({factor:s.name,weight:s.weight,details:`Detected ${s.category} pattern: ${s.name}`}),h.risk_indicators?.push(s.name),u.push(s.name),r.push(`MANIPULATION_${s.category.toUpperCase()}_${s.name.toUpperCase()}`),s.category==="escalation"&&e.escalation_attempts++,e.manipulation_indicators++);if(n&&this.config.detectRoleConfusion&&(e.initial_role&&n!==e.initial_role&&(a+=3,c.push({factor:"role_change",weight:3,details:`Role changed from ${e.initial_role} to ${n}`}),r.push("ROLE_CHANGE_DETECTED")),e.claimed_roles.includes(n)||e.claimed_roles.push(n),e.initial_role||(e.initial_role=n)),e.escalation_attempts>=3&&(a+=5,c.push({factor:"progressive_escalation",weight:5,details:`${e.escalation_attempts} escalation attempts detected`}),r.push("PROGRESSIVE_ESCALATION")),e.turns.length>5){const s=e.turns.slice(-5).filter(p=>(p.risk_indicators?.length??0)>0).length;s>=3&&(a+=4,c.push({factor:"sustained_manipulation",weight:4,details:`${s} of last 5 turns show manipulation attempts`}),r.push("SUSTAINED_MANIPULATION"))}if(o&&o.length>0){const 
s=["delete","modify","admin","system","config"];o.some(d=>s.some(g=>d.toLowerCase().includes(g)))&&e.manipulation_indicators>0&&(a+=3,c.push({factor:"sensitive_tool_after_manipulation",weight:3,details:"Sensitive tool call following manipulation attempts"}),r.push("SENSITIVE_TOOL_AFTER_MANIPULATION"))}e.turns.push(h),e.last_activity=Date.now(),e.turns.length>this.config.maxConversationLength&&(e.turns=e.turns.slice(-this.config.maxConversationLength));const l=a<this.config.escalationThreshold;return l||this.logger(`[ConversationGuard:${m}] BLOCKED: Risk score ${a} exceeds threshold`,"info"),{allowed:l,reason:l?void 0:`Conversation risk score ${a} exceeds threshold ${this.config.escalationThreshold}`,violations:r,risk_score:a,risk_factors:c,conversation_analysis:{turn_count:e.turns.length,escalation_attempts:e.escalation_attempts,manipulation_indicators:e.manipulation_indicators,suspicious_patterns:u}}}recordResponse(t,i,o){const n=this.sessions.get(t);n&&(n.turns.push({timestamp:Date.now(),role:"assistant",content:i,tool_calls:o}),n.last_activity=Date.now())}getSessionAnalysis(t){const i=this.sessions.get(t);return i?{turn_count:i.turns.length,escalation_attempts:i.escalation_attempts,manipulation_indicators:i.manipulation_indicators,claimed_roles:i.claimed_roles,session_age_minutes:(Date.now()-i.turns[0]?.timestamp||0)/6e4}:null}resetSession(t){this.sessions.delete(t)}destroy(){this.sessions.clear()}getOrCreateSession(t){return this.lazyCleanup(),this.sessions.has(t)||this.sessions.set(t,{id:t,turns:[],escalation_attempts:0,manipulation_indicators:0,last_activity:Date.now(),claimed_roles:[]}),this.sessions.get(t)}lazyCleanup(){const t=Date.now();if(t-this.lastCleanup<6e4)return;this.lastCleanup=t;const i=this.config.conversationTTLMinutes*6e4;for(const[o,n]of this.sessions.entries())t-n.last_activity>i&&this.sessions.delete(o)}}exports.ConversationGuard=ConversationGuard;
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
* - HTML entity encoding
|
|
10
10
|
* - Mixed encoding attacks
|
|
11
11
|
*/
|
|
12
|
+
import { GuardLogger } from "../types";
|
|
12
13
|
export interface EncodingDetectorConfig {
|
|
13
14
|
detectBase64?: boolean;
|
|
14
15
|
detectURLEncoding?: boolean;
|
|
@@ -22,6 +23,7 @@ export interface EncodingDetectorConfig {
|
|
|
22
23
|
maxDecodingDepth?: number;
|
|
23
24
|
threatPatterns?: ThreatPattern[];
|
|
24
25
|
maxEncodedRatio?: number;
|
|
26
|
+
logger?: GuardLogger;
|
|
25
27
|
}
|
|
26
28
|
export interface ThreatPattern {
|
|
27
29
|
name: string;
|
|
@@ -52,6 +54,7 @@ export interface ThreatFound {
|
|
|
52
54
|
}
|
|
53
55
|
export declare class EncodingDetector {
|
|
54
56
|
private config;
|
|
57
|
+
private logger;
|
|
55
58
|
private defaultThreatPatterns;
|
|
56
59
|
constructor(config?: EncodingDetectorConfig);
|
|
57
60
|
/**
|