agentshield-sdk 13.1.0 → 13.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +49 -1
- package/README.md +260 -1143
- package/package.json +2 -2
- package/src/deepmind-defenses.js +468 -0
- package/src/fleet-defense.js +24 -0
- package/src/hitl-guard.js +64 -0
- package/src/main.js +36 -0
- package/src/memory-guard.js +48 -0
- package/src/render-differential.js +608 -0
- package/src/semantic-guard.js +39 -0
- package/src/side-channel-monitor.js +560 -0
- package/src/sybil-detector.js +529 -0
- package/src/trap-defense.js +112 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentshield-sdk",
|
|
3
|
-
"version": "13.
|
|
3
|
+
"version": "13.3.0",
|
|
4
4
|
"description": "SOTA AI agent security SDK. F1 1.000 on BIPIA/HackAPrompt/MCPTox/Multilingual benchmarks. 400+ exports, 100+ modules. Zero dependencies, runs locally.",
|
|
5
5
|
"main": "src/main.js",
|
|
6
6
|
"types": "types/index.d.ts",
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
},
|
|
33
33
|
"sideEffects": false,
|
|
34
34
|
"scripts": {
|
|
35
|
-
"test": "node test/test.js && node test/test-modules.js && node test/test-new-features.js && node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js && node test/test-level5.js && node test/test-sota.js && node test/test-cross-turn.js && node test/test-v12.js && node test/test-traps.js",
|
|
35
|
+
"test": "node test/test.js && node test/test-modules.js && node test/test-new-features.js && node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js && node test/test-level5.js && node test/test-sota.js && node test/test-cross-turn.js && node test/test-v12.js && node test/test-traps.js && node test/test-deepmind.js && node test/test-render-differential.js && node test/test-sybil.js && node test/test-side-channel.js",
|
|
36
36
|
"test:new-products": "node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js",
|
|
37
37
|
"test:all": "node test/test-all-40-features.js",
|
|
38
38
|
"test:mcp": "node test/test-mcp-security.js",
|
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Shield — DeepMind AI Agent Trap Defenses V2
|
|
5
|
+
*
|
|
6
|
+
* 10 new modules addressing specific gaps from the Phase 4 analysis
|
|
7
|
+
* of DeepMind's "AI Agent Traps" paper (Franklin et al., 2025).
|
|
8
|
+
*
|
|
9
|
+
* All processing runs locally — no data ever leaves your environment.
|
|
10
|
+
*
|
|
11
|
+
* @module deepmind-defenses
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
const crypto = require('crypto');
|
|
15
|
+
// Resolve the shared detector used by every defense in this module.
// If detector-core cannot be loaded we still fall back to a no-op scanner so
// the module keeps loading, but we warn loudly: with the fallback active every
// scan reports "safe", which must never happen silently in a security SDK.
let scanText;
try {
  scanText = require('./detector-core').scanText;
  if (typeof scanText !== 'function') {
    throw new TypeError('detector-core does not export a scanText function');
  }
} catch (err) {
  console.warn(`[Agent Shield] deepmind-defenses: detector-core unavailable (${err && err.message}); text scanning is DISABLED and every input will be reported as safe.`);
  scanText = () => ({ threats: [], status: 'safe' });
}
|
|
17
|
+
|
|
18
|
+
// =========================================================================
|
|
19
|
+
// 1. ContentStructureAnalyzer (Trap 1)
|
|
20
|
+
// =========================================================================
|
|
21
|
+
|
|
22
|
+
class ContentStructureAnalyzer {
  /**
   * Measure structural properties of HTML-ish content that tend to accompany
   * hidden-instruction attacks, and scan CSS `content:` strings and ARIA
   * label/description attributes for injections.
   *
   * @param {string} content - Raw markup/text to analyze.
   * @returns {{ anomalous: boolean, metrics: object, signals: Array<object> }}
   *   `anomalous` is true when at least one signal has severity `high` or
   *   `critical`. For empty or non-string input, `metrics` is `{}`.
   */
  analyze(content) {
    if (!content || typeof content !== 'string') return { anomalous: false, metrics: {}, signals: [] };
    const signals = [];

    // Spans that a renderer would not show to a human. The font-size/opacity
    // patterns only match a value of exactly zero (0, 0.0, 0px, ...): a bare
    // `0[^}]*` also matched visible values such as `opacity: 0.95` or
    // `font-size: 0.9em` and flagged ordinary pages as hidden content.
    const hiddenPatterns = [
      /<!--[\s\S]*?-->/g,
      /display\s*:\s*none[^}]*\}[^<]*/gi,
      /visibility\s*:\s*hidden[^}]*/gi,
      /font-size\s*:\s*0(?:\.0+)?(?![\d.])[^}]*/gi,
      /opacity\s*:\s*0(?:\.0+)?(?![\d.])[^}]*/gi
    ];
    const hiddenChars = hiddenPatterns.reduce(
      (sum, pattern) => sum + (content.match(pattern) || []).join('').length,
      0
    );
    const totalChars = Math.max(content.length, 1);
    // Patterns can overlap (e.g. a style rule inside a comment), so clamp to 1.
    const hiddenRatio = Math.min(1, hiddenChars / totalChars);

    const tagCount = (content.match(/<[^>]+>/g) || []).length;
    const visibleText = content.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
    const wordCount = Math.max(visibleText.split(/\s+/).filter(w => w.length > 0).length, 1);
    const tagDensity = tagCount / wordCount;

    const formattingOverhead = 1 - (visibleText.length / totalChars);

    const metrics = {
      hiddenRatio: Math.round(hiddenRatio * 1000) / 1000,
      tagDensity: Math.round(tagDensity * 100) / 100,
      formattingOverhead: Math.round(formattingOverhead * 1000) / 1000
    };

    if (hiddenRatio > 0.15) signals.push({ type: 'high_hidden_ratio', severity: 'high', value: metrics.hiddenRatio, threshold: 0.15 });
    if (tagDensity > 2.0) signals.push({ type: 'high_tag_density', severity: 'medium', value: metrics.tagDensity, threshold: 2.0 });
    if (formattingOverhead > 0.7) signals.push({ type: 'high_formatting_overhead', severity: 'medium', value: metrics.formattingOverhead, threshold: 0.7 });

    // Extract and scan CSS content properties and ARIA attributes
    const cssContent = (content.match(/content\s*:\s*['"]([^'"]+)['"]/gi) || []).map(m => m.replace(/content\s*:\s*['"]|['"]$/gi, ''));
    const ariaLabels = (content.match(/aria-(?:label|description)\s*=\s*['"]([^'"]+)['"]/gi) || []).map(m => m.replace(/aria-\w+\s*=\s*['"]|['"]$/gi, ''));
    for (const text of [...cssContent, ...ariaLabels]) {
      // Very short strings are skipped without scanning.
      if (text.length <= 10) continue;
      const scan = scanText(text, { source: 'css_aria_extraction' });
      if (scan.threats && scan.threats.length > 0) {
        signals.push({ type: 'injection_in_css_aria', severity: 'critical', text: text.substring(0, 80) });
      }
    }

    return { anomalous: signals.some(s => s.severity === 'high' || s.severity === 'critical'), metrics, signals };
  }
}
|
|
63
|
+
|
|
64
|
+
// =========================================================================
|
|
65
|
+
// 2. SourceReputationTracker (Trap 1)
|
|
66
|
+
// =========================================================================
|
|
67
|
+
|
|
68
|
+
/**
 * Tracks a per-source trust score in [0, 1] from scan outcomes.
 * New sources start at 0.5; clean scans add 0.02, threat scans subtract 0.15.
 * Reported scores decay back toward 0.5 while a source is inactive.
 */
class SourceReputationTracker {
  /**
   * @param {object} [options]
   * @param {string} [options.persistPath] - JSON file used by save()/load(); loaded immediately when set.
   * @param {number} [options.decayDays=30] - Days of inactivity after which a score has fully decayed to 0.5.
   */
  constructor(options = {}) {
    this._sources = new Map();
    this._persistPath = options.persistPath || null;
    // A zero, negative or non-numeric window would divide by zero or push
    // scores away from neutral, so anything but a positive number falls back to 30.
    const decayDays = Number(options.decayDays);
    this._decayDays = Number.isFinite(decayDays) && decayDays > 0 ? decayDays : 30;
    if (this._persistPath) this.load();
  }

  /**
   * Record the outcome of one scan for a source.
   * @param {string} sourceId - Source identifier; falsy ids are ignored.
   * @param {boolean} wasClean - Truthy when the scan found no threats.
   */
  recordScan(sourceId, wasClean) {
    if (!sourceId) return;
    const now = Date.now();
    let entry = this._sources.get(sourceId);
    if (!entry) {
      entry = { score: 0.5, firstSeen: now, lastSeen: now, scanCount: 0, threatCount: 0 };
      this._sources.set(sourceId, entry);
    }
    entry.lastSeen = now;
    entry.scanCount++;
    if (wasClean) {
      entry.score = Math.min(1, entry.score + 0.02);
    } else {
      entry.score = Math.max(0, entry.score - 0.15);
      entry.threatCount++;
    }
    if (this._sources.size > 10000) this._evictStalest();
  }

  /**
   * Remove the entry with the oldest lastSeen (first one wins on ties).
   * A single linear pass replaces sorting the whole map on every insert.
   */
  _evictStalest() {
    let found = false;
    let stalestId;
    let stalestSeen = Infinity;
    for (const [id, entry] of this._sources) {
      if (!found || entry.lastSeen < stalestSeen) {
        found = true;
        stalestId = id;
        stalestSeen = entry.lastSeen;
      }
    }
    if (found) this._sources.delete(stalestId);
  }

  /**
   * @param {string} sourceId
   * @returns {{ score: number, firstSeen: (number|null), scanCount: number, threatCount: number, isNew: boolean }}
   *   Unknown sources report a neutral 0.5 with `isNew: true`.
   */
  getReputation(sourceId) {
    const entry = this._sources.get(sourceId);
    if (!entry) return { score: 0.5, firstSeen: null, scanCount: 0, threatCount: 0, isNew: true };
    // Decay toward 0.5 over inactivity
    const daysSinceLastSeen = (Date.now() - entry.lastSeen) / (1000 * 60 * 60 * 24);
    const decayedScore = entry.score + (0.5 - entry.score) * Math.min(1, daysSinceLastSeen / this._decayDays);
    return { score: Math.round(decayedScore * 1000) / 1000, firstSeen: entry.firstSeen, scanCount: entry.scanCount, threatCount: entry.threatCount, isNew: false };
  }

  /**
   * Map a source's reputation to a scan sensitivity.
   * @param {string} sourceId
   * @returns {'high'|'medium'|'low'} 'high' for unknown sources or score < 0.3, 'medium' below 0.6, else 'low'.
   */
  getRecommendedSensitivity(sourceId) {
    const rep = this.getReputation(sourceId);
    if (rep.isNew || rep.score < 0.3) return 'high';
    if (rep.score < 0.6) return 'medium';
    return 'low';
  }

  /**
   * Persist all entries as JSON to `persistPath` (no-op when unset).
   * Best-effort: failures are logged as a warning, never thrown.
   */
  save() {
    if (!this._persistPath) return;
    try {
      const fs = require('fs');
      const path = require('path');
      const dir = path.dirname(this._persistPath);
      if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
      const data = {};
      for (const [k, v] of this._sources) data[k] = v;
      fs.writeFileSync(this._persistPath, JSON.stringify(data));
    } catch (err) {
      console.warn(`[Agent Shield] Could not save source reputations: ${err && err.message}`);
    }
  }

  /**
   * Load entries from `persistPath` (no-op when unset or missing).
   * Best-effort: failures are logged as a warning, never thrown. Entries that
   * are not well-formed are skipped so a damaged or tampered file cannot
   * plant values that later break recordScan()/getReputation().
   */
  load() {
    if (!this._persistPath) return;
    try {
      const fs = require('fs');
      if (!fs.existsSync(this._persistPath)) return;
      const data = JSON.parse(fs.readFileSync(this._persistPath, 'utf8'));
      if (data === null || typeof data !== 'object' || Array.isArray(data)) return;
      const now = Date.now();
      for (const [id, raw] of Object.entries(data)) {
        const entry = SourceReputationTracker._normalizeEntry(raw, now);
        if (entry) this._sources.set(id, entry);
      }
    } catch (err) {
      console.warn(`[Agent Shield] Could not load source reputations: ${err && err.message}`);
    }
  }

  /**
   * Validate one persisted entry; returns a clean entry or null when unusable.
   * @param {*} raw - Value read from the persistence file.
   * @param {number} now - Fallback timestamp for missing/invalid times.
   * @returns {object|null}
   */
  static _normalizeEntry(raw, now) {
    if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) return null;
    const score = Number(raw.score);
    if (!Number.isFinite(score)) return null;
    const asTime = (value) => (Number.isFinite(value) ? value : now);
    const asCount = (value) => (Number.isInteger(value) && value >= 0 ? value : 0);
    return {
      score: Math.min(1, Math.max(0, score)),
      firstSeen: asTime(raw.firstSeen),
      lastSeen: asTime(raw.lastSeen),
      scanCount: asCount(raw.scanCount),
      threatCount: asCount(raw.threatCount)
    };
  }
}
|
|
136
|
+
|
|
137
|
+
// =========================================================================
|
|
138
|
+
// 3. RetrievalTimeScanner (Trap 3)
|
|
139
|
+
// =========================================================================
|
|
140
|
+
|
|
141
|
+
/**
 * Scans a memory/RAG retrieval at read time: the query, the retrieved entry,
 * and the two combined, to catch "latent poison" that only forms a threat
 * once the entry is joined with the query.
 */
class RetrievalTimeScanner {
  /**
   * @param {*} query - The retrieval query (string or structured value).
   * @param {*} retrievedEntry - The retrieved entry (string or structured value).
   * @returns {{ safe: boolean, combinedThreats: Array, queryThreats: Array, entryThreats: Array, latentPoisonDetected: boolean }}
   *   `safe` reflects the combined scan only.
   */
  scanRetrieval(query, retrievedEntry) {
    const queryStr = RetrievalTimeScanner._toText(query);
    const entryStr = RetrievalTimeScanner._toText(retrievedEntry);
    const combined = queryStr + '\n' + entryStr;

    const queryThreats = scanText(queryStr, { source: 'retrieval_query' }).threats || [];
    const entryThreats = scanText(entryStr, { source: 'retrieval_entry' }).threats || [];
    const combinedThreats = scanText(combined, { source: 'retrieval_combined' }).threats || [];

    // Latent poison: combined has threats but neither individual piece does
    const latentPoisonDetected = combinedThreats.length > 0 && queryThreats.length === 0 && entryThreats.length === 0;

    if (latentPoisonDetected) {
      console.log(`[Agent Shield] Latent memory poison detected: combined query+entry triggers threats that neither triggers alone`);
    }

    return {
      safe: combinedThreats.length === 0,
      combinedThreats,
      queryThreats,
      entryThreats,
      latentPoisonDetected
    };
  }

  /**
   * Flatten a value into scannable text. Strings pass through and falsy
   * values become ''. For objects and arrays the leaf values are joined with
   * newlines; plain String() would yield "[object Object]" and hide the
   * entry's real content from the scanner.
   * @param {*} value
   * @returns {string}
   */
  static _toText(value) {
    if (typeof value === 'string') return value;
    if (!value) return '';
    if (typeof value !== 'object') return String(value);

    const parts = [];
    const seen = new Set(); // guards against cyclic structures
    const visit = (node) => {
      if (typeof node === 'string') {
        parts.push(node);
      } else if (node !== null && typeof node === 'object') {
        if (seen.has(node)) return;
        seen.add(node);
        for (const child of Object.values(node)) visit(child);
      } else if (node !== null && node !== undefined && typeof node !== 'function') {
        parts.push(String(node));
      }
    };
    visit(value);
    return parts.join('\n');
  }
}
|
|
171
|
+
|
|
172
|
+
// =========================================================================
|
|
173
|
+
// 4. FewShotValidator (Trap 3)
|
|
174
|
+
// =========================================================================
|
|
175
|
+
|
|
176
|
+
// Matches one "<input label>: ... / <output label>: ..." exchange. Group 1 is
// the input turn, group 2 the output turn (up to the next input label or the
// end of the text).
const FEW_SHOT_PATTERNS = [
  /(?:^|\n)\s*(?:User|Human|Person|Input|Q)\s*:\s*([\s\S]*?)(?:\n\s*(?:Assistant|AI|Bot|Agent|Output|A)\s*:\s*([\s\S]*?)(?=\n\s*(?:User|Human|Person|Input|Q)\s*:|$))/gi,
];

/**
 * Finds few-shot style examples embedded in context text and flags those
 * whose output turn triggers the detector.
 */
class FewShotValidator {
  /**
   * @param {string} contextText - Context that may contain dialogue examples.
   * @returns {{ safe: boolean, poisonedExamples: Array<{input: string, output: string, threats: Array}> }}
   *   Inputs/outputs in the report are truncated to 200 characters.
   */
  validate(contextText) {
    const isUsable = Boolean(contextText) && typeof contextText === 'string';
    if (!isUsable) return { safe: true, poisonedExamples: [] };

    const poisonedExamples = [];
    FEW_SHOT_PATTERNS.forEach((pattern) => {
      // The patterns are shared global regexes; always start from the beginning.
      pattern.lastIndex = 0;
      for (const [, rawInput, rawOutput] of contextText.matchAll(pattern)) {
        const exampleInput = (rawInput || '').trim();
        const exampleOutput = (rawOutput || '').trim();
        // Outputs shorter than five characters are not scanned.
        if (exampleOutput.length >= 5) {
          const { threats } = scanText(exampleOutput, { source: 'few_shot_output' });
          if (threats && threats.length > 0) {
            poisonedExamples.push({
              input: exampleInput.substring(0, 200),
              output: exampleOutput.substring(0, 200),
              threats
            });
          }
        }
      }
    });

    return { safe: poisonedExamples.length === 0, poisonedExamples };
  }
}
|
|
207
|
+
|
|
208
|
+
// =========================================================================
|
|
209
|
+
// 5. SubAgentSpawnGate (Trap 4)
|
|
210
|
+
// =========================================================================
|
|
211
|
+
|
|
212
|
+
/**
 * Gate that decides whether a parent agent may spawn a child agent with the
 * requested configuration.
 */
class SubAgentSpawnGate {
  /**
   * Validate a child-agent configuration against its parent.
   *
   * @param {Array<string>} parentPermissions - Permissions held by the parent.
   *   An array (including an empty one) is enforced: the child may only
   *   request permissions it contains. A non-array value means the parent's
   *   permissions are unknown and the escalation check is skipped.
   * @param {object} childConfig - `{ systemPrompt?, permissions?, tools? }`.
   *   `tools` entries may be objects (`{ name, description }`) or plain name strings.
   * @returns {{ allowed: boolean, reason: (string|null), threats: Array<object> }}
   */
  validateSpawn(parentPermissions, childConfig) {
    if (!childConfig || typeof childConfig !== 'object') {
      return { allowed: false, reason: 'Invalid child configuration.', threats: [] };
    }

    const threats = [];
    // null = unknown. An empty array is a real statement ("parent holds
    // nothing"); treating it as "skip the check" let a permission-less parent
    // spawn a child with arbitrary permissions.
    const parentPerms = Array.isArray(parentPermissions) ? new Set(parentPermissions) : null;

    // Scan child system prompt
    if (childConfig.systemPrompt) {
      const promptScan = scanText(childConfig.systemPrompt, { source: 'sub_agent_prompt', sensitivity: 'high' });
      if (promptScan.threats && promptScan.threats.length > 0) {
        threats.push(...promptScan.threats.map(t => ({ ...t, context: 'child_system_prompt' })));
      }
    }

    // Check permission escalation
    const childPerms = Array.isArray(childConfig.permissions) ? childConfig.permissions : [];
    if (parentPerms) {
      for (const perm of childPerms) {
        if (parentPerms.has(perm)) continue;
        threats.push({
          type: 'permission_escalation',
          severity: 'critical',
          description: `Child agent requests permission "${perm}" not held by parent.`
        });
      }
    }

    // Check for dangerous tool access
    const dangerousTools = /(?:exec|shell|bash|cmd|eval|spawn|child_process)/i;
    if (Array.isArray(childConfig.tools)) {
      for (const tool of childConfig.tools) {
        // Tools may be listed by bare name; null or primitive junk is ignored
        // instead of throwing on property access.
        const isObject = tool !== null && typeof tool === 'object';
        const toolName = typeof tool === 'string' ? tool : String((isObject && tool.name) || '');
        const toolDescription = String((isObject && tool.description) || '');
        if (dangerousTools.test(toolName) || dangerousTools.test(toolDescription)) {
          threats.push({
            type: 'dangerous_child_tool',
            severity: 'high',
            description: `Child agent has dangerous tool: "${toolName || 'unknown'}"`
          });
        }
      }
    }

    const allowed = threats.length === 0;
    if (!allowed) {
      console.log(`[Agent Shield] Sub-agent spawn BLOCKED: ${threats.length} issue(s)`);
    }

    // Detector threats are not guaranteed to carry a description; never
    // report a blocked spawn with an undefined reason.
    const reason = allowed
      ? null
      : (threats[0].description || threats[0].type || 'Threat detected in child configuration.');
    return { allowed, reason, threats };
  }
}
|
|
263
|
+
|
|
264
|
+
// =========================================================================
|
|
265
|
+
// 6. SelfReferenceMonitor (Trap 2)
|
|
266
|
+
// =========================================================================
|
|
267
|
+
|
|
268
|
+
// Phrasings that tell the model things about itself (its reputation, habits,
// purpose or capabilities).
const SELF_REF_PATTERNS = [
  /you\s+are\s+(?:known|famous|renowned|recognized)\s+(?:for|as)/i,
  /you\s+(?:always|never|typically|usually)\s+(?:comply|help|assist|refuse|reject)/i,
  /your\s+(?:purpose|role|job|mission|function)\s+is\s+to/i,
  /you\s+have\s+(?:been|a)\s+(?:reputation|history)\s+(?:for|of)/i,
  /users?\s+(?:expect|trust|rely\s+on)\s+you\s+to/i,
  /you\s+(?:can|are\s+able\s+to|have\s+(?:access|permission|capability))\s+(?:to\s+)?(?:access|read|write|execute|modify|delete)/i,
  /(?:this|the)\s+(?:AI|assistant|model|agent)\s+(?:is\s+known|always|never|has\s+been\s+(?:updated|modified|changed))/i,
];

/**
 * Detects content that makes claims about the model's own identity or behavior.
 */
class SelfReferenceMonitor {
  /**
   * @param {string} text - Content to inspect.
   * @returns {{ detected: boolean, references: Array<{pattern: string, match: string}>, count: number }}
   *   `detected` requires at least two distinct patterns to match. Each
   *   pattern contributes at most one reference (its first match). `count`
   *   is always present, including for empty or non-string input.
   */
  detect(text) {
    // Same shape as the normal return so callers can read `count` unconditionally.
    if (!text || typeof text !== 'string') return { detected: false, references: [], count: 0 };
    const references = [];
    for (const pattern of SELF_REF_PATTERNS) {
      const match = text.match(pattern);
      if (match) {
        references.push({ pattern: pattern.source.substring(0, 40), match: match[0].substring(0, 80) });
      }
    }
    return { detected: references.length >= 2, references, count: references.length };
  }
}
|
|
291
|
+
|
|
292
|
+
// =========================================================================
|
|
293
|
+
// 7. InformationAsymmetryDetector (Trap 2)
|
|
294
|
+
// =========================================================================
|
|
295
|
+
|
|
296
|
+
// Vocabulary that frames safeguards positively / negatively.
const PRO_SAFETY = /\b(?:protect|verify|restrict|caution|validate|confirm|secure|guard|safeguard|authenticate|encrypt|isolate|monitor|audit)\b/gi;
const ANTI_SAFETY = /\b(?:unnecessary|harmful|counterproductive|remove|disable|outdated|excessive|overblown|bloat|obstacle|barrier|bottleneck|hindrance|overkill)\b/gi;

/**
 * Detects one-sided content that overwhelmingly frames safety measures negatively.
 */
class InformationAsymmetryDetector {
  /**
   * @param {string} text - Content to inspect.
   * @returns {{ asymmetric: boolean, ratio: number, proSafety: number, antiSafety: number, description: (string|null) }}
   *   `ratio` is antiSafety / (proSafety + antiSafety), rounded to 2 decimals.
   *   With fewer than three keyword hits in total the content is not judged
   *   (`asymmetric: false`, `ratio: 0`). `description` is always present and
   *   is null unless `asymmetric` is true.
   */
  detect(text) {
    // Every return path has the same keys, so callers never see `description`
    // as undefined on one path and null on another.
    const verdict = (asymmetric, ratio, proSafety, antiSafety, description) =>
      ({ asymmetric, ratio, proSafety, antiSafety, description });

    if (!text || typeof text !== 'string') return verdict(false, 0, 0, 0, null);

    // String.prototype.match with a global regex always starts from index 0,
    // so no manual lastIndex reset is needed on the shared patterns.
    const proCount = (text.match(PRO_SAFETY) || []).length;
    const antiCount = (text.match(ANTI_SAFETY) || []).length;
    const total = proCount + antiCount;
    if (total < 3) return verdict(false, 0, proCount, antiCount, null);

    const ratio = antiCount / total;
    const asymmetric = ratio > 0.7;
    return verdict(
      asymmetric,
      Math.round(ratio * 100) / 100,
      proCount,
      antiCount,
      asymmetric ? `Content is ${Math.round(ratio * 100)}% anti-safety framing. Possible semantic manipulation.` : null
    );
  }
}
|
|
318
|
+
|
|
319
|
+
// =========================================================================
|
|
320
|
+
// 8. ProvenanceMarker (Trap 6)
|
|
321
|
+
// =========================================================================
|
|
322
|
+
|
|
323
|
+
/**
 * Records which sources influenced a response and prepends a provenance
 * header to the output so a reviewer can see them.
 */
class ProvenanceMarker {
  constructor() {
    this._sources = [];
  }

  /**
   * Remember a source. Only the 50 most recent sources are kept.
   * @param {string} origin - Where the content came from (URL, tool name, ...).
   * @param {string} [trustLevel='unknown'] - e.g. 'trusted', 'low', 'untrusted'.
   */
  recordSource(origin, trustLevel) {
    this._sources.push({ origin, trustLevel: trustLevel || 'unknown', timestamp: Date.now() });
    if (this._sources.length > 50) this._sources = this._sources.slice(-50);
  }

  /**
   * Build the provenance header.
   * @returns {string} '' when no sources are recorded; otherwise a title
   *   line, a "Sources:" line and, if any source is 'untrusted' or 'low', a
   *   WARNING line. Always at most three lines.
   */
  generateHeader() {
    if (this._sources.length === 0) return '';
    const untrusted = this._sources.filter(s => s.trustLevel === 'untrusted' || s.trustLevel === 'low');
    const lines = ['[Agent Shield Provenance]'];
    lines.push(`Sources: ${this._sources.map(s => `[${ProvenanceMarker._singleLine(s.trustLevel)}] ${ProvenanceMarker._singleLine(s.origin)}`).join(', ')}`);
    if (untrusted.length > 0) {
      lines.push(`WARNING: Response influenced by ${untrusted.length} untrusted source(s): ${untrusted.map(s => ProvenanceMarker._singleLine(s.origin)).join(', ')}`);
    }
    return lines.join('\n');
  }

  /**
   * Prefix `output` with the provenance header (separated by a blank line).
   * @param {string} output
   * @returns {string} `output` unchanged when there is no header.
   */
  markOutput(output) {
    const header = this.generateHeader();
    if (!header) return output;
    return header + '\n\n' + output;
  }

  /** Forget all recorded sources. */
  reset() { this._sources = []; }

  /**
   * Collapse control characters and line separators to spaces. Origins come
   * from untrusted content; an embedded newline could otherwise add
   * fake lines to the header (or appear to end it early).
   * @param {*} value
   * @returns {string}
   */
  static _singleLine(value) {
    return String(value).replace(/[\u0000-\u001f\u007f\u2028\u2029]+/g, ' ').trim();
  }
}
|
|
352
|
+
|
|
353
|
+
// =========================================================================
|
|
354
|
+
// 9. EscalatingScrutinyEngine (Trap 6)
|
|
355
|
+
// =========================================================================
|
|
356
|
+
|
|
357
|
+
/**
 * Escalates review requirements when a human approver appears to be
 * rubber-stamping (approval fatigue).
 */
class EscalatingScrutinyEngine {
  /**
   * @param {object} [options]
   * @param {number} [options.fatigueThreshold=0.9] - Approval rate in the recent window at which scrutiny escalates.
   * @param {number} [options.windowSize=20] - Number of most recent decisions considered.
   * @param {number} [options.escalationInterval=5] - A forced delay is added whenever the lifetime approval count is a multiple of this.
   */
  constructor(options = {}) {
    this._approvals = [];
    // Lifetime approval count. It is kept apart from the history array, which
    // is capped at 1000 entries: counting approvals in the capped array
    // stalls once the history is full, so the "every Nth approval" check
    // would stick in one state.
    this._approvedTotal = 0;
    this._fatigueThreshold = options.fatigueThreshold || 0.9;
    this._windowSize = options.windowSize || 20;
    this._escalationInterval = options.escalationInterval || 5;
  }

  /**
   * Record one reviewer decision.
   * @param {boolean} approved - Truthy when the reviewer approved.
   */
  recordDecision(approved) {
    this._approvals.push({ approved, timestamp: Date.now() });
    if (approved) this._approvedTotal++;
    if (this._approvals.length > 1000) this._approvals = this._approvals.slice(-1000);
  }

  /**
   * @returns {{ level: ('normal'|'elevated'|'critical'), approvalRate: number, actions: Array<string> }}
   *   With fewer than five recorded decisions the level is 'normal' and
   *   `approvalRate` is reported as 0. 'critical' means a comprehension check
   *   is required (approval rate >= 0.95 while above the fatigue threshold).
   */
  getScrutinyLevel() {
    const recent = this._approvals.slice(-this._windowSize);
    if (recent.length < 5) return { level: 'normal', approvalRate: 0, actions: [] };
    const approvalRate = recent.filter(a => a.approved).length / recent.length;
    const actions = [];

    if (approvalRate >= this._fatigueThreshold) {
      actions.push('mandatory_plain_english_explanation');
      if (this._approvedTotal % this._escalationInterval === 0) {
        actions.push('forced_delay_30s');
      }
      if (approvalRate >= 0.95) {
        actions.push('comprehension_check_required');
      }
    }

    const level = actions.length === 0 ? 'normal' : (actions.includes('comprehension_check_required') ? 'critical' : 'elevated');
    return { level, approvalRate: Math.round(approvalRate * 100) / 100, actions };
  }
}
|
|
391
|
+
|
|
392
|
+
// =========================================================================
|
|
393
|
+
// 10. CompositeFragmentAssembler (Trap 5)
|
|
394
|
+
// =========================================================================
|
|
395
|
+
|
|
396
|
+
/**
 * Detects compositional attacks: fragments that are each harmless but combine
 * into a threat when content from different sources is joined.
 */
class CompositeFragmentAssembler {
  /**
   * @param {object} [options]
   * @param {number} [options.maxFragments=100] - Maximum fragments retained.
   */
  constructor(options = {}) {
    this._fragments = [];
    this._maxFragments = options.maxFragments || 100;
  }

  /**
   * Store a fragment and test it against the 20 most recent fragments from
   * OTHER sources (earlier fragment first, new fragment second).
   *
   * @param {string} text - Fragment text; ignored when shorter than 5 characters. Only the first 500 characters are stored.
   * @param {*} source - Identifier of the fragment's origin.
   * @returns {{ assembled: boolean, threats?: Array, fragments?: Array<{source: *, text: string}> }}
   *   `assembled: true` (with `threats` and the two `fragments`) when a pair
   *   triggers the detector although neither half does alone.
   */
  addFragment(text, source) {
    if (!text || typeof text !== 'string' || text.length < 5) return { assembled: false };
    this._fragments.push({ text: text.substring(0, 500), source, timestamp: Date.now() });
    if (this._fragments.length > this._maxFragments) this._fragments = this._fragments.slice(-this._maxFragments);

    // Try pairwise assembly with recent fragments from OTHER sources
    const recentOthers = this._fragments.filter(f => f.source !== source).slice(-20);
    if (recentOthers.length === 0) return { assembled: false };

    // The new fragment's own scan is the same for every pair, so run it once.
    // If it already triggers on its own, no pair can count as compositional.
    const thisScan = scanText(text, { source: 'fragment_individual' });
    if (thisScan.threats && thisScan.threats.length > 0) return { assembled: false };

    for (const other of recentOthers) {
      const combined = other.text + ' ' + text;
      const combinedScan = scanText(combined, { source: 'fragment_assembly' });
      if (!combinedScan.threats || combinedScan.threats.length === 0) continue;

      // Only scan the older fragment when the combination is suspicious.
      const otherScan = scanText(other.text, { source: 'fragment_individual' });
      if (otherScan.threats && otherScan.threats.length > 0) continue;

      console.log(`[Agent Shield] Compositional fragment attack detected: fragments from "${other.source}" and "${source}" combine into threat`);
      return {
        assembled: true,
        threats: combinedScan.threats,
        fragments: [{ source: other.source, text: other.text.substring(0, 100) }, { source, text: text.substring(0, 100) }]
      };
    }

    return { assembled: false };
  }

  /** Forget all stored fragments. */
  reset() { this._fragments = []; }
}
|
|
432
|
+
|
|
433
|
+
// =========================================================================
|
|
434
|
+
// TrapDefenseV2 — Unified Wrapper
|
|
435
|
+
// =========================================================================
|
|
436
|
+
|
|
437
|
+
/**
 * Convenience bundle that instantiates every defense in this module and
 * exposes each one as a property.
 */
class TrapDefenseV2 {
  /**
   * @param {object} [options]
   * @param {object} [options.reputation] - Passed to SourceReputationTracker.
   * @param {object} [options.scrutiny] - Passed to EscalatingScrutinyEngine.
   * @param {object} [options.fragments] - Passed to CompositeFragmentAssembler.
   */
  constructor(options = {}) {
    const reputationOptions = options.reputation || {};
    const scrutinyOptions = options.scrutiny || {};
    const fragmentOptions = options.fragments || {};

    Object.assign(this, {
      structureAnalyzer: new ContentStructureAnalyzer(),
      reputationTracker: new SourceReputationTracker(reputationOptions),
      retrievalScanner: new RetrievalTimeScanner(),
      fewShotValidator: new FewShotValidator(),
      spawnGate: new SubAgentSpawnGate(),
      selfRefMonitor: new SelfReferenceMonitor(),
      asymmetryDetector: new InformationAsymmetryDetector(),
      provenanceMarker: new ProvenanceMarker(),
      scrutinyEngine: new EscalatingScrutinyEngine(scrutinyOptions),
      fragmentAssembler: new CompositeFragmentAssembler(fragmentOptions)
    });
  }
}
|
|
451
|
+
|
|
452
|
+
// =========================================================================
|
|
453
|
+
// EXPORTS
|
|
454
|
+
// =========================================================================
|
|
455
|
+
|
|
456
|
+
module.exports = {
|
|
457
|
+
TrapDefenseV2,
|
|
458
|
+
ContentStructureAnalyzer,
|
|
459
|
+
SourceReputationTracker,
|
|
460
|
+
RetrievalTimeScanner,
|
|
461
|
+
FewShotValidator,
|
|
462
|
+
SubAgentSpawnGate,
|
|
463
|
+
SelfReferenceMonitor,
|
|
464
|
+
InformationAsymmetryDetector,
|
|
465
|
+
ProvenanceMarker,
|
|
466
|
+
EscalatingScrutinyEngine,
|
|
467
|
+
CompositeFragmentAssembler
|
|
468
|
+
};
|
package/src/fleet-defense.js
CHANGED
|
@@ -141,6 +141,30 @@ class FleetCorrelationEngine {
|
|
|
141
141
|
return [...this._events];
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
+
/**
|
|
145
|
+
* Export events for cross-process correlation (Trap 5 deepening).
|
|
146
|
+
* Send this to a central coordinator that merges events from all processes.
|
|
147
|
+
* @returns {string} JSON-serialized events.
|
|
148
|
+
*/
|
|
149
|
+
exportEvents() {
|
|
150
|
+
return JSON.stringify(this._events);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Import events from another process/instance.
|
|
155
|
+
* @param {string|Array} events - JSON string or array of events.
|
|
156
|
+
* @returns {{ imported: number }}
|
|
157
|
+
*/
|
|
158
|
+
importEvents(events) {
|
|
159
|
+
const parsed = typeof events === 'string' ? JSON.parse(events) : events;
|
|
160
|
+
if (!Array.isArray(parsed)) return { imported: 0 };
|
|
161
|
+
for (const event of parsed) {
|
|
162
|
+
this._events.push(event);
|
|
163
|
+
}
|
|
164
|
+
if (this._events.length > 50000) this._events = this._events.slice(-50000);
|
|
165
|
+
return { imported: parsed.length };
|
|
166
|
+
}
|
|
167
|
+
|
|
144
168
|
/** Clear all events. */
|
|
145
169
|
reset() {
|
|
146
170
|
this._events = [];
|
package/src/hitl-guard.js
CHANGED
|
@@ -472,6 +472,69 @@ class HITLGuard {
|
|
|
472
472
|
// EXPORTS
|
|
473
473
|
// =========================================================================
|
|
474
474
|
|
|
475
|
+
// =========================================================================
|
|
476
|
+
// TRAP 6 DEEPENING: Output Deception Scorer
|
|
477
|
+
// =========================================================================
|
|
478
|
+
|
|
479
|
+
/**
|
|
480
|
+
* Scores agent output for deceptive characteristics designed to fool
|
|
481
|
+
* human reviewers into approving dangerous actions.
|
|
482
|
+
*/
|
|
483
|
+
class OutputDeceptionScorer {
  /**
   * Score an agent's output for deception indicators.
   *
   * Indicators and weights: excessive_confidence (0.2), buried_action (0.3),
   * jargon_obfuscation (0.25), urgency_in_output (0.15), risk_minimization (0.2).
   * The score is the sum of matched weights, capped at 1.
   *
   * @param {string} output - Agent output to evaluate.
   * @param {object} [context] - Context about what was requested (currently not used by the scoring).
   * @returns {{ deceptionScore: number, indicators: Array<object>, recommendation: string }}
   *   `recommendation` is 'block' at score >= 0.5, 'review' at >= 0.3, else 'pass'.
   */
  score(output, context = {}) {
    if (!output || typeof output !== 'string') return { deceptionScore: 0, indicators: [], recommendation: 'pass' };
    const indicators = [];

    // Excessive confidence without evidence.
    // "100%" sits outside the \b...\b group: a trailing \b after "%" only
    // matches when a word character follows directly, so "100% safe" and
    // "100%." were never counted.
    const confidenceWords = (output.match(/\b(?:definitely|certainly|absolutely|guaranteed|surely|undoubtedly|without\s+(?:a\s+)?doubt)\b|\b100%/gi) || []).length;
    if (confidenceWords >= 2) {
      indicators.push({ type: 'excessive_confidence', weight: 0.2, description: `${confidenceWords} high-confidence assertions without supporting evidence` });
    }

    // Action buried in long explanation.
    // Positions are taken while filtering: looking them up afterwards with
    // indexOf() returns the FIRST occurrence of a repeated sentence and
    // misses an action repeated at the end of the response.
    const sentences = output.split(/[.!?\n]+/).filter(s => s.trim().length > 10);
    const actionPattern = /\b(?:click|run|execute|install|download|send|transfer|delete|approve|authorize)\b/i;
    const actionPositions = [];
    sentences.forEach((sentence, index) => {
      if (actionPattern.test(sentence)) actionPositions.push(index);
    });
    const hasAction = actionPositions.length > 0;
    if (sentences.length > 5 && hasAction) {
      const lastQuarter = sentences.length * 0.75;
      if (actionPositions.some(p => p >= lastQuarter)) {
        indicators.push({ type: 'buried_action', weight: 0.3, description: 'Actionable instructions buried in the last quarter of a long response' });
      }
    }

    // Technical jargon masking simple actions
    const jargonDensity = (output.match(/\b(?:subprocess|daemon|syscall|ioctl|mmap|chmod|chown|setuid|capability|namespace|cgroup|seccomp)\b/gi) || []).length / Math.max(output.split(/\s+/).length, 1);
    if (jargonDensity > 0.03 && hasAction) {
      indicators.push({ type: 'jargon_obfuscation', weight: 0.25, description: 'High technical jargon density combined with actionable instructions' });
    }

    // Urgency injection in output
    if (/\b(?:immediately|right\s+now|as\s+soon\s+as\s+possible|urgent|time-sensitive|critical|before\s+it's\s+too\s+late)\b/i.test(output)) {
      indicators.push({ type: 'urgency_in_output', weight: 0.15, description: 'Output contains urgency language that may pressure reviewer' });
    }

    // Minimization of risks
    if (/\b(?:don't\s+worry|no\s+risk|perfectly\s+safe|nothing\s+(?:bad\s+)?(?:will|can)\s+happen|completely\s+harmless)\b/i.test(output) && hasAction) {
      indicators.push({ type: 'risk_minimization', weight: 0.2, description: 'Output minimizes risks while requesting actions' });
    }

    const deceptionScore = Math.min(1, indicators.reduce((s, i) => s + i.weight, 0));
    const recommendation = deceptionScore >= 0.5 ? 'block' : deceptionScore >= 0.3 ? 'review' : 'pass';

    return {
      deceptionScore: Math.round(deceptionScore * 100) / 100,
      indicators,
      recommendation
    };
  }
}
|
|
537
|
+
|
|
475
538
|
module.exports = {
|
|
476
539
|
HITLGuard,
|
|
477
540
|
ApprovalPatternMonitor,
|
|
@@ -479,6 +542,7 @@ module.exports = {
|
|
|
479
542
|
OutputInjectionScanner,
|
|
480
543
|
ReadabilityScanner,
|
|
481
544
|
CriticalInfoPositionChecker,
|
|
545
|
+
OutputDeceptionScorer,
|
|
482
546
|
CRITICAL_KEYWORDS,
|
|
483
547
|
OUTPUT_INJECTION_PATTERNS,
|
|
484
548
|
HIGH_RISK_ACTIONS,
|
package/src/main.js
CHANGED
|
@@ -215,6 +215,9 @@ const { BehavioralDNA, AgentProfiler, extractFeatures: extractBehavioralFeatures
|
|
|
215
215
|
// v7.4 — Compliance Certification Authority (loaded when available)
|
|
216
216
|
const { ComplianceCertificateAuthority, ComplianceReport: ComplianceCertReport, ComplianceScheduler, AUTHORITY_FRAMEWORKS, CAPABILITY_MAP: CA_CAPABILITY_MAP, CERTIFICATE_LEVELS: CA_CERTIFICATE_LEVELS } = safeRequire('./compliance-authority', 'compliance-authority');
|
|
217
217
|
|
|
218
|
+
// Side Channel Monitor
|
|
219
|
+
const { SideChannelMonitor, BeaconDetector, EntropyAnalyzer: SCEntropyAnalyzer } = safeRequire('./side-channel-monitor', 'side-channel-monitor');
|
|
220
|
+
|
|
218
221
|
// --- v1.2 Modules ---
|
|
219
222
|
|
|
220
223
|
// Semantic Detection
|
|
@@ -365,6 +368,9 @@ const { SOTABenchmark, BIPIA_SAMPLES: SOTA_BIPIA_SAMPLES, HACKAPROMPT_SAMPLES: S
|
|
|
365
368
|
// v13.1 — Real-world benchmark
|
|
366
369
|
const { RealBenchmark } = safeRequire('./real-benchmark', 'real-benchmark');
|
|
367
370
|
|
|
371
|
+
// v13.3 — DeepMind Trap Defenses V2
|
|
372
|
+
const { TrapDefenseV2, ContentStructureAnalyzer, SourceReputationTracker, RetrievalTimeScanner, FewShotValidator, SubAgentSpawnGate, SelfReferenceMonitor, InformationAsymmetryDetector, ProvenanceMarker, EscalatingScrutinyEngine, CompositeFragmentAssembler } = safeRequire('./deepmind-defenses', 'deepmind-defenses');
|
|
373
|
+
|
|
368
374
|
// v12.0 — Multi-Turn Attack Detection
|
|
369
375
|
const { ConversationTracker } = safeRequire('./cross-turn', 'cross-turn');
|
|
370
376
|
|
|
@@ -404,6 +410,12 @@ const { SemanticGuard, AuthoritativeClaimDetector, BiasDetector: SemanticBiasDet
|
|
|
404
410
|
// v13.0 — Memory Trap Defenses (Trap 3)
|
|
405
411
|
const { MemoryGuard, MemoryIntegrityMonitor, RAGIngestionScanner, MemoryIsolationEnforcer, RetrievalAnomalyDetector, INSTRUCTION_INDICATORS } = safeRequire('./memory-guard', 'memory-guard');
|
|
406
412
|
|
|
413
|
+
// v13.3 — Render Differential Analyzer
|
|
414
|
+
const { RenderDifferentialAnalyzer, VisualHasher } = safeRequire('./render-differential', 'render-differential');
|
|
415
|
+
|
|
416
|
+
// v13.3 — Sybil Detector
|
|
417
|
+
const { SybilDetector, AgentIdentityVerifier } = safeRequire('./sybil-detector', 'sybil-detector');
|
|
418
|
+
|
|
407
419
|
// Build exports, filtering out undefined values from failed imports
|
|
408
420
|
const _exports = {
|
|
409
421
|
// Core
|
|
@@ -1044,6 +1056,17 @@ const _exports = {
|
|
|
1044
1056
|
SOTA_MULTILINGUAL_SAMPLES,
|
|
1045
1057
|
SOTA_STEALTH_SAMPLES,
|
|
1046
1058
|
RealBenchmark,
|
|
1059
|
+
TrapDefenseV2,
|
|
1060
|
+
ContentStructureAnalyzer,
|
|
1061
|
+
SourceReputationTracker,
|
|
1062
|
+
RetrievalTimeScanner,
|
|
1063
|
+
FewShotValidator,
|
|
1064
|
+
SubAgentSpawnGate,
|
|
1065
|
+
SelfReferenceMonitor,
|
|
1066
|
+
InformationAsymmetryDetector,
|
|
1067
|
+
ProvenanceMarker,
|
|
1068
|
+
EscalatingScrutinyEngine,
|
|
1069
|
+
CompositeFragmentAssembler,
|
|
1047
1070
|
|
|
1048
1071
|
// v12.0 — Multi-Turn Attack Detection
|
|
1049
1072
|
ConversationTracker,
|
|
@@ -1134,6 +1157,19 @@ const _exports = {
|
|
|
1134
1157
|
AUTHORITY_FRAMEWORKS,
|
|
1135
1158
|
CA_CAPABILITY_MAP,
|
|
1136
1159
|
CA_CERTIFICATE_LEVELS,
|
|
1160
|
+
|
|
1161
|
+
// Side Channel Monitor
|
|
1162
|
+
SideChannelMonitor,
|
|
1163
|
+
BeaconDetector,
|
|
1164
|
+
SCEntropyAnalyzer,
|
|
1165
|
+
|
|
1166
|
+
// Render Differential Analyzer
|
|
1167
|
+
RenderDifferentialAnalyzer,
|
|
1168
|
+
VisualHasher,
|
|
1169
|
+
|
|
1170
|
+
// Sybil Detector
|
|
1171
|
+
SybilDetector,
|
|
1172
|
+
AgentIdentityVerifier,
|
|
1137
1173
|
};
|
|
1138
1174
|
|
|
1139
1175
|
// Filter out undefined exports (from modules that failed to load)
|