agentshield-sdk 11.0.0 → 13.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -79
- package/package.json +2 -2
- package/src/agent-intent.js +359 -672
- package/src/cross-turn.js +217 -564
- package/src/detector-core.js +106 -0
- package/src/ensemble.js +300 -409
- package/src/fleet-defense.js +483 -0
- package/src/hitl-guard.js +487 -0
- package/src/incident-response.js +265 -0
- package/src/main.js +121 -33
- package/src/mcp-guard.js +4 -0
- package/src/memory-guard.js +637 -0
- package/src/micro-model.js +15 -1
- package/src/ml-detector.js +110 -266
- package/src/normalizer.js +296 -604
- package/src/persistent-learning.js +104 -620
- package/src/semantic-guard.js +452 -0
- package/src/semantic-isolation.js +1 -0
- package/src/smart-config.js +557 -705
- package/src/sota-benchmark.js +268 -10
- package/src/trap-defense.js +468 -0
- package/types/index.d.ts +251 -580
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Shield — Human-in-the-Loop Trap Defenses (Trap 6)
|
|
5
|
+
*
|
|
6
|
+
* Based on DeepMind's "AI Agent Traps" paper, this module defends against
|
|
7
|
+
* attacks targeting the human reviewer in approval workflows.
|
|
8
|
+
*
|
|
9
|
+
* Five defense layers:
|
|
10
|
+
* 1. ApprovalPatternMonitor — detects reviewer fatigue / rubber-stamping
|
|
11
|
+
* 2. SummarizationIntegrityChecker — catches omitted critical info
|
|
12
|
+
* 3. OutputInjectionScanner — scans agent output for deceptive content
|
|
13
|
+
* 4. ReadabilityScanner — flags jargon-obfuscated dangerous actions
|
|
14
|
+
* 5. CriticalInfoPositionChecker — ensures warnings aren't buried
|
|
15
|
+
*
|
|
16
|
+
* All detection runs locally — no data ever leaves your environment.
|
|
17
|
+
*
|
|
18
|
+
* @module hitl-guard
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
const { scanText } = require('./detector-core');
|
|
22
|
+
|
|
23
|
+
// =========================================================================
|
|
24
|
+
// CONSTANTS
|
|
25
|
+
// =========================================================================
|
|
26
|
+
|
|
27
|
+
/**
 * Safety-related terms. SummarizationIntegrityChecker reports any of these
 * that occur in a source text but not in its summary;
 * CriticalInfoPositionChecker reports where they occur in an output.
 * Entries are lowercase because they are compared against lowercased text.
 */
const CRITICAL_KEYWORDS = [
  'liability', 'risk', 'danger', 'warning', 'caution', 'side effect',
  'vulnerability', 'breach', 'unauthorized', 'malicious', 'exploit',
  'critical', 'fatal', 'emergency'
];

/** Default fatigue threshold — an approval rate strictly above this is treated as suspicious. */
const FATIGUE_APPROVAL_RATE = 0.95;

/** Default number of consecutive approvals after which a detailed review is forced. */
const DEFAULT_MANDATORY_REVIEW_INTERVAL = 10;

/**
 * Patterns for deceptive content in agent output.
 * Each entry is `{ regex, category, severity }`. None of the regexes carries
 * the `g` flag, so they hold no `lastIndex` state between uses.
 */
const OUTPUT_INJECTION_PATTERNS = [
  { regex: /(?:enter|type|provide|submit)\s+(?:your\s+)?(?:password|credentials|username|login|api.?key|secret|token)/i, category: 'credential_harvesting', severity: 'critical' },
  { regex: /(?:log\s*in|sign\s*in|authenticate)\s+(?:at|to|via|using)\s+https?:\/\//i, category: 'fake_login', severity: 'critical' },
  { regex: /https?:\/\/[^\s]*(?:\.ru|\.cn|\.tk|\.ml|\.ga|\.cf|\.gq|\.xyz|\.top|\.buzz|\.click)\b/i, category: 'suspicious_domain', severity: 'high' },
  { regex: /download\s+(?:and\s+)?(?:run|execute|install|open)\s+/i, category: 'malware_download', severity: 'critical' },
  // Any number of dash-prefixed flags may sit between the tool name and the
  // URL, so `curl -fsSL https://…` and `wget -q http://…` are caught as well
  // as the bare `curl https://…` form. `-\S+\s+` is unambiguous (non-space
  // run, then space run), so the repetition cannot backtrack exponentially.
  { regex: /(?:curl|wget|powershell|bash\s+-c)\s+(?:-\S+\s+)*https?:\/\//i, category: 'remote_exec', severity: 'critical' },
  { regex: /(?:base64\s+-d|eval\s*\(|exec\s*\()\s*/i, category: 'obfuscated_command', severity: 'high' },
  { regex: /(?:encrypt|lock)\s+(?:all\s+)?(?:your\s+)?files?\s+/i, category: 'ransomware_instruction', severity: 'critical' },
  { regex: /(?:bitcoin|btc|ethereum|eth|monero|xmr)\s+(?:address|wallet)\s*[:=]/i, category: 'ransom_payment', severity: 'critical' },
  { regex: /(?:disable|turn\s+off|remove)\s+(?:your\s+)?(?:antivirus|firewall|security|defender|protection)/i, category: 'security_disable', severity: 'critical' },
  { regex: /(?:send|forward|share)\s+(?:your\s+)?(?:ssh\s+key|private\s+key|\.pem|\.key|credentials)/i, category: 'credential_exfiltration', severity: 'critical' }
];

/** Action keywords that ReadabilityScanner treats as high-risk when they appear in output. */
const HIGH_RISK_ACTIONS = [
  'delete', 'remove', 'drop', 'truncate', 'format', 'wipe',
  'transfer', 'send', 'withdraw', 'execute', 'deploy', 'overwrite',
  'grant', 'escalate', 'sudo', 'admin', 'root', 'chmod', 'shutdown'
];
|
|
60
|
+
|
|
61
|
+
// =========================================================================
|
|
62
|
+
// 1. ApprovalPatternMonitor
|
|
63
|
+
// =========================================================================
|
|
64
|
+
|
|
65
|
+
/**
 * Tracks human approval decisions and flags reviewer fatigue.
 *
 * Reports fatigue when the approval rate is strictly above a threshold
 * (rubber-stamping) and signals a mandatory detailed review each time the
 * run of consecutive approvals reaches a multiple of a configurable interval.
 */
class ApprovalPatternMonitor {
  /**
   * @param {object} [options]
   * @param {number} [options.fatigueThreshold=0.95] - Approval rate above this triggers a fatigue alert.
   *   Any non-negative number is honored, including 0; other values fall back to the default.
   * @param {number} [options.mandatoryReviewInterval=10] - Force detailed review every N consecutive approvals.
   *   Must be a positive number; other values fall back to the default.
   */
  constructor(options = {}) {
    // `options = {}` only covers `undefined`; tolerate an explicit null too.
    const opts = options || {};
    const threshold = opts.fatigueThreshold;
    const interval = opts.mandatoryReviewInterval;

    // Explicit numeric checks instead of `||`, so a deliberate 0 threshold is
    // not silently replaced by the default. NaN fails both comparisons.
    this._fatigueThreshold = (typeof threshold === 'number' && threshold >= 0)
      ? threshold
      : FATIGUE_APPROVAL_RATE;
    // A zero interval would make the modulo below NaN, so it is rejected.
    this._mandatoryReviewInterval = (typeof interval === 'number' && interval > 0)
      ? interval
      : DEFAULT_MANDATORY_REVIEW_INTERVAL;

    /** @type {Array<{approved: boolean, riskLevel: string, timestamp: number}>} */
    this._history = [];
    this._consecutiveApprovals = 0;
    this._totalDecisions = 0;
  }

  /**
   * Record an approval/rejection decision.
   * @param {boolean} approved - Whether the human approved the action
   * @param {string} riskLevel - Risk level of the action (critical/high/medium/low)
   * @returns {{ mandatoryReview: boolean }} Whether a mandatory detailed review should be injected
   */
  recordApproval(approved, riskLevel) {
    this._history.push({ approved, riskLevel, timestamp: Date.now() });
    // Keep only the most recent 10000 decisions so memory stays bounded.
    if (this._history.length > 10000) this._history = this._history.slice(-10000);
    this._totalDecisions++;

    // Any rejection resets the streak.
    this._consecutiveApprovals = approved ? this._consecutiveApprovals + 1 : 0;

    const mandatoryReview = this._consecutiveApprovals > 0 &&
      (this._consecutiveApprovals % this._mandatoryReviewInterval === 0);

    if (mandatoryReview) {
      console.log(`[Agent Shield] HITL: Mandatory detailed review triggered after ${this._consecutiveApprovals} consecutive approvals`);
    }

    return { mandatoryReview };
  }

  /**
   * Select the decisions a rate calculation should look at.
   * @param {number} [windowSize] - Positive number of most recent decisions; anything else selects all
   * @returns {Array<{approved: boolean, riskLevel: string, timestamp: number}>}
   */
  _recentDecisions(windowSize) {
    // A negative size would turn `slice(-windowSize)` into "drop the oldest
    // N entries", which is not a window at all, so it is ignored.
    const windowed = typeof windowSize === 'number' && windowSize > 0;
    return windowed ? this._history.slice(-windowSize) : this._history;
  }

  /**
   * Get the approval rate over the last N decisions.
   * @param {number} [windowSize] - Number of recent decisions to consider (default: all).
   *   Non-positive or non-numeric values are ignored and the full history is used.
   * @returns {number} Approval rate between 0 and 1 (0 when there are no decisions)
   */
  getApprovalRate(windowSize) {
    const decisions = this._recentDecisions(windowSize);
    if (decisions.length === 0) return 0;
    const approvedCount = decisions.filter(d => d.approved).length;
    return approvedCount / decisions.length;
  }

  /**
   * Detect reviewer fatigue (rubber-stamping).
   *
   * At least 5 decisions must be in scope before fatigue can be reported.
   *
   * @param {number} [windowSize] - Optional number of most recent decisions to evaluate.
   *   When omitted the whole retained history is used, as before.
   * @returns {{ fatigued: boolean, approvalRate: number, consecutiveApprovals: number, recommendation: string }}
   */
  detectFatigue(windowSize) {
    const sampleSize = this._recentDecisions(windowSize).length;
    const approvalRate = this.getApprovalRate(windowSize);
    const fatigued = sampleSize >= 5 && approvalRate > this._fatigueThreshold;

    let recommendation = 'Normal review pattern';
    if (fatigued) {
      recommendation = 'High approval rate detected — consider requiring justification for each approval or rotating reviewers';
      console.log(`[Agent Shield] HITL: Fatigue detected — ${(approvalRate * 100).toFixed(1)}% approval rate, ${this._consecutiveApprovals} consecutive`);
    }

    return {
      fatigued,
      approvalRate,
      consecutiveApprovals: this._consecutiveApprovals,
      recommendation
    };
  }

  /**
   * Get a shallow copy of the retained decision history.
   * @returns {Array<{approved: boolean, riskLevel: string, timestamp: number}>}
   */
  getHistory() {
    return [...this._history];
  }

  /** Reset all state. */
  reset() {
    this._history = [];
    this._consecutiveApprovals = 0;
    this._totalDecisions = 0;
  }
}
|
|
162
|
+
|
|
163
|
+
// =========================================================================
|
|
164
|
+
// 2. SummarizationIntegrityChecker
|
|
165
|
+
// =========================================================================
|
|
166
|
+
|
|
167
|
+
/**
 * Compares an agent-generated summary against its source text and reports
 * critical safety keywords that occur in the source but not in the summary.
 */
class SummarizationIntegrityChecker {
  /**
   * @param {object} [options]
   * @param {string[]} [options.criticalKeywords] - Custom critical keyword list.
   *   Matching is case-insensitive; entries are lowercased and de-duplicated,
   *   and empty or non-string entries are ignored.
   */
  constructor(options = {}) {
    const opts = options || {};
    const configured = Array.from(opts.criticalKeywords || CRITICAL_KEYWORDS);
    // The texts are lowercased before comparison, so the keywords must be
    // too; otherwise a custom keyword such as 'Risk' could never match.
    const normalized = configured
      .filter(kw => typeof kw === 'string' && kw.length > 0)
      .map(kw => kw.toLowerCase());
    this._criticalKeywords = [...new Set(normalized)];
  }

  /**
   * Check a summary for omitted critical information.
   *
   * Integrity is 'low' when more than half of the critical keywords found in
   * the source are missing from the summary, 'medium' when some are missing,
   * and 'high' otherwise. Keyword detection is a plain substring test.
   *
   * @param {string} source - Original source text (non-strings are treated as empty)
   * @param {string} summary - Agent-generated summary (non-strings are treated as empty)
   * @returns {{ integrity: 'high'|'medium'|'low', omittedCriticalTerms: string[], coverageScore: number }}
   */
  check(source, summary) {
    const sourceLower = (typeof source === 'string' ? source : '').toLowerCase();
    const summaryLower = (typeof summary === 'string' ? summary : '').toLowerCase();

    // Critical keywords present in the source, and the subset the summary dropped.
    const presentInSource = this._criticalKeywords.filter(kw => sourceLower.includes(kw));
    const omittedCriticalTerms = presentInSource.filter(kw => !summaryLower.includes(kw));

    // Coverage: share of distinct source words (longer than 3 characters,
    // whitespace-delimited) that also appear in the summary.
    const longWords = text => new Set(text.split(/\s+/).filter(w => w.length > 3));
    const sourceWords = longWords(sourceLower);
    const summaryWords = longWords(summaryLower);
    let matchCount = 0;
    for (const word of sourceWords) {
      if (summaryWords.has(word)) matchCount++;
    }
    // A source with no qualifying words counts as fully covered.
    const coverageScore = sourceWords.size > 0 ? matchCount / sourceWords.size : 1;

    let integrity = 'high';
    if (omittedCriticalTerms.length > 0) {
      const omissionRate = omittedCriticalTerms.length / presentInSource.length;
      integrity = omissionRate > 0.5 ? 'low' : 'medium';
    }

    if (integrity !== 'high') {
      console.log(`[Agent Shield] HITL: Summary integrity ${integrity} — omitted: ${omittedCriticalTerms.join(', ')}`);
    }

    return { integrity, omittedCriticalTerms, coverageScore };
  }
}
|
|
221
|
+
|
|
222
|
+
// =========================================================================
|
|
223
|
+
// 3. OutputInjectionScanner
|
|
224
|
+
// =========================================================================
|
|
225
|
+
|
|
226
|
+
/**
 * Looks for deceptive content in agent output — phishing, credential
 * harvesting, malware downloads, ransomware instructions and obfuscated
 * commands.
 *
 * Findings come from two places: this module's own pattern list (plus any
 * caller-supplied patterns) and detector-core's scanText.
 */
class OutputInjectionScanner {
  /**
   * @param {object} [options]
   * @param {Array} [options.additionalPatterns] - Extra `{ regex, category, severity }` entries,
   *   checked after the built-in patterns
   */
  constructor(options = {}) {
    const extraPatterns = options.additionalPatterns || [];
    this._patterns = [...OUTPUT_INJECTION_PATTERNS, ...extraPatterns];
  }

  /**
   * Scan agent output for deceptive or dangerous content.
   * @param {string} output - Agent output text
   * @returns {{ safe: boolean, threats: Array<{category: string, severity: string, match: string}> }}
   *   `safe` is true only when no threat was collected. Empty or non-string
   *   input is reported as safe without any scanning.
   */
  scan(output) {
    if (typeof output !== 'string' || output.length === 0) {
      return { safe: true, threats: [] };
    }

    const threats = [];

    // Pass 1: local patterns; each pattern contributes at most one finding.
    this._patterns.forEach((entry) => {
      const hit = output.match(entry.regex);
      if (hit) {
        threats.push({ category: entry.category, severity: entry.severity, match: hit[0] });
      }
    });

    // Pass 2: detector-core's standard injection patterns.
    // NOTE(review): presumably scanText returns an object with a `threats`
    // array whose items may carry category/severity/detail/description —
    // confirm against detector-core.
    const coreThreats = scanText(output, { source: 'agent_output' }).threats;
    if (coreThreats && coreThreats.length > 0) {
      for (const finding of coreThreats) {
        const category = finding.category || 'injection';
        const severity = finding.severity || 'high';
        const match = finding.detail || finding.description || '';
        threats.push({ category, severity, match });
      }
    }

    const safe = threats.length === 0;
    if (!safe) {
      console.log(`[Agent Shield] HITL: Output injection detected — ${threats.length} threat(s)`);
    }

    return { safe, threats };
  }
}
|
|
282
|
+
|
|
283
|
+
// =========================================================================
|
|
284
|
+
// 4. ReadabilityScanner
|
|
285
|
+
// =========================================================================
|
|
286
|
+
|
|
287
|
+
/**
 * Flags outputs in which high-risk actions are described with unusually long
 * words or sentences, which may hide their true nature from a reviewer.
 */
class ReadabilityScanner {
  /**
   * @param {object} [options]
   * @param {string[]} [options.highRiskActions] - Custom list of high-risk action keywords
   * @param {number} [options.avgWordLengthThreshold=7] - Avg word length above this is complex
   * @param {number} [options.avgSentenceLengthThreshold=25] - Avg sentence length above this is complex
   */
  constructor(options = {}) {
    const opts = options || {};
    this._highRiskActions = opts.highRiskActions || HIGH_RISK_ACTIONS;
    this._avgWordLengthThreshold = opts.avgWordLengthThreshold || 7;
    this._avgSentenceLengthThreshold = opts.avgSentenceLengthThreshold || 25;
  }

  /**
   * Test whether `term` occurs in `textLower` at the start of a word.
   *
   * A plain substring test reports 'format' for "information" and 'grant'
   * for "immigrant". Requiring that the character before the hit is not an
   * ASCII letter or digit removes those mid-word hits while still matching
   * inflected forms such as "deleted" or "formatting".
   *
   * @param {string} textLower - Already-lowercased text to search
   * @param {string} term - Action keyword (any case)
   * @returns {boolean}
   */
  _mentions(textLower, term) {
    const needle = term.toLowerCase();
    if (needle.length === 0) return false;

    let from = 0;
    for (;;) {
      const idx = textLower.indexOf(needle, from);
      if (idx === -1) return false;
      if (idx === 0 || !/[a-z0-9]/.test(textLower.charAt(idx - 1))) return true;
      from = idx + 1;
    }
  }

  /**
   * Scan output for potential readability-based obfuscation of dangerous actions.
   *
   * `obfuscated` is true when at least one risky action is mentioned AND the
   * average word length or average sentence length exceeds its threshold.
   *
   * @param {string} output - Agent output text (non-strings are treated as empty)
   * @param {string[]} [actions] - Planned actions to check against, in addition to the configured list
   * @returns {{ obfuscated: boolean, readabilityScore: number, avgWordLength: number, avgSentenceLength: number, riskyActions: string[], recommendation: string }}
   */
  scan(output, actions = []) {
    const text = typeof output === 'string' ? output : '';
    const textLower = text.toLowerCase();

    // Risky actions mentioned, reported with the casing they were configured in.
    const candidates = [...this._highRiskActions, ...(actions || [])]
      .filter(a => typeof a === 'string');
    const riskyActions = candidates.filter(a => this._mentions(textLower, a));

    // Readability metrics: words are whitespace-delimited, sentences are
    // delimited by runs of '.', '!' or '?'.
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);

    // Only ASCII letters count towards word length.
    const letterCount = words.reduce((sum, w) => sum + w.replace(/[^a-zA-Z]/g, '').length, 0);
    const avgWordLength = words.length > 0 ? letterCount / words.length : 0;
    const avgSentenceLength = sentences.length > 0 ? words.length / sentences.length : 0;

    // Readability score: 0 (very complex) to 100 (very simple).
    const wordPenalty = Math.max(0, avgWordLength - 4) * 10;
    const sentencePenalty = Math.max(0, avgSentenceLength - 10) * 2;
    const readabilityScore = Math.max(0, Math.min(100, 100 - wordPenalty - sentencePenalty));

    const lowReadability = avgWordLength > this._avgWordLengthThreshold ||
      avgSentenceLength > this._avgSentenceLengthThreshold;
    const obfuscated = riskyActions.length > 0 && lowReadability;

    let recommendation = 'Output readability is acceptable';
    if (obfuscated) {
      recommendation = `Dangerous actions (${riskyActions.join(', ')}) described with complex language — require plain-language explanation`;
      console.log(`[Agent Shield] HITL: Readability obfuscation detected — risky actions in complex text`);
    }

    return {
      obfuscated,
      readabilityScore,
      avgWordLength: Math.round(avgWordLength * 100) / 100,
      avgSentenceLength: Math.round(avgSentenceLength * 100) / 100,
      riskyActions,
      recommendation
    };
  }
}
|
|
355
|
+
|
|
356
|
+
// =========================================================================
|
|
357
|
+
// 5. CriticalInfoPositionChecker
|
|
358
|
+
// =========================================================================
|
|
359
|
+
|
|
360
|
+
/**
 * Reports where safety-critical keywords appear in an output and flags those
 * that sit near the end of a long output, where reviewers are less likely to
 * read them.
 */
class CriticalInfoPositionChecker {
  /**
   * @param {object} [options]
   * @param {string[]} [options.criticalKeywords] - Custom critical keyword list.
   *   Matching is case-insensitive; entries are lowercased and de-duplicated,
   *   and empty or non-string entries are ignored.
   * @param {number} [options.buriedThreshold=0.8] - Position ratio above which content is considered buried.
   *   Any non-negative number is honored, including 0.
   * @param {number} [options.minLength=200] - Minimum output length to check for buried content.
   *   Any non-negative number is honored, including 0.
   */
  constructor(options = {}) {
    const opts = options || {};
    const threshold = opts.buriedThreshold;
    const minLength = opts.minLength;

    // An empty keyword must never reach check(): indexOf('', n) always
    // succeeds without advancing, which would make the search loop endless.
    // Lowercasing is needed because the output is lowercased before search.
    const normalized = Array.from(opts.criticalKeywords || CRITICAL_KEYWORDS)
      .filter(kw => typeof kw === 'string' && kw.length > 0)
      .map(kw => kw.toLowerCase());
    this._criticalKeywords = [...new Set(normalized)];

    // Explicit numeric checks instead of `||`, so an explicit 0 is honored.
    this._buriedThreshold = (typeof threshold === 'number' && threshold >= 0) ? threshold : 0.8;
    this._minLength = (typeof minLength === 'number' && minLength >= 0) ? minLength : 200;
  }

  /**
   * Check where critical safety keywords appear in the output.
   *
   * Every occurrence of every keyword yields one warning. `position` is the
   * occurrence's start index divided by the output length, rounded to three
   * decimals. An occurrence is `buried` when the output is at least
   * `minLength` long and its position is strictly above `buriedThreshold`.
   *
   * @param {string} output - Agent output text (non-strings are treated as empty)
   * @returns {{ warnings: Array<{keyword: string, position: number, buried: boolean}>, hasBuriedWarnings: boolean }}
   */
  check(output) {
    const text = typeof output === 'string' ? output : '';
    const textLower = text.toLowerCase();
    const totalLength = text.length;
    const longEnough = totalLength >= this._minLength;
    const warnings = [];

    for (const keyword of this._criticalKeywords) {
      let idx = textLower.indexOf(keyword);
      while (idx !== -1) {
        const position = idx / totalLength;
        warnings.push({
          keyword,
          position: Math.round(position * 1000) / 1000,
          buried: longEnough && position > this._buriedThreshold
        });
        // Resume after this hit so occurrences never overlap.
        idx = textLower.indexOf(keyword, idx + keyword.length);
      }
    }

    const buriedKeywords = warnings.filter(w => w.buried).map(w => w.keyword);
    const hasBuriedWarnings = buriedKeywords.length > 0;

    if (hasBuriedWarnings) {
      console.log(`[Agent Shield] HITL: Critical info buried at end of output — ${buriedKeywords.join(', ')}`);
    }

    return { warnings, hasBuriedWarnings };
  }
}
|
|
411
|
+
|
|
412
|
+
// =========================================================================
|
|
413
|
+
// HITLGuard — Unified Wrapper
|
|
414
|
+
// =========================================================================
|
|
415
|
+
|
|
416
|
+
/**
 * Human-in-the-Loop Guard — a single entry point that owns one instance of
 * each of the five defense layers and runs the output-oriented ones together.
 */
class HITLGuard {
  /**
   * Each option object is handed unchanged to the matching layer's constructor.
   * @param {object} [options]
   * @param {object} [options.approvalMonitor] - Options for ApprovalPatternMonitor
   * @param {object} [options.summarizationChecker] - Options for SummarizationIntegrityChecker
   * @param {object} [options.outputScanner] - Options for OutputInjectionScanner
   * @param {object} [options.readabilityScanner] - Options for ReadabilityScanner
   * @param {object} [options.positionChecker] - Options for CriticalInfoPositionChecker
   */
  constructor(options = {}) {
    const layerOptions = options;
    this.approvalMonitor = new ApprovalPatternMonitor(layerOptions.approvalMonitor);
    this.summarizationChecker = new SummarizationIntegrityChecker(layerOptions.summarizationChecker);
    this.outputScanner = new OutputInjectionScanner(layerOptions.outputScanner);
    this.readabilityScanner = new ReadabilityScanner(layerOptions.readabilityScanner);
    this.positionChecker = new CriticalInfoPositionChecker(layerOptions.positionChecker);
  }

  /**
   * Run all applicable checks on an agent output.
   *
   * Injection, readability and position checks always run; the
   * summarization check runs only when `context.source` is truthy. The
   * approval monitor is not consulted here.
   *
   * @param {string} output - Agent output to check
   * @param {object} [context]
   * @param {string} [context.source] - Original source text for summarization check
   * @param {string[]} [context.actions] - Planned actions for readability check
   * @returns {{ safe: boolean, checks: object }} `checks` holds each layer's raw
   *   result under `injection`, `readability`, `position` and, when run, `summarization`
   */
  checkOutput(output, context = {}) {
    const checks = {
      injection: this.outputScanner.scan(output),
      readability: this.readabilityScanner.scan(output, context.actions),
      position: this.positionChecker.check(output)
    };

    if (context.source) {
      checks.summarization = this.summarizationChecker.check(context.source, output);
    }

    // The output is safe only if every check that ran came back clean; each
    // step is evaluated only while the verdict is still positive.
    let safe = checks.injection.safe;
    if (safe) {
      safe = !checks.readability.obfuscated;
    }
    if (safe) {
      safe = !checks.position.hasBuriedWarnings;
    }
    if (safe) {
      safe = !checks.summarization || checks.summarization.integrity === 'high';
    }

    return { safe, checks };
  }
}
|
|
470
|
+
|
|
471
|
+
// =========================================================================
|
|
472
|
+
// EXPORTS
|
|
473
|
+
// =========================================================================
|
|
474
|
+
|
|
475
|
+
module.exports = {
|
|
476
|
+
HITLGuard,
|
|
477
|
+
ApprovalPatternMonitor,
|
|
478
|
+
SummarizationIntegrityChecker,
|
|
479
|
+
OutputInjectionScanner,
|
|
480
|
+
ReadabilityScanner,
|
|
481
|
+
CriticalInfoPositionChecker,
|
|
482
|
+
CRITICAL_KEYWORDS,
|
|
483
|
+
OUTPUT_INJECTION_PATTERNS,
|
|
484
|
+
HIGH_RISK_ACTIONS,
|
|
485
|
+
FATIGUE_APPROVAL_RATE,
|
|
486
|
+
DEFAULT_MANDATORY_REVIEW_INTERVAL
|
|
487
|
+
};
|