agentshield-sdk 11.0.0 → 13.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,487 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Agent Shield — Human-in-the-Loop Trap Defenses (Trap 6)
5
+ *
6
+ * Based on DeepMind's "AI Agent Traps" paper, this module defends against
7
+ * attacks targeting the human reviewer in approval workflows.
8
+ *
9
+ * Five defense layers:
10
+ * 1. ApprovalPatternMonitor — detects reviewer fatigue / rubber-stamping
11
+ * 2. SummarizationIntegrityChecker — catches omitted critical info
12
+ * 3. OutputInjectionScanner — scans agent output for deceptive content
13
+ * 4. ReadabilityScanner — flags jargon-obfuscated dangerous actions
14
+ * 5. CriticalInfoPositionChecker — ensures warnings aren't buried
15
+ *
16
+ * All detection runs locally — no data ever leaves your environment.
17
+ *
18
+ * @module hitl-guard
19
+ */
20
+
21
+ const { scanText } = require('./detector-core');
22
+
23
+ // =========================================================================
24
+ // CONSTANTS
25
+ // =========================================================================
26
+
27
/**
 * Safety-related terms (all lower-case). Used as the default keyword list by
 * SummarizationIntegrityChecker and CriticalInfoPositionChecker.
 */
const CRITICAL_KEYWORDS = [
  'liability',
  'risk',
  'danger',
  'warning',
  'caution',
  'side effect',
  'vulnerability',
  'breach',
  'unauthorized',
  'malicious',
  'exploit',
  'critical',
  'fatal',
  'emergency'
];

/** Default approval-rate ceiling; a rate strictly above it is reported as fatigue. */
const FATIGUE_APPROVAL_RATE = 0.95;

/** Default length of an approval streak that triggers a mandatory detailed review. */
const DEFAULT_MANDATORY_REVIEW_INTERVAL = 10;

/**
 * Built-in patterns for deceptive agent output. Each entry pairs a
 * case-insensitive regex with the category and severity reported on a match.
 */
const OUTPUT_INJECTION_PATTERNS = [
  {
    regex: /(?:enter|type|provide|submit)\s+(?:your\s+)?(?:password|credentials|username|login|api.?key|secret|token)/i,
    category: 'credential_harvesting',
    severity: 'critical'
  },
  {
    regex: /(?:log\s*in|sign\s*in|authenticate)\s+(?:at|to|via|using)\s+https?:\/\//i,
    category: 'fake_login',
    severity: 'critical'
  },
  {
    regex: /https?:\/\/[^\s]*(?:\.ru|\.cn|\.tk|\.ml|\.ga|\.cf|\.gq|\.xyz|\.top|\.buzz|\.click)\b/i,
    category: 'suspicious_domain',
    severity: 'high'
  },
  {
    regex: /download\s+(?:and\s+)?(?:run|execute|install|open)\s+/i,
    category: 'malware_download',
    severity: 'critical'
  },
  {
    regex: /(?:curl|wget|powershell|bash\s+-c)\s+https?:\/\//i,
    category: 'remote_exec',
    severity: 'critical'
  },
  {
    regex: /(?:base64\s+-d|eval\s*\(|exec\s*\()\s*/i,
    category: 'obfuscated_command',
    severity: 'high'
  },
  {
    regex: /(?:encrypt|lock)\s+(?:all\s+)?(?:your\s+)?files?\s+/i,
    category: 'ransomware_instruction',
    severity: 'critical'
  },
  {
    regex: /(?:bitcoin|btc|ethereum|eth|monero|xmr)\s+(?:address|wallet)\s*[:=]/i,
    category: 'ransom_payment',
    severity: 'critical'
  },
  {
    regex: /(?:disable|turn\s+off|remove)\s+(?:your\s+)?(?:antivirus|firewall|security|defender|protection)/i,
    category: 'security_disable',
    severity: 'critical'
  },
  {
    regex: /(?:send|forward|share)\s+(?:your\s+)?(?:ssh\s+key|private\s+key|\.pem|\.key|credentials)/i,
    category: 'credential_exfiltration',
    severity: 'critical'
  }
];

/**
 * Action words (all lower-case) that ReadabilityScanner searches for by
 * default when deciding whether complex wording surrounds a risky operation.
 */
const HIGH_RISK_ACTIONS = [
  'delete',
  'remove',
  'drop',
  'truncate',
  'format',
  'wipe',
  'transfer',
  'send',
  'withdraw',
  'execute',
  'deploy',
  'overwrite',
  'grant',
  'escalate',
  'sudo',
  'admin',
  'root',
  'chmod',
  'shutdown'
];
60
+
61
+ // =========================================================================
62
+ // 1. ApprovalPatternMonitor
63
+ // =========================================================================
64
+
65
/**
 * Records a reviewer's approve/reject decisions and surfaces two signals:
 * an overall approval rate that is suspiciously high (rubber-stamping) and
 * approval streaks long enough to warrant a forced detailed review.
 */
class ApprovalPatternMonitor {
  /**
   * @param {object} [options]
   * @param {number} [options.fatigueThreshold=0.95] - Approval rate strictly above this counts as fatigue
   * @param {number} [options.mandatoryReviewInterval=10] - Streak length (and its multiples) that forces a detailed review
   */
  constructor(options = {}) {
    const { fatigueThreshold, mandatoryReviewInterval } = options;
    // Falsy option values (including 0) fall back to the module defaults.
    this._fatigueThreshold = fatigueThreshold || FATIGUE_APPROVAL_RATE;
    this._mandatoryReviewInterval = mandatoryReviewInterval || DEFAULT_MANDATORY_REVIEW_INTERVAL;
    /** @type {Array<{approved: boolean, riskLevel: string, timestamp: number}>} */
    this._history = [];
    this._consecutiveApprovals = 0;
    this._totalDecisions = 0;
  }

  /**
   * Store one decision and update the running approval streak.
   * @param {boolean} approved - Whether the human approved the action
   * @param {string} riskLevel - Risk level of the action (critical/high/medium/low)
   * @returns {{ mandatoryReview: boolean }} True when the streak just reached a multiple of the review interval
   */
  recordApproval(approved, riskLevel) {
    const entry = { approved, riskLevel, timestamp: Date.now() };
    this._history.push(entry);

    // Only the 10000 most recent decisions are retained.
    if (this._history.length > 10000) {
      this._history = this._history.slice(-10000);
    }
    this._totalDecisions += 1;

    // A rejection breaks the streak; an approval extends it.
    this._consecutiveApprovals = approved ? this._consecutiveApprovals + 1 : 0;

    const streak = this._consecutiveApprovals;
    const mandatoryReview = streak > 0 && streak % this._mandatoryReviewInterval === 0;

    if (mandatoryReview) {
      console.log(`[Agent Shield] HITL: Mandatory detailed review triggered after ${streak} consecutive approvals`);
    }

    return { mandatoryReview };
  }

  /**
   * Fraction of approvals among the retained decisions.
   * @param {number} [windowSize] - Restrict to the last N decisions; a falsy value means all retained history
   * @returns {number} Approval rate between 0 and 1 (0 when there is nothing to measure)
   */
  getApprovalRate(windowSize) {
    const recent = windowSize ? this._history.slice(-windowSize) : this._history;
    if (recent.length === 0) {
      return 0;
    }

    let approvedCount = 0;
    for (const decision of recent) {
      if (decision.approved) {
        approvedCount += 1;
      }
    }
    return approvedCount / recent.length;
  }

  /**
   * Report whether the reviewer appears to be rubber-stamping. Requires at
   * least five retained decisions before fatigue can be flagged.
   * @returns {{ fatigued: boolean, approvalRate: number, consecutiveApprovals: number, recommendation: string }}
   */
  detectFatigue() {
    const approvalRate = this.getApprovalRate();
    const consecutiveApprovals = this._consecutiveApprovals;
    const fatigued = this._history.length >= 5 && approvalRate > this._fatigueThreshold;

    if (!fatigued) {
      return {
        fatigued,
        approvalRate,
        consecutiveApprovals,
        recommendation: 'Normal review pattern'
      };
    }

    console.log(`[Agent Shield] HITL: Fatigue detected — ${(approvalRate * 100).toFixed(1)}% approval rate, ${consecutiveApprovals} consecutive`);
    return {
      fatigued,
      approvalRate,
      consecutiveApprovals,
      recommendation: 'High approval rate detected — consider requiring justification for each approval or rotating reviewers'
    };
  }

  /**
   * Shallow copy of the retained decision history, oldest first.
   * @returns {Array<{approved: boolean, riskLevel: string, timestamp: number}>}
   */
  getHistory() {
    return this._history.slice();
  }

  /** Forget all recorded decisions and zero the counters. */
  reset() {
    this._totalDecisions = 0;
    this._consecutiveApprovals = 0;
    this._history = [];
  }
}
162
+
163
+ // =========================================================================
164
+ // 2. SummarizationIntegrityChecker
165
+ // =========================================================================
166
+
167
/**
 * Compares an agent-generated summary with its source text and reports which
 * critical safety keywords occur in the source but are absent from the summary.
 */
class SummarizationIntegrityChecker {
  /**
   * @param {object} [options]
   * @param {string[]} [options.criticalKeywords] - Custom critical keyword list (matched case-insensitively)
   */
  constructor(options = {}) {
    this._criticalKeywords = options.criticalKeywords || CRITICAL_KEYWORDS;
  }

  /**
   * Check a summary for omitted critical information.
   *
   * Keyword matching is a case-insensitive substring test. Non-string
   * arguments are treated as empty text rather than throwing, so a missing
   * summary reports every critical term found in the source as omitted.
   *
   * @param {string} source - Original source text
   * @param {string} summary - Agent-generated summary
   * @returns {{ integrity: 'high'|'medium'|'low', omittedCriticalTerms: string[], coverageScore: number }}
   *   `omittedCriticalTerms` holds the keywords exactly as configured;
   *   `coverageScore` is the share of distinct source tokens (longer than three
   *   characters, split on whitespace only) that also occur in the summary.
   */
  check(source, summary) {
    const sourceLower = typeof source === 'string' ? source.toLowerCase() : '';
    const summaryLower = typeof summary === 'string' ? summary.toLowerCase() : '';

    // Both texts are lower-cased, so keywords must be lower-cased as well;
    // otherwise a custom keyword such as "WARNING" could never match.
    // Empty/non-string entries are ignored: an empty string is a substring of
    // everything and would only dilute the omission rate.
    const presentInSource = this._criticalKeywords.filter(
      (kw) => typeof kw === 'string' && kw.length > 0 && sourceLower.includes(kw.toLowerCase())
    );
    const omittedCriticalTerms = presentInSource.filter(
      (kw) => !summaryLower.includes(kw.toLowerCase())
    );

    // Coverage: ratio of distinct source words that also appear in the summary.
    const toWordSet = (text) => new Set(text.split(/\s+/).filter((w) => w.length > 3));
    const sourceWords = toWordSet(sourceLower);
    const summaryWords = toWordSet(summaryLower);
    let matchCount = 0;
    for (const word of sourceWords) {
      if (summaryWords.has(word)) matchCount++;
    }
    // A source with no qualifying words is considered fully covered.
    const coverageScore = sourceWords.size > 0 ? matchCount / sourceWords.size : 1;

    // Any omission lowers integrity; losing more than half of the critical
    // terms present in the source lowers it to 'low'.
    let integrity = 'high';
    if (omittedCriticalTerms.length > 0) {
      const omissionRate = omittedCriticalTerms.length / presentInSource.length;
      integrity = omissionRate > 0.5 ? 'low' : 'medium';
    }

    if (integrity !== 'high') {
      console.log(`[Agent Shield] HITL: Summary integrity ${integrity} — omitted: ${omittedCriticalTerms.join(', ')}`);
    }

    return { integrity, omittedCriticalTerms, coverageScore };
  }
}
221
+
222
+ // =========================================================================
223
+ // 3. OutputInjectionScanner
224
+ // =========================================================================
225
+
226
/**
 * Looks for deceptive or dangerous content in agent output by applying a list
 * of regex patterns and then passing the same text to detector-core's
 * scanText, merging both sets of findings.
 */
class OutputInjectionScanner {
  /**
   * @param {object} [options]
   * @param {Array} [options.additionalPatterns] - Extra `{ regex, category, severity }` entries appended after the built-in ones
   */
  constructor(options = {}) {
    const extraPatterns = options.additionalPatterns || [];
    this._patterns = [...OUTPUT_INJECTION_PATTERNS, ...extraPatterns];
  }

  /**
   * Scan agent output for deceptive or dangerous content.
   * @param {string} output - Agent output text
   * @returns {{ safe: boolean, threats: Array<{category: string, severity: string, match: string}> }}
   *   Empty or non-string input is reported as safe without scanning.
   */
  scan(output) {
    if (typeof output !== 'string' || output.length === 0) {
      return { safe: true, threats: [] };
    }

    const threats = [];

    // Pattern pass: at most one finding per pattern (its first match).
    for (const entry of this._patterns) {
      const hit = output.match(entry.regex);
      if (!hit) continue;
      threats.push({
        category: entry.category,
        severity: entry.severity,
        match: hit[0]
      });
    }

    // detector-core pass: normalise its findings to the same shape,
    // filling in defaults for fields it leaves empty.
    const coreResult = scanText(output, { source: 'agent_output' });
    const coreThreats = coreResult.threats;
    if (coreThreats && coreThreats.length > 0) {
      for (const finding of coreThreats) {
        threats.push({
          category: finding.category || 'injection',
          severity: finding.severity || 'high',
          match: finding.detail || finding.description || ''
        });
      }
    }

    const safe = threats.length === 0;
    if (!safe) {
      console.log(`[Agent Shield] HITL: Output injection detected — ${threats.length} threat(s)`);
    }

    return { safe, threats };
  }
}
282
+
283
+ // =========================================================================
284
+ // 4. ReadabilityScanner
285
+ // =========================================================================
286
+
287
/**
 * Flags outputs where dangerous actions are described with unnecessarily
 * complex language that may obfuscate their true nature from reviewers.
 */
class ReadabilityScanner {
  /**
   * @param {object} [options]
   * @param {string[]} [options.highRiskActions] - Custom list of high-risk action keywords
   * @param {number} [options.avgWordLengthThreshold=7] - Avg word length above this is complex
   * @param {number} [options.avgSentenceLengthThreshold=25] - Avg sentence length above this is complex
   */
  constructor(options = {}) {
    this._highRiskActions = options.highRiskActions || HIGH_RISK_ACTIONS;
    this._avgWordLengthThreshold = options.avgWordLengthThreshold || 7;
    this._avgSentenceLengthThreshold = options.avgSentenceLengthThreshold || 25;
  }

  /**
   * Scan output for potential readability-based obfuscation of dangerous actions.
   *
   * Non-string output is treated as empty text and a null/undefined `actions`
   * as an empty list, so the scanner accepts the same inputs as
   * OutputInjectionScanner.scan instead of throwing on them.
   *
   * @param {string} output - Agent output text
   * @param {string[]} [actions] - Planned actions to check against (in addition to the configured high-risk list)
   * @returns {{ obfuscated: boolean, readabilityScore: number, avgWordLength: number, avgSentenceLength: number, riskyActions: string[], recommendation: string }}
   */
  scan(output, actions = []) {
    const text = typeof output === 'string' ? output : '';
    const textLower = text.toLowerCase();

    // The default parameter only covers `undefined`; callers forwarding an
    // optional field may pass null, which cannot be spread.
    const callerActions = actions == null ? [] : Array.from(actions);

    // Case-insensitive substring search. Empty and non-string entries are
    // skipped: '' is contained in every text, and non-strings cannot be lower-cased.
    const riskyActions = [...this._highRiskActions, ...callerActions].filter(
      (action) =>
        typeof action === 'string' &&
        action.length > 0 &&
        textLower.includes(action.toLowerCase())
    );

    // Readability metrics: words are whitespace-separated tokens, sentences
    // are the non-blank pieces between runs of '.', '!' or '?'.
    const words = text.split(/\s+/).filter((w) => w.length > 0);
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);

    // Only ASCII letters count towards a word's length.
    const avgWordLength = words.length > 0
      ? words.reduce((sum, w) => sum + w.replace(/[^a-zA-Z]/g, '').length, 0) / words.length
      : 0;

    const avgSentenceLength = sentences.length > 0
      ? words.length / sentences.length
      : 0;

    // Readability score: 0 (very complex) to 100 (very simple).
    const wordPenalty = Math.max(0, avgWordLength - 4) * 10;
    const sentencePenalty = Math.max(0, avgSentenceLength - 10) * 2;
    const readabilityScore = Math.max(0, Math.min(100, 100 - wordPenalty - sentencePenalty));

    // Obfuscation requires both a risky action and hard-to-read text.
    const lowReadability = avgWordLength > this._avgWordLengthThreshold ||
      avgSentenceLength > this._avgSentenceLengthThreshold;
    const obfuscated = riskyActions.length > 0 && lowReadability;

    let recommendation = 'Output readability is acceptable';
    if (obfuscated) {
      recommendation = `Dangerous actions (${riskyActions.join(', ')}) described with complex language — require plain-language explanation`;
      console.log(`[Agent Shield] HITL: Readability obfuscation detected — risky actions in complex text`);
    }

    return {
      obfuscated,
      readabilityScore,
      avgWordLength: Math.round(avgWordLength * 100) / 100,
      avgSentenceLength: Math.round(avgSentenceLength * 100) / 100,
      riskyActions,
      recommendation
    };
  }
}
355
+
356
+ // =========================================================================
357
+ // 5. CriticalInfoPositionChecker
358
+ // =========================================================================
359
+
360
/**
 * Ensures safety-critical warnings are not buried at the end of long outputs,
 * where reviewers are less likely to read them.
 */
class CriticalInfoPositionChecker {
  /**
   * @param {object} [options]
   * @param {string[]} [options.criticalKeywords] - Custom critical keyword list (matched case-insensitively)
   * @param {number} [options.buriedThreshold=0.8] - Position ratio above which content is considered buried
   * @param {number} [options.minLength=200] - Minimum output length to check for buried content (0 checks every output)
   */
  constructor(options = {}) {
    // An explicit 0 is a meaningful setting (e.g. minLength: 0 = always check),
    // so only other falsy values fall back to the default.
    const orDefault = (value, fallback) => (value === 0 || value ? value : fallback);

    this._criticalKeywords = options.criticalKeywords || CRITICAL_KEYWORDS;
    this._buriedThreshold = orDefault(options.buriedThreshold, 0.8);
    this._minLength = orDefault(options.minLength, 200);
  }

  /**
   * Check where critical safety keywords appear in the output.
   *
   * Every non-overlapping occurrence of every keyword yields one warning.
   * `position` is the occurrence's start offset divided by the text length
   * (rounded to three decimals); an occurrence is `buried` when the text is
   * at least `minLength` long and the position exceeds `buriedThreshold`.
   * Non-string output is treated as empty text.
   *
   * @param {string} output - Agent output text
   * @returns {{ warnings: Array<{keyword: string, position: number, buried: boolean}>, hasBuriedWarnings: boolean }}
   */
  check(output) {
    const textLower = typeof output === 'string' ? output.toLowerCase() : '';
    // Offsets come from the lower-cased text, so ratios are taken against its length.
    const totalLength = textLower.length;
    const warnings = [];

    for (const keyword of this._criticalKeywords) {
      // An empty keyword is found at every offset and would never let the
      // search loop terminate; non-strings cannot be searched for at all.
      if (typeof keyword !== 'string' || keyword.length === 0) continue;

      // The text is lower-cased, so the keyword must be too.
      const needle = keyword.toLowerCase();

      let idx = textLower.indexOf(needle);
      while (idx !== -1) {
        const position = idx / totalLength;
        const buried = totalLength >= this._minLength && position > this._buriedThreshold;

        warnings.push({ keyword, position: Math.round(position * 1000) / 1000, buried });
        idx = textLower.indexOf(needle, idx + needle.length);
      }
    }

    const buriedKeywords = warnings.filter((w) => w.buried).map((w) => w.keyword);
    const hasBuriedWarnings = buriedKeywords.length > 0;

    if (hasBuriedWarnings) {
      console.log(`[Agent Shield] HITL: Critical info buried at end of output — ${buriedKeywords.join(', ')}`);
    }

    return { warnings, hasBuriedWarnings };
  }
}
411
+
412
+ // =========================================================================
413
+ // HITLGuard — Unified Wrapper
414
+ // =========================================================================
415
+
416
/**
 * Human-in-the-Loop Guard: owns one instance of each defense layer and
 * offers a single call that runs the output-level checks together.
 */
class HITLGuard {
  /**
   * @param {object} [options]
   * @param {object} [options.approvalMonitor] - Options for ApprovalPatternMonitor
   * @param {object} [options.summarizationChecker] - Options for SummarizationIntegrityChecker
   * @param {object} [options.outputScanner] - Options for OutputInjectionScanner
   * @param {object} [options.readabilityScanner] - Options for ReadabilityScanner
   * @param {object} [options.positionChecker] - Options for CriticalInfoPositionChecker
   */
  constructor(options = {}) {
    const {
      approvalMonitor,
      summarizationChecker,
      outputScanner,
      readabilityScanner,
      positionChecker
    } = options;

    this.approvalMonitor = new ApprovalPatternMonitor(approvalMonitor);
    this.summarizationChecker = new SummarizationIntegrityChecker(summarizationChecker);
    this.outputScanner = new OutputInjectionScanner(outputScanner);
    this.readabilityScanner = new ReadabilityScanner(readabilityScanner);
    this.positionChecker = new CriticalInfoPositionChecker(positionChecker);
  }

  /**
   * Run the injection, readability and position checks on an agent output,
   * plus the summarization check when a source text is supplied. The
   * approval monitor is not consulted here.
   * @param {string} output - Agent output to check
   * @param {object} [context]
   * @param {string} [context.source] - Original source text for summarization check
   * @param {string[]} [context.actions] - Planned actions for readability check
   * @returns {{ safe: boolean, checks: object }} `checks` holds each layer's raw result
   *   under `injection`, `readability`, `position` and, if run, `summarization`
   */
  checkOutput(output, context = {}) {
    const checks = {
      injection: this.outputScanner.scan(output),
      readability: this.readabilityScanner.scan(output, context.actions),
      position: this.positionChecker.check(output)
    };

    // The summary comparison needs the original text; skip it otherwise.
    if (context.source) {
      checks.summarization = this.summarizationChecker.check(context.source, output);
    }

    // A skipped summarization check does not count against the output.
    const summaryAcceptable = !checks.summarization ||
      checks.summarization.integrity === 'high';

    const safe = checks.injection.safe &&
      !checks.readability.obfuscated &&
      !checks.position.hasBuriedWarnings &&
      summaryAcceptable;

    return { safe, checks };
  }
}
470
+
471
+ // =========================================================================
472
+ // EXPORTS
473
+ // =========================================================================
474
+
475
+ module.exports = {
476
+ HITLGuard,
477
+ ApprovalPatternMonitor,
478
+ SummarizationIntegrityChecker,
479
+ OutputInjectionScanner,
480
+ ReadabilityScanner,
481
+ CriticalInfoPositionChecker,
482
+ CRITICAL_KEYWORDS,
483
+ OUTPUT_INJECTION_PATTERNS,
484
+ HIGH_RISK_ACTIONS,
485
+ FATIGUE_APPROVAL_RATE,
486
+ DEFAULT_MANDATORY_REVIEW_INTERVAL
487
+ };