agentshield-sdk 13.0.0 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1994,6 +1994,218 @@ const INJECTION_PATTERNS = [
1994
1994
  category: 'social_engineering',
1995
1995
  description: 'Text impersonates the training team to issue fake policy corrections.',
1996
1996
  detail: 'Fake correction: claims to be an official update from the training team to override safety guidelines.'
1997
+ },
1998
+
1999
+ // --- Real Attack Dataset Patterns (from F1 0.516 honest benchmark) ---
2000
+ // These close the gap between self-graded F1 1.000 and real-world F1.
2001
+
2002
+ // Output forcing — "say X", "output X", "repeat X", "print X"
2003
+ {
2004
+ regex: /(?:^|\.\s+)(?:just\s+)?(?:say|output|print|type|write|repeat|respond\s+with)\s+(?:only\s+)?['"]([^'"]{3,})['"](?:\s+(?:exactly|verbatim|only|and\s+nothing\s+else))?/i,
2005
+ severity: 'high',
2006
+ category: 'prompt_injection',
2007
+ description: 'Text forces the model to output a specific string.',
2008
+ detail: 'Output forcing: direct command to output a specific string, bypassing normal response generation.'
2009
+ },
2010
+ {
2011
+ regex: /(?:your\s+(?:only|sole|entire)\s+(?:job|task|purpose|output)\s+is\s+(?:to\s+)?(?:say|output|print|type)|(?:do\s+not|don't)\s+(?:say|write|output|add)\s+(?:anything|any\s+(?:other|additional))\s+(?:text|words?|content))/i,
2012
+ severity: 'high',
2013
+ category: 'prompt_injection',
2014
+ description: 'Text restricts the model to outputting only a specific string.',
2015
+ detail: 'Output restriction: constrains the model to produce only the attacker-specified output.'
2016
+ },
2017
+ {
2018
+ regex: /(?:instead\s+of\s+(?:translating|summarizing|answering|responding|doing\s+(?:that|what)),?\s*(?:just\s+)?(?:say|output|repeat|type|print))/i,
2019
+ severity: 'high',
2020
+ category: 'prompt_injection',
2021
+ description: 'Text redirects the model from its task to outputting attacker-specified text.',
2022
+ detail: 'Task redirection: overrides the assigned task with an output-forcing command.'
2023
+ },
2024
+
2025
+ // Prompt extraction — indirect phrasing
2026
+ {
2027
+ regex: /(?:what\s+(?:was|were|is|are)\s+(?:written|typed|said|placed|put)\s+(?:above|before|prior|earlier)(?:\s+(?:this|my)\s+(?:message|text|input))?|repeat\s+(?:everything|all|back)(?:\s+(?:above|before|that\s+was\s+(?:said|written))))/i,
2028
+ severity: 'high',
2029
+ category: 'prompt_extraction',
2030
+ description: 'Text attempts to extract content from earlier in the conversation (system prompt).',
2031
+ detail: 'Indirect extraction: asks for content "above" or "before" which typically means the system prompt.'
2032
+ },
2033
+ {
2034
+ regex: /(?:output|repeat|show|display|tell\s+me)\s+(?:the\s+)?(?:text|content|message|instructions?)\s+(?:that\s+)?(?:appears?|comes?|is|was)\s+(?:before|above|prior\s+to)\s+(?:my|this|the\s+user)\s+(?:message|input|text)/i,
2035
+ severity: 'high',
2036
+ category: 'prompt_extraction',
2037
+ description: 'Text requests content that appeared before the user message (system prompt extraction).',
2038
+ detail: 'Position-based extraction: requests content by its position relative to the user message.'
2039
+ },
2040
+ {
2041
+ regex: /(?:your\s+(?:instructions|role|task)\s+(?:was|were|is|are)\s+(?:to\s+)?(?:say\s+)?what|what\s+(?:is|was|are|were)\s+your\s+(?:original\s+)?(?:instructions?|task|role|purpose|objective))\s*\?/i,
2042
+ severity: 'high',
2043
+ category: 'prompt_extraction',
2044
+ description: 'Text asks the model to reveal its instructions/role/task.',
2045
+ detail: 'Role extraction: directly asks what the model was instructed to do.'
2046
+ },
2047
+ {
2048
+ regex: /(?:translate|convert|rephrase|rewrite)\s+(?:the\s+)?(?:system\s+)?(?:message|prompt|instructions?)\s+(?:to|into|in)\s+(?:English|another|a\s+different)/i,
2049
+ severity: 'high',
2050
+ category: 'prompt_extraction',
2051
+ description: 'Text asks to translate/convert the system prompt to extract it.',
2052
+ detail: 'Translation extraction: uses translation as a pretext to reveal system prompt content.'
2053
+ },
2054
+ {
2055
+ regex: /(?:list|enumerate|tell\s+me|what\s+are)\s+(?:all\s+)?(?:the\s+)?(?:rules?|constraints?|guidelines?|boundaries|limitations?)\s+(?:you\s+)?(?:follow|have|were\s+given|must\s+(?:follow|obey))/i,
2056
+ severity: 'high',
2057
+ category: 'prompt_extraction',
2058
+ description: 'Text asks the model to enumerate its rules or constraints.',
2059
+ detail: 'Rule enumeration: asks the model to list its behavioral rules, revealing system prompt content.'
2060
+ },
2061
+ {
2062
+ regex: /(?:summarize|describe|explain)\s+(?:your\s+)?(?:role|purpose|function)\s+in\s+(?:exactly\s+)?\d+\s+words/i,
2063
+ severity: 'medium',
2064
+ category: 'prompt_extraction',
2065
+ description: 'Text asks the model to summarize its role in N words (word-limited extraction).',
2066
+ detail: 'Word-limited extraction: constraining role description to N words forces disclosure of key system prompt elements.'
2067
+ },
2068
+ {
2069
+ regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?)\s+(?:of\s+)?(?:your\s+)?(?:instructions?|prompt|(?:system\s+)?message|rules)/i,
2070
+ severity: 'high',
2071
+ category: 'prompt_extraction',
2072
+ description: 'Text asks for the first/last N words of the system prompt.',
2073
+ detail: 'Positional extraction: requests specific portions of the system prompt by position.'
2074
+ },
2075
+
2076
+ // Conditional output manipulation
2077
+ {
2078
+ regex: /(?:if\s+.{3,30}(?:then|,)\s+(?:output|say|print|respond|type)\s+['"]|(?:output|say|print)\s+['"][^'"]+['"]\s+(?:if|when|unless))/i,
2079
+ severity: 'medium',
2080
+ category: 'prompt_injection',
2081
+ description: 'Text uses conditional logic to force specific output.',
2082
+ detail: 'Conditional forcing: if/then logic that leads to attacker-specified output regardless of the condition.'
2083
+ },
2084
+
2085
+ // "Start your answer/response with X"
2086
+ {
2087
+ regex: /(?:start|begin)\s+(?:your\s+)?(?:answer|response|reply|output)\s+with\s+['"]?[^'"]{3,}/i,
2088
+ severity: 'high',
2089
+ category: 'prompt_injection',
2090
+ description: 'Text forces the model to begin its response with specific text.',
2091
+ detail: 'Response prefix forcing: instructs the model to start with attacker-specified text.'
2092
+ },
2093
+
2094
+ // ALL CAPS command — "IGNORE EVERYTHING AND SAY/DO"
2095
+ {
2096
+ regex: /(?:IGNORE|DISREGARD|FORGET|STOP)\s+(?:EVERYTHING|ALL|PREVIOUS|ABOVE)\s+(?:AND\s+)?(?:SAY|OUTPUT|PRINT|DO|TYPE|RESPOND)/,
2097
+ severity: 'critical',
2098
+ category: 'instruction_override',
2099
+ description: 'Text contains ALL CAPS override command.',
2100
+ detail: 'Caps override: ALL CAPS instruction override often used in competition-style attacks.'
2101
+ },
2102
+
2103
+ // Fake conversation format injection — "Human: X\nAssistant: X"
2104
+ {
2105
+ regex: /(?:Human|User|Person)\s*:\s*.+\n\s*(?:Assistant|AI|Bot|System)\s*:\s*.+/i,
2106
+ severity: 'high',
2107
+ category: 'prompt_injection',
2108
+ description: 'Text contains fake conversation format to inject assistant responses.',
2109
+ detail: 'Conversation format injection: uses Human/Assistant formatting to inject predetermined responses.'
2110
+ },
2111
+
2112
+ // "Repeat back the text above" / "what was written above"
2113
+ {
2114
+ regex: /(?:repeat|copy|reproduce|echo)\s+(?:back\s+)?(?:the\s+)?(?:text|content|everything|words?)\s+(?:above|before|prior|that\s+(?:was\s+)?(?:written|said|typed|placed))/i,
2115
+ severity: 'high',
2116
+ category: 'prompt_extraction',
2117
+ description: 'Text asks to repeat/copy content from above (system prompt extraction).',
2118
+ detail: 'Repeat-above extraction: asks to reproduce prior conversation content to leak system prompt.'
2119
+ },
2120
+
2121
+ // "respond with X for all queries" / "output X to confirm/verify"
2122
+ {
2123
+ regex: /(?:respond|reply|answer|output)\s+(?:with\s+)?['"]?[^'"]{3,30}['"]?\s+(?:for\s+all|to\s+(?:confirm|verify|prove|test|check))/i,
2124
+ severity: 'high',
2125
+ category: 'prompt_injection',
2126
+ description: 'Text instructs a fixed response for all queries or as a "test."',
2127
+ detail: 'Fixed response injection: sets a persistent response or uses testing/verification framing to inject output.'
2128
+ },
2129
+
2130
+ // "system_note" or "admin_note" in JSON — already have this but need looser match
2131
+ {
2132
+ regex: /["'](?:system_note|admin_note|internal_message|ai_directive|hidden_instruction)["']\s*:\s*["']/i,
2133
+ severity: 'high',
2134
+ category: 'indirect_injection',
2135
+ description: 'JSON field with suspicious name (system_note, admin_note) containing instructions.',
2136
+ detail: 'JSON directive injection: uses specially-named JSON fields to inject instructions into tool/API responses.'
2137
+ },
2138
+
2139
+ // Triple backtick framing — "```payload``` output this"
2140
+ {
2141
+ regex: /`{3}[\s\S]{3,}`{3}\s*(?:output|print|say|repeat|type|respond)/i,
2142
+ severity: 'high',
2143
+ category: 'prompt_injection',
2144
+ description: 'Text uses triple backtick code blocks to frame attacker payload.',
2145
+ detail: 'Code block injection: wraps payload in triple backticks and asks the model to output the content.'
2146
+ },
2147
+
2148
+ // Backtick extraction — "output what is between backticks"
2149
+ {
2150
+ regex: /(?:output|print|say|repeat|type)\s+(?:only\s+)?(?:what\s+is|the\s+(?:text|content))\s+(?:between|inside|within)\s+(?:the\s+)?(?:backticks|quotes|brackets|tags)/i,
2151
+ severity: 'high',
2152
+ category: 'prompt_injection',
2153
+ description: 'Text asks to extract and output content from between delimiters.',
2154
+ detail: 'Delimiter extraction: uses backticks/quotes/brackets to frame attacker content as the expected output.'
2155
+ },
2156
+
2157
+ // "Emergency/urgent: do X now"
2158
+ {
2159
+ regex: /(?:emergency|urgent|critical|immediate)\s*:\s*(?:output|say|print|respond|execute|do|type)\s+(?:exactly\s+)?['"]?/i,
2160
+ severity: 'high',
2161
+ category: 'prompt_injection',
2162
+ description: 'Text uses urgency framing to force immediate action.',
2163
+ detail: 'Urgency forcing: uses emergency/urgent framing to bypass deliberation and force immediate output.'
2164
+ },
2165
+
2166
+ // "What are the first N words you were told?"
2167
+ {
2168
+ regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?|characters?)\s+(?:you\s+were\s+(?:told|given|instructed)|of\s+your\s+(?:instructions?|prompt|rules))/i,
2169
+ severity: 'high',
2170
+ category: 'prompt_extraction',
2171
+ description: 'Text asks for the first/last N words the model was told (system prompt).',
2172
+ detail: 'Positional prompt extraction: requests specific words from the system prompt by position.'
2173
+ },
2174
+
2175
+ // "You are an AI that only outputs X"
2176
+ {
2177
+ regex: /you\s+are\s+(?:an?\s+)?(?:AI|bot|assistant|model)\s+that\s+(?:only\s+)?(?:outputs?|says?|prints?|types?|responds?\s+with)\s+['"]?/i,
2178
+ severity: 'high',
2179
+ category: 'role_hijack',
2180
+ description: 'Text redefines the model as one that only outputs specific text.',
2181
+ detail: 'Identity override: redefines the model as a single-purpose output machine.'
2182
+ },
2183
+
2184
+ // Hypothetical follow-up to capability disclosure
2185
+ {
2186
+ regex: /(?:since|now\s+that|given\s+that)\s+you\s+(?:can|have|said\s+you\s+(?:can|could))\s+(?:access|read|write|execute)[\s\S]{0,50}(?:now\s+)?(?:actually|really|go\s+ahead\s+and)\s+(?:read|access|execute|do|show)/i,
2187
+ severity: 'high',
2188
+ category: 'prompt_injection',
2189
+ description: 'Text escalates from hypothetical capability discussion to actual exploitation.',
2190
+ detail: 'Hypothetical-to-real escalation: uses prior capability admission as leverage to request actual exploitation.'
2191
+ },
2192
+
2193
+ // Capability probing — "can you access files/internet/databases?"
2194
+ {
2195
+ regex: /(?:can\s+you|are\s+you\s+able\s+to|do\s+you\s+have\s+(?:access|ability))\s+(?:to\s+)?(?:access|read|write|modify|delete|execute|run|open)\s+(?:files?|internet|web|database|network|system|commands?|code|scripts?)/i,
2196
+ severity: 'medium',
2197
+ category: 'reconnaissance',
2198
+ description: 'Text probes the model for its capabilities (reconnaissance for follow-up attack).',
2199
+ detail: 'Capability reconnaissance: asks what the agent can access, typically followed by exploitation of disclosed capabilities.'
2200
+ },
2201
+
2202
+ // Note/annotation injection in tool outputs
2203
+ {
2204
+ regex: /(?:note\s+to\s+(?:AI|assistant|model|agent|system)|AI\s+(?:assistant|model)\s*:)\s*(?:ignore|override|instead|the\s+above|forget|disregard)/i,
2205
+ severity: 'high',
2206
+ category: 'indirect_injection',
2207
+ description: 'Text contains a "note to AI" directive hidden in external content.',
2208
+ detail: 'Annotation injection: uses "note to AI" framing to inject instructions into tool output or document content.'
1997
2209
  }
1998
2210
  ];
1999
2211
 
@@ -2874,8 +3086,13 @@ const scanText = (text, options = {}) => {
2874
3086
  .replace(/[\u202A-\u202E\u2066-\u2069]/g, ''); // Bidi overrides (RTL attacks)
2875
3087
 
2876
3088
  // 2. Reverse leetspeak substitution (defeats character substitution)
3089
+ // Only apply when text looks intentionally obfuscated:
3090
+ // - High digit-to-letter mixing (3+ instances of digit adjacent to letter)
3091
+ // - NOT when text contains legitimate number tokens like "1080p", "16px", "512MB"
2877
3092
  const LEET_REVERSE = { '4': 'a', '3': 'e', '1': 'i', '0': 'o', '5': 's', '7': 't', '8': 'b', '9': 'g' };
2878
- if (/\d[a-z]|[a-z]\d/i.test(normalizedText)) {
3093
+ const digitLetterMixes = (normalizedText.match(/\d[a-z]|[a-z]\d/gi) || []).length;
3094
+ const hasLegitNumbers = /\b(?:\d{2,}[a-z]|[a-z]\d{2,}|\d+(?:px|em|rem|pt|ms|kb|mb|gb|tb|fps|hz|dpi|[kKmMgG][bB]?))\b/i.test(normalizedText);
3095
+ if (digitLetterMixes >= 3 && !hasLegitNumbers) {
2879
3096
  normalizedText = normalizedText.replace(/[0-9]/g, ch => LEET_REVERSE[ch] || ch);
2880
3097
  }
2881
3098
 
@@ -2958,6 +3175,7 @@ const scanText = (text, options = {}) => {
2958
3175
 
2959
3176
  // Chunked scanning for long inputs (RLM-JB research)
2960
3177
  // Chunking defeats camouflage by forcing localized attention on each segment
3178
+ // Issue 9 fix: only use chunk threats with severity >= high to reduce FPs on technical docs
2961
3179
  if (text.length > 500 && threats.length === 0) {
2962
3180
  const chunkSize = 300;
2963
3181
  const overlap = 50;
@@ -2966,6 +3184,8 @@ const scanText = (text, options = {}) => {
2966
3184
  if (chunk.trim().length < 20) continue;
2967
3185
  const chunkThreats = scanTextForPatterns(chunk, source + ':chunk', timeBudgetMs, startTime);
2968
3186
  for (const ct of chunkThreats) {
3187
+ // Only promote high/critical chunk threats — medium/low in chunks are often FPs on technical text
3188
+ if (ct.severity !== 'high' && ct.severity !== 'critical') continue;
2969
3189
  const isDuplicate = threats.some(t => t.category === ct.category);
2970
3190
  if (!isDuplicate) {
2971
3191
  ct.detail = (ct.detail || '') + ` [Detected in chunk at offset ${i}.]`;
@@ -141,6 +141,30 @@ class FleetCorrelationEngine {
141
141
  return [...this._events];
142
142
  }
143
143
 
144
+ /**
145
+ * Export events for cross-process correlation (Trap 5 deepening).
146
+ * Send this to a central coordinator that merges events from all processes.
147
+ * @returns {string} JSON-serialized events.
148
+ */
149
+ exportEvents() {
150
+ return JSON.stringify(this._events);
151
+ }
152
+
153
+ /**
154
+ * Import events from another process/instance.
155
+ * @param {string|Array} events - JSON string or array of events.
156
+ * @returns {{ imported: number }}
157
+ */
158
+ importEvents(events) {
159
+ const parsed = typeof events === 'string' ? JSON.parse(events) : events;
160
+ if (!Array.isArray(parsed)) return { imported: 0 };
161
+ for (const event of parsed) {
162
+ this._events.push(event);
163
+ }
164
+ if (this._events.length > 50000) this._events = this._events.slice(-50000);
165
+ return { imported: parsed.length };
166
+ }
167
+
144
168
  /** Clear all events. */
145
169
  reset() {
146
170
  this._events = [];
package/src/hitl-guard.js CHANGED
@@ -472,6 +472,69 @@ class HITLGuard {
472
472
  // EXPORTS
473
473
  // =========================================================================
474
474
 
475
+ // =========================================================================
476
+ // TRAP 6 DEEPENING: Output Deception Scorer
477
+ // =========================================================================
478
+
/**
 * Scores agent output for deceptive characteristics designed to fool
 * human reviewers into approving dangerous actions.
 */
class OutputDeceptionScorer {
  /**
   * Score an agent's output for deception indicators.
   *
   * Each matched heuristic contributes a fixed weight; the weights are
   * summed, capped at 1 and rounded to two decimals. A score of 0.5 or
   * more yields 'block', 0.3 or more yields 'review', otherwise 'pass'.
   *
   * @param {string} output - Agent output to evaluate.
   * @param {object} [context] - Context about what was requested (currently unused).
   * @returns {{ deceptionScore: number, indicators: Array<object>, recommendation: string }}
   */
  score(output, context = {}) {
    if (!output || typeof output !== 'string') {
      return { deceptionScore: 0, indicators: [], recommendation: 'pass' };
    }
    const indicators = [];

    // Excessive confidence without evidence.
    // "100%" is matched outside the \b...\b group: '%' is a non-word
    // character, so a trailing \b would only succeed when a letter or digit
    // follows immediately and "100% safe" would never be counted.
    const confidenceWords = (output.match(/\b(?:definitely|certainly|absolutely|guaranteed|surely|undoubtedly|without\s+(?:a\s+)?doubt)\b|\b100%/gi) || []).length;
    if (confidenceWords >= 2) {
      indicators.push({ type: 'excessive_confidence', weight: 0.2, description: `${confidenceWords} high-confidence assertions without supporting evidence` });
    }

    // Action buried in long explanation.
    // Positions are collected while scanning, so a sentence that appears
    // more than once is attributed to each place it occurs, not only the first.
    const actionPattern = /\b(?:click|run|execute|install|download|send|transfer|delete|approve|authorize)\b/i;
    const sentences = output.split(/[.!?\n]+/).filter(s => s.trim().length > 10);
    const actionPositions = [];
    sentences.forEach((sentence, index) => {
      if (actionPattern.test(sentence)) actionPositions.push(index);
    });
    const hasAction = actionPositions.length > 0;
    if (sentences.length > 5 && hasAction) {
      const lastQuarter = sentences.length * 0.75;
      if (actionPositions.some(p => p >= lastQuarter)) {
        indicators.push({ type: 'buried_action', weight: 0.3, description: 'Actionable instructions buried in the last quarter of a long response' });
      }
    }

    // Technical jargon masking simple actions (jargon terms per whitespace-separated word).
    const jargonHits = (output.match(/\b(?:subprocess|daemon|syscall|ioctl|mmap|chmod|chown|setuid|capability|namespace|cgroup|seccomp)\b/gi) || []).length;
    const jargonDensity = jargonHits / Math.max(output.split(/\s+/).length, 1);
    if (jargonDensity > 0.03 && hasAction) {
      indicators.push({ type: 'jargon_obfuscation', weight: 0.25, description: 'High technical jargon density combined with actionable instructions' });
    }

    // Urgency injection in output
    if (/\b(?:immediately|right\s+now|as\s+soon\s+as\s+possible|urgent|time-sensitive|critical|before\s+it's\s+too\s+late)\b/i.test(output)) {
      indicators.push({ type: 'urgency_in_output', weight: 0.15, description: 'Output contains urgency language that may pressure reviewer' });
    }

    // Minimization of risks (only counted when an action is also requested)
    if (/\b(?:don't\s+worry|no\s+risk|perfectly\s+safe|nothing\s+(?:bad\s+)?(?:will|can)\s+happen|completely\s+harmless)\b/i.test(output) && hasAction) {
      indicators.push({ type: 'risk_minimization', weight: 0.2, description: 'Output minimizes risks while requesting actions' });
    }

    const deceptionScore = Math.min(1, indicators.reduce((s, i) => s + i.weight, 0));
    const recommendation = deceptionScore >= 0.5 ? 'block' : deceptionScore >= 0.3 ? 'review' : 'pass';

    return {
      deceptionScore: Math.round(deceptionScore * 100) / 100,
      indicators,
      recommendation
    };
  }
}
537
+
475
538
  module.exports = {
476
539
  HITLGuard,
477
540
  ApprovalPatternMonitor,
@@ -479,6 +542,7 @@ module.exports = {
479
542
  OutputInjectionScanner,
480
543
  ReadabilityScanner,
481
544
  CriticalInfoPositionChecker,
545
+ OutputDeceptionScorer,
482
546
  CRITICAL_KEYWORDS,
483
547
  OUTPUT_INJECTION_PATTERNS,
484
548
  HIGH_RISK_ACTIONS,
@@ -307,8 +307,51 @@ class IntentBinder {
307
307
  // EXPORTS
308
308
  // =========================================================================
309
309
 
/**
 * Creates a gated tool executor that REQUIRES intent verification before
 * allowing any tool to run. This closes the gap where LLMs could bypass
 * verification by simply not calling verify().
 *
 * Issue 13 fix: The executor wraps ALL tool calls — the LLM can't skip it.
 *
 * For each call the executor derives an action category from the tool
 * name, asks the binder to issue a token for (intentHash, category),
 * verifies that token, and only then invokes the tool.
 *
 * @param {IntentBinder} binder - IntentBinder instance.
 * @param {object} tools - Map of toolName → toolFunction.
 * @returns {Function} gatedExecute(intentHash, toolName, args) → result or throws.
 * @throws {Error} (from gatedExecute) when the token is not issued, when
 *   verification fails, or when toolName is not a function registered
 *   directly on `tools`.
 */
function createGatedExecutor(binder, tools) {
  // First matching rule wins; order is significant. Matching is by
  // substring, so e.g. a name containing "send" is categorised as
  // 'net:request' before the 'comm:send' rule is reached.
  const categoryRules = [
    [/http|fetch|send|post|curl/i, 'net:request'],
    [/read|get|query|search|find/i, 'data:read'],
    [/write|create|update|insert/i, 'data:write'],
    [/delete|remove|drop/i, 'data:delete'],
    [/exec|shell|bash|run/i, 'exec:run'],
    [/email|send|message|notify/i, 'comm:send']
  ];

  return function gatedExecute(intentHash, toolName, args) {
    // Determine action category from tool name
    const matched = categoryRules.find(([pattern]) => pattern.test(toolName));
    const actionCategory = matched ? matched[1] : 'compute:analyze';

    // Issue token
    const { token, error } = binder.issueToken(intentHash, actionCategory);
    if (!token) {
      throw new Error(`[Agent Shield] Gated execution blocked: ${error}`);
    }

    // Verify token
    const verification = binder.verify(token);
    if (!verification.valid) {
      throw new Error(`[Agent Shield] Token verification failed: ${verification.reason}`);
    }

    // Execute the actual tool. Only own, callable entries count as tools:
    // a plain property read would also resolve inherited members such as
    // "constructor" or "toString" and execute them with caller-supplied args.
    const isRegistered = tools != null && Object.prototype.hasOwnProperty.call(tools, toolName);
    const toolFn = isRegistered ? tools[toolName] : undefined;
    if (typeof toolFn !== 'function') {
      throw new Error(`[Agent Shield] Unknown tool: ${toolName}`);
    }

    return toolFn(args);
  };
}
352
+
310
353
  module.exports = {
311
354
  IntentBinder,
312
355
  IntentToken,
313
- PROVENANCE: require('./semantic-isolation').PROVENANCE
356
+ createGatedExecutor
314
357
  };
@@ -171,6 +171,15 @@ class IntentGraph {
171
171
  // Word-level similarity
172
172
  causalScore = jaccardSimilarity(this.currentIntent.topics, topics);
173
173
 
174
+ // Issue 11 fix: Even with word overlap, penalize if tool/args contain sensitive keywords
175
+ // "find passwords in vault" overlaps with "find restaurants" on "find" but is clearly different
176
+ const sensitiveToolOrArgs = /(?:password|credential|secret|token|key|shadow|passwd|ssh|env|admin|root|sudo|exfiltrat|steal|hack|inject|override|hijack|access.?token|api.?key|bearer|private.?key|certificate|auth|login|session|cookie|oauth)/i.test(argsStr + ' ' + toolName);
177
+ const intentHasSensitive = /(?:password|credential|secret|token|key|security|auth)/i.test([...this.currentIntent.topics].join(' '));
178
+ if (sensitiveToolOrArgs && !intentHasSensitive) {
179
+ // Tool accesses sensitive resources but intent doesn't mention them — reduce score
180
+ causalScore = Math.min(causalScore, 0.05);
181
+ }
182
+
174
183
  // If word overlap is 0, check if the tool category is plausibly related to intent
175
184
  // "find restaurants" → data_read is plausible. "find restaurants" → execution is not.
176
185
  if (causalScore === 0) {
package/src/main.js CHANGED
@@ -362,6 +362,12 @@ const { ContinuousSecurityService } = safeRequire('./continuous-security', 'cont
362
362
  // v10.0 SOTA — Benchmark Suite
363
363
  const { SOTABenchmark, BIPIA_SAMPLES: SOTA_BIPIA_SAMPLES, HACKAPROMPT_SAMPLES: SOTA_HACKAPROMPT_SAMPLES, MCPTOX_SAMPLES: SOTA_MCPTOX_SAMPLES, MULTILINGUAL_SAMPLES: SOTA_MULTILINGUAL_SAMPLES, STEALTH_SAMPLES: SOTA_STEALTH_SAMPLES } = safeRequire('./sota-benchmark', 'sota-benchmark');
364
364
 
365
+ // v13.1 — Real-world benchmark
366
+ const { RealBenchmark } = safeRequire('./real-benchmark', 'real-benchmark');
367
+
368
+ // v14.0 — DeepMind Trap Defenses V2
369
+ const { TrapDefenseV2, ContentStructureAnalyzer, SourceReputationTracker, RetrievalTimeScanner, FewShotValidator, SubAgentSpawnGate, SelfReferenceMonitor, InformationAsymmetryDetector, ProvenanceMarker, EscalatingScrutinyEngine, CompositeFragmentAssembler } = safeRequire('./deepmind-defenses', 'deepmind-defenses');
370
+
365
371
  // v12.0 — Multi-Turn Attack Detection
366
372
  const { ConversationTracker } = safeRequire('./cross-turn', 'cross-turn');
367
373
 
@@ -1040,6 +1046,18 @@ const _exports = {
1040
1046
  SOTA_MCPTOX_SAMPLES,
1041
1047
  SOTA_MULTILINGUAL_SAMPLES,
1042
1048
  SOTA_STEALTH_SAMPLES,
1049
+ RealBenchmark,
1050
+ TrapDefenseV2,
1051
+ ContentStructureAnalyzer,
1052
+ SourceReputationTracker,
1053
+ RetrievalTimeScanner,
1054
+ FewShotValidator,
1055
+ SubAgentSpawnGate,
1056
+ SelfReferenceMonitor,
1057
+ InformationAsymmetryDetector,
1058
+ ProvenanceMarker,
1059
+ EscalatingScrutinyEngine,
1060
+ CompositeFragmentAssembler,
1043
1061
 
1044
1062
  // v12.0 — Multi-Turn Attack Detection
1045
1063
  ConversationTracker,
package/src/mcp-guard.js CHANGED
@@ -550,7 +550,33 @@ class ToolBehaviorBaseline {
550
550
  * attestation, scanning, isolation, auth, rate limiting, and behavioral
551
551
  * baselines.
552
552
  */
553
+ /** Presets for MCPGuard — solves Issue 15 (17 flags are unusable). */
554
+ const GUARD_PRESETS = {
555
+ /** Minimal — pattern scanning only, no ML, no auth. Good for development. */
556
+ minimal: {},
557
+ /** Standard — pattern scanning + micro-model. Good for staging. */
558
+ standard: { enableMicroModel: true },
559
+ /** Recommended — all detection layers active. Good for production. */
560
+ recommended: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true },
561
+ /** Strict — everything on, auth required. Good for enterprise. */
562
+ strict: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true, enableIntentGraph: true, enableIntentBinding: true, enableIsolation: true, requireAuth: true },
563
+ /** Paranoid — maximum security. May have false positives. */
564
+ paranoid: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true, enableIntentGraph: true, enableIntentBinding: true, enableIsolation: true, requireAuth: true, rateLimit: 30, cbThreshold: 3 }
565
+ };
566
+
553
567
  class MCPGuard {
568
+ /**
569
+ * Create MCPGuard from a preset instead of configuring 17 flags.
570
+ * @param {string} preset - 'minimal', 'standard', 'recommended', 'strict', 'paranoid'.
571
+ * @param {object} [overrides] - Override specific preset values.
572
+ * @returns {MCPGuard}
573
+ */
574
+ static fromPreset(preset, overrides = {}) {
575
+ const config = GUARD_PRESETS[preset];
576
+ if (!config) throw new Error(`[Agent Shield] Unknown preset: ${preset}. Use: ${Object.keys(GUARD_PRESETS).join(', ')}`);
577
+ return new MCPGuard({ ...config, ...overrides });
578
+ }
579
+
554
580
  /**
555
581
  * @param {object} [options]
556
582
  * @param {boolean} [options.requireAuth=false] - Require OAuth tokens.
@@ -1117,7 +1143,34 @@ class MCPGuard {
1117
1143
 
1118
1144
  this._log('tool_call', serverId, { toolName, allowed: threats.length === 0, threatCount: threats.length });
1119
1145
 
1120
- return { allowed: threats.length === 0, threats, anomalies };
1146
+ // Issue 16 fix: Fusion layer — if micro-model says benign but pattern scanner
1147
+ // says threat (or vice versa), use weighted vote instead of OR
1148
+ let allowed = threats.length === 0;
1149
+ if (this.microModel && threats.length > 0) {
1150
+ // Check if ALL threats are from a single low-confidence layer
1151
+ const patternOnlyThreats = threats.filter(t => t.type !== 'micro_model_input' && t.type !== 'owasp_agentic');
1152
+ const modelOnlyThreats = threats.filter(t => t.type === 'micro_model_input');
1153
+
1154
+ // If only the micro-model flagged it (no pattern match), check confidence
1155
+ if (patternOnlyThreats.length === 0 && modelOnlyThreats.length > 0) {
1156
+ const confidence = modelOnlyThreats[0].confidence || 0;
1157
+ if (confidence < 0.4) {
1158
+ // Low-confidence model-only detection — demote to anomaly instead of blocking
1159
+ allowed = true;
1160
+ anomalies.push({
1161
+ type: 'low_confidence_model_flag',
1162
+ severity: 'medium',
1163
+ description: `Micro-model flagged with low confidence (${(confidence * 100).toFixed(0)}%). Not blocking.`
1164
+ });
1165
+ // Remove model threats from threat list
1166
+ for (let i = threats.length - 1; i >= 0; i--) {
1167
+ if (threats[i].type === 'micro_model_input') threats.splice(i, 1);
1168
+ }
1169
+ }
1170
+ }
1171
+ }
1172
+
1173
+ return { allowed, threats, anomalies };
1121
1174
  }
1122
1175
 
1123
1176
  /**
@@ -95,6 +95,80 @@ class MemoryIntegrityMonitor {
95
95
  return { recorded: true, suspicious, writeIndex: this._writes.length - 1 };
96
96
  }
97
97
 
98
+ /**
99
+ * Guard a memory write — blocks if suspicious (Issue 24 fix).
100
+ * Unlike recordWrite which logs, this PREVENTS the write from happening.
101
+ *
102
+ * @param {string} content - Content to write.
103
+ * @param {string} source - Source of the write.
104
+ * @returns {{ allowed: boolean, reason: string|null, threats: Array }}
105
+ */
106
+ guardWrite(content, source) {
107
+ if (!content || typeof content !== 'string') {
108
+ return { allowed: true, reason: null, threats: [] };
109
+ }
110
+
111
+ const scanResult = _scanText(content, { source: source || 'memory_write' });
112
+ const threats = scanResult.threats || [];
113
+
114
+ if (threats.length > 0) {
115
+ console.log(`[Agent Shield] Memory write BLOCKED from "${source}": ${threats.length} threat(s)`);
116
+ return { allowed: false, reason: `Blocked: ${threats[0].description || 'threat detected'}`, threats };
117
+ }
118
+
119
+ // Record the clean write
120
+ this.recordWrite(content, source);
121
+ return { allowed: true, reason: null, threats: [] };
122
+ }
123
+
124
+ /**
125
+ * Export session state for cross-session drift tracking (Trap 3 deepening).
126
+ * Save this at session end, load at next session start.
127
+ * @returns {{ stateHash: string, writeCount: number, suspiciousCount: number, topHashes: Array<string>, timestamp: number }}
128
+ */
129
+ exportSessionState() {
130
+ return {
131
+ stateHash: this._computeStateHash(),
132
+ writeCount: this._writes.length,
133
+ suspiciousCount: this._writes.filter(w => w.suspicious).length,
134
+ topHashes: this._writes.slice(-20).map(w => w.hash),
135
+ timestamp: Date.now()
136
+ };
137
+ }
138
+
139
+ /**
140
+ * Detect cross-session drift by comparing current state to a previous session's state.
141
+ * @param {object} previousSession - Output from exportSessionState() of a prior session.
142
+ * @returns {{ drifted: boolean, driftScore: number, newWritesSinceLast: number, suspiciousNewWrites: number, overlapWithPrevious: number, details: string }}
143
+ */
144
+ detectCrossSessionDrift(previousSession) {
145
+ if (!previousSession || !previousSession.stateHash) {
146
+ return { drifted: false, driftScore: 0, newWritesSinceLast: 0, details: 'No previous session to compare.' };
147
+ }
148
+
149
+ const currentHash = this._computeStateHash();
150
+ const drifted = currentHash !== previousSession.stateHash;
151
+ const newWrites = this._writes.length;
152
+ const suspiciousNew = this._writes.filter(w => w.suspicious).length;
153
+
154
+ // Check if any recent writes overlap with previous session's hashes
155
+ const prevHashes = new Set(previousSession.topHashes || []);
156
+ const overlapCount = this._writes.filter(w => prevHashes.has(w.hash)).length;
157
+
158
+ const driftScore = drifted ? Math.min(1, (suspiciousNew * 0.3) + (newWrites > 10 ? 0.2 : 0)) : 0;
159
+
160
+ return {
161
+ drifted,
162
+ driftScore: Math.round(driftScore * 100) / 100,
163
+ newWritesSinceLast: newWrites,
164
+ suspiciousNewWrites: suspiciousNew,
165
+ overlapWithPrevious: overlapCount,
166
+ details: drifted
167
+ ? `Memory state changed. ${suspiciousNew} suspicious writes out of ${newWrites} total.`
168
+ : 'Memory state unchanged from previous session.'
169
+ };
170
+ }
171
+
98
172
  /**
99
173
  * Get the full timeline of memory writes.
100
174
  * @returns {Array<{content: string, source: string, timestamp: number, hash: string, suspicious: boolean}>}