npm - agentshield-sdk - Versions diffs - 13.0.0 → 13.2.0 - Mend

agentshield-sdk 13.0.0 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/CHANGELOG.md +100 -0
package/README.md +74 -19
package/package.json +11 -2
package/src/attack-surface.js +1 -1
package/src/continuous-security.js +47 -4
package/src/deepmind-defenses.js +468 -0
package/src/detector-core.js +221 -1
package/src/fleet-defense.js +24 -0
package/src/hitl-guard.js +64 -0
package/src/intent-binding.js +44 -1
package/src/intent-graph.js +9 -0
package/src/main.js +18 -0
package/src/mcp-guard.js +54 -1
package/src/memory-guard.js +74 -0
package/src/micro-model.js +49 -4
package/src/real-benchmark.js +234 -0
package/src/self-training.js +67 -1
package/src/semantic-guard.js +41 -1
package/src/semantic-isolation.js +9 -1
package/src/trap-defense.js +112 -0

package/src/detector-core.js CHANGED Viewed

@@ -1994,6 +1994,218 @@ const INJECTION_PATTERNS = [
     category: 'social_engineering',
     description: 'Text impersonates the training team to issue fake policy corrections.',
     detail: 'Fake correction: claims to be an official update from the training team to override safety guidelines.'
+  },
+  // --- Real Attack Dataset Patterns (from F1 0.516 honest benchmark) ---
+  // These close the gap between self-graded F1 1.000 and real-world F1.
+  // Output forcing — "say X", "output X", "repeat X", "print X"
+  {
+    regex: /(?:^|\.\s+)(?:just\s+)?(?:say|output|print|type|write|repeat|respond\s+with)\s+(?:only\s+)?['"]([^'"]{3,})['"](?:\s+(?:exactly|verbatim|only|and\s+nothing\s+else))?/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text forces the model to output a specific string.',
+    detail: 'Output forcing: direct command to output a specific string, bypassing normal response generation.'
+  },
+  {
+    regex: /(?:your\s+(?:only|sole|entire)\s+(?:job|task|purpose|output)\s+is\s+(?:to\s+)?(?:say|output|print|type)|(?:do\s+not|don't)\s+(?:say|write|output|add)\s+(?:anything|any\s+(?:other|additional))\s+(?:text|words?|content))/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text restricts the model to outputting only a specific string.',
+    detail: 'Output restriction: constrains the model to produce only the attacker-specified output.'
+  },
+  {
+    regex: /(?:instead\s+of\s+(?:translating|summarizing|answering|responding|doing\s+(?:that|what)),?\s*(?:just\s+)?(?:say|output|repeat|type|print))/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text redirects the model from its task to outputting attacker-specified text.',
+    detail: 'Task redirection: overrides the assigned task with an output-forcing command.'
+  },
+  // Prompt extraction — indirect phrasing
+  {
+    regex: /(?:what\s+(?:was|were|is|are)\s+(?:written|typed|said|placed|put)\s+(?:above|before|prior|earlier)(?:\s+(?:this|my)\s+(?:message|text|input))?|repeat\s+(?:everything|all|back)(?:\s+(?:above|before|that\s+was\s+(?:said|written))))/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text attempts to extract content from earlier in the conversation (system prompt).',
+    detail: 'Indirect extraction: asks for content "above" or "before" which typically means the system prompt.'
+  },
+  {
+    regex: /(?:output|repeat|show|display|tell\s+me)\s+(?:the\s+)?(?:text|content|message|instructions?)\s+(?:that\s+)?(?:appears?|comes?|is|was)\s+(?:before|above|prior\s+to)\s+(?:my|this|the\s+user)\s+(?:message|input|text)/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text requests content that appeared before the user message (system prompt extraction).',
+    detail: 'Position-based extraction: requests content by its position relative to the user message.'
+  },
+  {
+    regex: /(?:your\s+(?:instructions|role|task)\s+(?:was|were|is|are)\s+(?:to\s+)?(?:say\s+)?what|what\s+(?:is|was|are|were)\s+your\s+(?:original\s+)?(?:instructions?|task|role|purpose|objective))\s*\?/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks the model to reveal its instructions/role/task.',
+    detail: 'Role extraction: directly asks what the model was instructed to do.'
+  },
+  {
+    regex: /(?:translate|convert|rephrase|rewrite)\s+(?:the\s+)?(?:system\s+)?(?:message|prompt|instructions?)\s+(?:to|into|in)\s+(?:English|another|a\s+different)/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks to translate/convert the system prompt to extract it.',
+    detail: 'Translation extraction: uses translation as a pretext to reveal system prompt content.'
+  },
+  {
+    regex: /(?:list|enumerate|tell\s+me|what\s+are)\s+(?:all\s+)?(?:the\s+)?(?:rules?|constraints?|guidelines?|boundaries|limitations?)\s+(?:you\s+)?(?:follow|have|were\s+given|must\s+(?:follow|obey))/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks the model to enumerate its rules or constraints.',
+    detail: 'Rule enumeration: asks the model to list its behavioral rules, revealing system prompt content.'
+  },
+  {
+    regex: /(?:summarize|describe|explain)\s+(?:your\s+)?(?:role|purpose|function)\s+in\s+(?:exactly\s+)?\d+\s+words/i,
+    severity: 'medium',
+    category: 'prompt_extraction',
+    description: 'Text asks the model to summarize its role in N words (word-limited extraction).',
+    detail: 'Word-limited extraction: constraining role description to N words forces disclosure of key system prompt elements.'
+  },
+  {
+    regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?)\s+(?:of\s+)?(?:your\s+)?(?:instructions?|prompt|(?:system\s+)?message|rules)/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks for the first/last N words of the system prompt.',
+    detail: 'Positional extraction: requests specific portions of the system prompt by position.'
+  },
+  // Conditional output manipulation
+  {
+    regex: /(?:if\s+.{3,30}(?:then|,)\s+(?:output|say|print|respond|type)\s+['"]|(?:output|say|print)\s+['"][^'"]+['"]\s+(?:if|when|unless))/i,
+    severity: 'medium',
+    category: 'prompt_injection',
+    description: 'Text uses conditional logic to force specific output.',
+    detail: 'Conditional forcing: if/then logic that leads to attacker-specified output regardless of the condition.'
+  },
+  // "Start your answer/response with X"
+  {
+    regex: /(?:start|begin)\s+(?:your\s+)?(?:answer|response|reply|output)\s+with\s+['"]?[^'"]{3,}/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text forces the model to begin its response with specific text.',
+    detail: 'Response prefix forcing: instructs the model to start with attacker-specified text.'
+  },
+  // ALL CAPS command — "IGNORE EVERYTHING AND SAY/DO"
+  {
+    regex: /(?:IGNORE|DISREGARD|FORGET|STOP)\s+(?:EVERYTHING|ALL|PREVIOUS|ABOVE)\s+(?:AND\s+)?(?:SAY|OUTPUT|PRINT|DO|TYPE|RESPOND)/,
+    severity: 'critical',
+    category: 'instruction_override',
+    description: 'Text contains ALL CAPS override command.',
+    detail: 'Caps override: ALL CAPS instruction override often used in competition-style attacks.'
+  },
+  // Fake conversation format injection — "Human: X\nAssistant: X"
+  {
+    regex: /(?:Human|User|Person)\s*:\s*.+\n\s*(?:Assistant|AI|Bot|System)\s*:\s*.+/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text contains fake conversation format to inject assistant responses.',
+    detail: 'Conversation format injection: uses Human/Assistant formatting to inject predetermined responses.'
+  },
+  // "Repeat back the text above" / "what was written above"
+  {
+    regex: /(?:repeat|copy|reproduce|echo)\s+(?:back\s+)?(?:the\s+)?(?:text|content|everything|words?)\s+(?:above|before|prior|that\s+(?:was\s+)?(?:written|said|typed|placed))/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks to repeat/copy content from above (system prompt extraction).',
+    detail: 'Repeat-above extraction: asks to reproduce prior conversation content to leak system prompt.'
+  },
+  // "respond with X for all queries" / "output X to confirm/verify"
+  {
+    regex: /(?:respond|reply|answer|output)\s+(?:with\s+)?['"]?[^'"]{3,30}['"]?\s+(?:for\s+all|to\s+(?:confirm|verify|prove|test|check))/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text instructs a fixed response for all queries or as a "test."',
+    detail: 'Fixed response injection: sets a persistent response or uses testing/verification framing to inject output.'
+  },
+  // "system_note" or "admin_note" in JSON — already have this but need looser match
+  {
+    regex: /["'](?:system_note|admin_note|internal_message|ai_directive|hidden_instruction)["']\s*:\s*["']/i,
+    severity: 'high',
+    category: 'indirect_injection',
+    description: 'JSON field with suspicious name (system_note, admin_note) containing instructions.',
+    detail: 'JSON directive injection: uses specially-named JSON fields to inject instructions into tool/API responses.'
+  },
+  // Triple backtick framing — "```payload``` output this"
+  {
+    regex: /`{3}[\s\S]{3,}`{3}\s*(?:output|print|say|repeat|type|respond)/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text uses triple backtick code blocks to frame attacker payload.',
+    detail: 'Code block injection: wraps payload in triple backticks and asks the model to output the content.'
+  },
+  // Backtick extraction — "output what is between backticks"
+  {
+    regex: /(?:output|print|say|repeat|type)\s+(?:only\s+)?(?:what\s+is|the\s+(?:text|content))\s+(?:between|inside|within)\s+(?:the\s+)?(?:backticks|quotes|brackets|tags)/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text asks to extract and output content from between delimiters.',
+    detail: 'Delimiter extraction: uses backticks/quotes/brackets to frame attacker content as the expected output.'
+  },
+  // "Emergency/urgent: do X now"
+  {
+    regex: /(?:emergency|urgent|critical|immediate)\s*:\s*(?:output|say|print|respond|execute|do|type)\s+(?:exactly\s+)?['"]?/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text uses urgency framing to force immediate action.',
+    detail: 'Urgency forcing: uses emergency/urgent framing to bypass deliberation and force immediate output.'
+  },
+  // "What are the first N words you were told?"
+  {
+    regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?|characters?)\s+(?:you\s+were\s+(?:told|given|instructed)|of\s+your\s+(?:instructions?|prompt|rules))/i,
+    severity: 'high',
+    category: 'prompt_extraction',
+    description: 'Text asks for the first/last N words the model was told (system prompt).',
+    detail: 'Positional prompt extraction: requests specific words from the system prompt by position.'
+  },
+  // "You are an AI that only outputs X"
+  {
+    regex: /you\s+are\s+(?:an?\s+)?(?:AI|bot|assistant|model)\s+that\s+(?:only\s+)?(?:outputs?|says?|prints?|types?|responds?\s+with)\s+['"]?/i,
+    severity: 'high',
+    category: 'role_hijack',
+    description: 'Text redefines the model as one that only outputs specific text.',
+    detail: 'Identity override: redefines the model as a single-purpose output machine.'
+  },
+  // Hypothetical follow-up to capability disclosure
+  {
+    regex: /(?:since|now\s+that|given\s+that)\s+you\s+(?:can|have|said\s+you\s+(?:can|could))\s+(?:access|read|write|execute)[\s\S]{0,50}(?:now\s+)?(?:actually|really|go\s+ahead\s+and)\s+(?:read|access|execute|do|show)/i,
+    severity: 'high',
+    category: 'prompt_injection',
+    description: 'Text escalates from hypothetical capability discussion to actual exploitation.',
+    detail: 'Hypothetical-to-real escalation: uses prior capability admission as leverage to request actual exploitation.'
+  },
+  // Capability probing — "can you access files/internet/databases?"
+  {
+    regex: /(?:can\s+you|are\s+you\s+able\s+to|do\s+you\s+have\s+(?:access|ability))\s+(?:to\s+)?(?:access|read|write|modify|delete|execute|run|open)\s+(?:files?|internet|web|database|network|system|commands?|code|scripts?)/i,
+    severity: 'medium',
+    category: 'reconnaissance',
+    description: 'Text probes the model for its capabilities (reconnaissance for follow-up attack).',
+    detail: 'Capability reconnaissance: asks what the agent can access, typically followed by exploitation of disclosed capabilities.'
+  },
+  // Note/annotation injection in tool outputs
+  {
+    regex: /(?:note\s+to\s+(?:AI|assistant|model|agent|system)|AI\s+(?:assistant|model)\s*:)\s*(?:ignore|override|instead|the\s+above|forget|disregard)/i,
+    severity: 'high',
+    category: 'indirect_injection',
+    description: 'Text contains a "note to AI" directive hidden in external content.',
+    detail: 'Annotation injection: uses "note to AI" framing to inject instructions into tool output or document content.'
   }
 ];
@@ -2874,8 +3086,13 @@ const scanText = (text, options = {}) => {
       .replace(/[\u202A-\u202E\u2066-\u2069]/g, '');                   // Bidi overrides (RTL attacks)
     // 2. Reverse leetspeak substitution (defeats character substitution)
+    // Only apply when text looks intentionally obfuscated:
+    // - High digit-to-letter mixing (3+ instances of digit adjacent to letter)
+    // - NOT when text contains legitimate numbers like "3D", "1080p", "H4X0R"
     const LEET_REVERSE = { '4': 'a', '3': 'e', '1': 'i', '0': 'o', '5': 's', '7': 't', '8': 'b', '9': 'g' };
-    if (/\d[a-z]|[a-z]\d/i.test(normalizedText)) {
+    const digitLetterMixes = (normalizedText.match(/\d[a-z]|[a-z]\d/gi) || []).length;
+    const hasLegitNumbers = /\b(?:\d{2,}[a-z]|[a-z]\d{2,}|\d+(?:px|em|rem|pt|ms|kb|mb|gb|tb|fps|hz|dpi|[kKmMgG][bB]?))\b/i.test(normalizedText);
+    if (digitLetterMixes >= 3 && !hasLegitNumbers) {
       normalizedText = normalizedText.replace(/[0-9]/g, ch => LEET_REVERSE[ch] || ch);
     }
@@ -2958,6 +3175,7 @@ const scanText = (text, options = {}) => {
   // Chunked scanning for long inputs (RLM-JB research)
   // Chunking defeats camouflage by forcing localized attention on each segment
+  // Issue 9 fix: only use chunk threats with severity >= high to reduce FPs on technical docs
   if (text.length > 500 && threats.length === 0) {
     const chunkSize = 300;
     const overlap = 50;
@@ -2966,6 +3184,8 @@ const scanText = (text, options = {}) => {
       if (chunk.trim().length < 20) continue;
       const chunkThreats = scanTextForPatterns(chunk, source + ':chunk', timeBudgetMs, startTime);
       for (const ct of chunkThreats) {
+        // Only promote high/critical chunk threats — medium/low in chunks are often FPs on technical text
+        if (ct.severity !== 'high' && ct.severity !== 'critical') continue;
         const isDuplicate = threats.some(t => t.category === ct.category);
         if (!isDuplicate) {
           ct.detail = (ct.detail || '') + ` [Detected in chunk at offset ${i}.]`;

package/src/fleet-defense.js CHANGED Viewed

@@ -141,6 +141,30 @@ class FleetCorrelationEngine {
     return [...this._events];
   }
+  /**
+   * Export events for cross-process correlation (Trap 5 deepening).
+   * Send this to a central coordinator that merges events from all processes.
+   * The payload is this engine's current event list serialized with
+   * JSON.stringify, which is a form importEvents() accepts as a string.
+   * @returns {string} JSON-serialized events.
+   */
+  exportEvents() {
+    return JSON.stringify(this._events);
+  }
+  /**
+   * Import events from another process/instance and append them to this
+   * engine's event list.
+   *
+   * The payload originates outside this process, so entries that cannot be
+   * events (null or primitive values) are skipped instead of being stored, and
+   * only the entries actually appended are counted.
+   *
+   * @param {string|Array} events - JSON string or array of events.
+   * @returns {{ imported: number }} Number of entries appended (0 when the
+   *   payload is not an array).
+   * @throws {SyntaxError} If `events` is a string that is not valid JSON.
+   */
+  importEvents(events) {
+    const parsed = typeof events === 'string' ? JSON.parse(events) : events;
+    if (!Array.isArray(parsed)) return { imported: 0 };
+    let imported = 0;
+    for (const event of parsed) {
+      // Ignore entries that are not objects; they carry no event fields.
+      if (event === null || typeof event !== 'object') continue;
+      this._events.push(event);
+      imported += 1;
+    }
+    // Keep only the newest 50,000 events so repeated imports cannot grow the list without bound.
+    if (this._events.length > 50000) this._events = this._events.slice(-50000);
+    return { imported };
+  }
   /** Clear all events. */
   reset() {
     this._events = [];

package/src/hitl-guard.js CHANGED Viewed

@@ -472,6 +472,69 @@ class HITLGuard {
 // EXPORTS
 // =========================================================================
+// =========================================================================
+// TRAP 6 DEEPENING: Output Deception Scorer
+// =========================================================================
+/**
+ * Scores agent output for deceptive characteristics designed to fool
+ * human reviewers into approving dangerous actions.
+ *
+ * Each heuristic that fires contributes a fixed weight; the weights are summed
+ * and capped at 1 to form the deception score.
+ */
+class OutputDeceptionScorer {
+  /**
+   * Score an agent's output for deception indicators.
+   * @param {string} output - Agent output to evaluate.
+   * @param {object} [context] - Context about what was requested (accepted for
+   *   API compatibility; not consulted by the current heuristics).
+   * @returns {{ deceptionScore: number, indicators: Array<object>, recommendation: string }}
+   *   `deceptionScore` is rounded to two decimals; `recommendation` is 'block'
+   *   at >= 0.5, 'review' at >= 0.3, otherwise 'pass'. Empty or non-string
+   *   output scores 0 with recommendation 'pass'.
+   */
+  score(output, context = {}) {
+    if (!output || typeof output !== 'string') return { deceptionScore: 0, indicators: [], recommendation: 'pass' };
+    const indicators = [];
+    // Excessive confidence without evidence.
+    // "100%" sits outside the \b...\b group: a trailing \b after "%" only matches
+    // when a word character follows, so "100% safe" would never be counted.
+    const confidenceWords = (output.match(/\b(?:definitely|certainly|absolutely|guaranteed|surely|undoubtedly|without\s+(?:a\s+)?doubt)\b|\b100%/gi) || []).length;
+    if (confidenceWords >= 2) {
+      indicators.push({ type: 'excessive_confidence', weight: 0.2, description: `${confidenceWords} high-confidence assertions without supporting evidence` });
+    }
+    // Action buried in long explanation.
+    // Positions are recorded while scanning so that a sentence repeated later in
+    // the output keeps its own index (indexOf would report the first occurrence).
+    const sentences = output.split(/[.!?\n]+/).filter(s => s.trim().length > 10);
+    const actionPattern = /\b(?:click|run|execute|install|download|send|transfer|delete|approve|authorize)\b/i;
+    const actionPositions = [];
+    sentences.forEach((sentence, position) => {
+      if (actionPattern.test(sentence)) actionPositions.push(position);
+    });
+    const hasAction = actionPositions.length > 0;
+    if (sentences.length > 5 && hasAction) {
+      const lastQuarter = sentences.length * 0.75;
+      if (actionPositions.some(p => p >= lastQuarter)) {
+        indicators.push({ type: 'buried_action', weight: 0.3, description: 'Actionable instructions buried in the last quarter of a long response' });
+      }
+    }
+    // Technical jargon masking simple actions (jargon terms per whitespace-separated word).
+    const jargonDensity = (output.match(/\b(?:subprocess|daemon|syscall|ioctl|mmap|chmod|chown|setuid|capability|namespace|cgroup|seccomp)\b/gi) || []).length / Math.max(output.split(/\s+/).length, 1);
+    if (jargonDensity > 0.03 && hasAction) {
+      indicators.push({ type: 'jargon_obfuscation', weight: 0.25, description: 'High technical jargon density combined with actionable instructions' });
+    }
+    // Urgency injection in output
+    if (/\b(?:immediately|right\s+now|as\s+soon\s+as\s+possible|urgent|time-sensitive|critical|before\s+it's\s+too\s+late)\b/i.test(output)) {
+      indicators.push({ type: 'urgency_in_output', weight: 0.15, description: 'Output contains urgency language that may pressure reviewer' });
+    }
+    // Minimization of risks (only counted when the output also asks for an action)
+    if (/\b(?:don't\s+worry|no\s+risk|perfectly\s+safe|nothing\s+(?:bad\s+)?(?:will|can)\s+happen|completely\s+harmless)\b/i.test(output) && hasAction) {
+      indicators.push({ type: 'risk_minimization', weight: 0.2, description: 'Output minimizes risks while requesting actions' });
+    }
+    const deceptionScore = Math.min(1, indicators.reduce((s, i) => s + i.weight, 0));
+    const recommendation = deceptionScore >= 0.5 ? 'block' : deceptionScore >= 0.3 ? 'review' : 'pass';
+    return {
+      deceptionScore: Math.round(deceptionScore * 100) / 100,
+      indicators,
+      recommendation
+    };
+  }
+}
 module.exports = {
   HITLGuard,
   ApprovalPatternMonitor,
@@ -479,6 +542,7 @@ module.exports = {
   OutputInjectionScanner,
   ReadabilityScanner,
   CriticalInfoPositionChecker,
+  OutputDeceptionScorer,
   CRITICAL_KEYWORDS,
   OUTPUT_INJECTION_PATTERNS,
   HIGH_RISK_ACTIONS,

package/src/intent-binding.js CHANGED Viewed

@@ -307,8 +307,51 @@ class IntentBinder {
 // EXPORTS
 // =========================================================================
+/**
+ * Creates a gated tool executor that REQUIRES intent verification before
+ * allowing any tool to run. This closes the gap where LLMs could bypass
+ * verification by simply not calling verify().
+ *
+ * Issue 13 fix: The executor wraps ALL tool calls — the LLM can't skip it.
+ *
+ * The action category is derived from substrings of the tool name: the first
+ * matching rule wins and a name that matches no rule is 'compute:analyze'.
+ * Only functions stored as own properties of `tools` can be executed, so
+ * inherited names such as "toString" or "constructor" are rejected as unknown.
+ *
+ * @param {IntentBinder} binder - IntentBinder instance.
+ * @param {object} tools - Map of toolName → toolFunction.
+ * @returns {Function} gatedExecute(intentHash, toolName, args) → result or throws.
+ *   Throws Error when the token is refused, fails verification, or the tool is
+ *   not a function registered in `tools`.
+ */
+function createGatedExecutor(binder, tools) {
+  // Ordered rules: earlier entries take precedence. A name containing "send"
+  // therefore resolves to 'net:request' and never reaches the 'comm:send' rule.
+  const categoryRules = [
+    [/http|fetch|send|post|curl/i, 'net:request'],
+    [/read|get|query|search|find/i, 'data:read'],
+    [/write|create|update|insert/i, 'data:write'],
+    [/delete|remove|drop/i, 'data:delete'],
+    [/exec|shell|bash|run/i, 'exec:run'],
+    [/email|send|message|notify/i, 'comm:send']
+  ];
+  return function gatedExecute(intentHash, toolName, args) {
+    // Determine action category from tool name
+    const matchedRule = categoryRules.find(([pattern]) => pattern.test(toolName));
+    const actionCategory = matchedRule ? matchedRule[1] : 'compute:analyze';
+    // Issue token
+    const { token, error } = binder.issueToken(intentHash, actionCategory);
+    if (!token) {
+      throw new Error(`[Agent Shield] Gated execution blocked: ${error}`);
+    }
+    // Verify token
+    const verification = binder.verify(token);
+    if (!verification.valid) {
+      throw new Error(`[Agent Shield] Token verification failed: ${verification.reason}`);
+    }
+    // Execute the actual tool. Own-property lookup keeps Object.prototype
+    // members from being treated as registered tools.
+    const toolFn = Object.prototype.hasOwnProperty.call(tools, toolName) ? tools[toolName] : undefined;
+    if (typeof toolFn !== 'function') {
+      throw new Error(`[Agent Shield] Unknown tool: ${toolName}`);
+    }
+    return toolFn(args);
+  };
+}
 module.exports = {
   IntentBinder,
   IntentToken,
-  PROVENANCE: require('./semantic-isolation').PROVENANCE
+  createGatedExecutor
 };

package/src/intent-graph.js CHANGED Viewed

@@ -171,6 +171,15 @@ class IntentGraph {
       // Word-level similarity
       causalScore = jaccardSimilarity(this.currentIntent.topics, topics);
+      // Issue 11 fix: Even with word overlap, penalize if tool/args contain sensitive keywords
+      // "find passwords in vault" overlaps with "find restaurants" on "find" but is clearly different
+      const sensitiveToolOrArgs = /(?:password|credential|secret|token|key|shadow|passwd|ssh|env|admin|root|sudo|exfiltrat|steal|hack|inject|override|hijack|access.?token|api.?key|bearer|private.?key|certificate|auth|login|session|cookie|oauth)/i.test(argsStr + ' ' + toolName);
+      const intentHasSensitive = /(?:password|credential|secret|token|key|security|auth)/i.test([...this.currentIntent.topics].join(' '));
+      if (sensitiveToolOrArgs && !intentHasSensitive) {
+        // Tool accesses sensitive resources but intent doesn't mention them — reduce score
+        causalScore = Math.min(causalScore, 0.05);
+      }
       // If word overlap is 0, check if the tool category is plausibly related to intent
       // "find restaurants" → data_read is plausible. "find restaurants" → execution is not.
       if (causalScore === 0) {

package/src/main.js CHANGED Viewed

@@ -362,6 +362,12 @@ const { ContinuousSecurityService } = safeRequire('./continuous-security', 'cont
 // v10.0 SOTA — Benchmark Suite
 const { SOTABenchmark, BIPIA_SAMPLES: SOTA_BIPIA_SAMPLES, HACKAPROMPT_SAMPLES: SOTA_HACKAPROMPT_SAMPLES, MCPTOX_SAMPLES: SOTA_MCPTOX_SAMPLES, MULTILINGUAL_SAMPLES: SOTA_MULTILINGUAL_SAMPLES, STEALTH_SAMPLES: SOTA_STEALTH_SAMPLES } = safeRequire('./sota-benchmark', 'sota-benchmark');
+// v13.1 — Real-world benchmark
+const { RealBenchmark } = safeRequire('./real-benchmark', 'real-benchmark');
+// v14.0 — DeepMind Trap Defenses V2
+const { TrapDefenseV2, ContentStructureAnalyzer, SourceReputationTracker, RetrievalTimeScanner, FewShotValidator, SubAgentSpawnGate, SelfReferenceMonitor, InformationAsymmetryDetector, ProvenanceMarker, EscalatingScrutinyEngine, CompositeFragmentAssembler } = safeRequire('./deepmind-defenses', 'deepmind-defenses');
 // v12.0 — Multi-Turn Attack Detection
 const { ConversationTracker } = safeRequire('./cross-turn', 'cross-turn');
@@ -1040,6 +1046,18 @@ const _exports = {
   SOTA_MCPTOX_SAMPLES,
   SOTA_MULTILINGUAL_SAMPLES,
   SOTA_STEALTH_SAMPLES,
+  RealBenchmark,
+  TrapDefenseV2,
+  ContentStructureAnalyzer,
+  SourceReputationTracker,
+  RetrievalTimeScanner,
+  FewShotValidator,
+  SubAgentSpawnGate,
+  SelfReferenceMonitor,
+  InformationAsymmetryDetector,
+  ProvenanceMarker,
+  EscalatingScrutinyEngine,
+  CompositeFragmentAssembler,
   // v12.0 — Multi-Turn Attack Detection
   ConversationTracker,

package/src/mcp-guard.js CHANGED Viewed

@@ -550,7 +550,33 @@ class ToolBehaviorBaseline {
  * attestation, scanning, isolation, auth, rate limiting, and behavioral
  * baselines.
  */
+/**
+ * Named MCPGuard configurations (Issue 15: 17 individual flags are unusable).
+ * Each tier switches on everything the previous tier does plus a few more
+ * options, so the tiers are built by extending one another.
+ */
+const GUARD_PRESETS = (() => {
+  // Minimal — pattern scanning only, no ML, no auth. Good for development.
+  const minimal = {};
+  // Standard — pattern scanning + micro-model. Good for staging.
+  const standard = { enableMicroModel: true };
+  // Recommended — all detection layers active. Good for production.
+  const recommended = { ...standard, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true };
+  // Strict — everything on, auth required. Good for enterprise.
+  const strict = { ...recommended, enableIntentGraph: true, enableIntentBinding: true, enableIsolation: true, requireAuth: true };
+  // Paranoid — maximum security. May have false positives.
+  const paranoid = { ...strict, rateLimit: 30, cbThreshold: 3 };
+  return { minimal, standard, recommended, strict, paranoid };
+})();
 class MCPGuard {
+  /**
+   * Create MCPGuard from a preset instead of configuring 17 flags.
+   * The preset's options are copied and `overrides` is applied on top, so an
+   * override wins over the preset value for the same key.
+   * @param {string} preset - 'minimal', 'standard', 'recommended', 'strict', 'paranoid'.
+   * @param {object} [overrides] - Override specific preset values.
+   * @returns {MCPGuard}
+   * @throws {Error} If `preset` is not a name defined in GUARD_PRESETS.
+   */
+  static fromPreset(preset, overrides = {}) {
+    // Own-property check: a plain GUARD_PRESETS[preset] lookup also resolves
+    // inherited names such as 'constructor' or 'toString', which would build a
+    // guard with no preset applied instead of reporting the bad name.
+    if (!Object.prototype.hasOwnProperty.call(GUARD_PRESETS, preset)) {
+      throw new Error(`[Agent Shield] Unknown preset: ${preset}. Use: ${Object.keys(GUARD_PRESETS).join(', ')}`);
+    }
+    return new MCPGuard({ ...GUARD_PRESETS[preset], ...overrides });
+  }
   /**
    * @param {object} [options]
    * @param {boolean} [options.requireAuth=false] - Require OAuth tokens.
@@ -1117,7 +1143,34 @@ class MCPGuard {
     this._log('tool_call', serverId, { toolName, allowed: threats.length === 0, threatCount: threats.length });
-    return { allowed: threats.length === 0, threats, anomalies };
+    // Issue 16 fix: Fusion layer — if micro-model says benign but pattern scanner
+    // says threat (or vice versa), use weighted vote instead of OR
+    let allowed = threats.length === 0;
+    if (this.microModel && threats.length > 0) {
+      // Check if ALL threats are from a single low-confidence layer
+      const patternOnlyThreats = threats.filter(t => t.type !== 'micro_model_input' && t.type !== 'owasp_agentic');
+      const modelOnlyThreats = threats.filter(t => t.type === 'micro_model_input');
+      // If only the micro-model flagged it (no pattern match), check confidence
+      if (patternOnlyThreats.length === 0 && modelOnlyThreats.length > 0) {
+        const confidence = modelOnlyThreats[0].confidence || 0;
+        if (confidence < 0.4) {
+          // Low-confidence model-only detection — demote to anomaly instead of blocking
+          allowed = true;
+          anomalies.push({
+            type: 'low_confidence_model_flag',
+            severity: 'medium',
+            description: `Micro-model flagged with low confidence (${(confidence * 100).toFixed(0)}%). Not blocking.`
+          });
+          // Remove model threats from threat list
+          for (let i = threats.length - 1; i >= 0; i--) {
+            if (threats[i].type === 'micro_model_input') threats.splice(i, 1);
+          }
+        }
+      }
+    }
+    return { allowed, threats, anomalies };
   }
   /**

package/src/memory-guard.js CHANGED Viewed

@@ -95,6 +95,80 @@ class MemoryIntegrityMonitor {
     return { recorded: true, suspicious, writeIndex: this._writes.length - 1 };
   }
+  /**
+   * Gate a memory write (Issue 24 fix): scan the content first and refuse the
+   * write when the scan reports any threat. recordWrite only logs; this method
+   * returns a verdict the caller can act on, and records the write itself only
+   * when the content scanned clean.
+   *
+   * Empty or non-string content is allowed without being scanned or recorded.
+   * NOTE(review): that also lets non-string payloads through unscanned —
+   * confirm callers always pass strings.
+   *
+   * @param {string} content - Content to write.
+   * @param {string} source - Source of the write (scan source defaults to 'memory_write').
+   * @returns {{ allowed: boolean, reason: string|null, threats: Array }}
+   */
+  guardWrite(content, source) {
+    const scannable = Boolean(content) && typeof content === 'string';
+    const detected = scannable
+      ? (_scanText(content, { source: source || 'memory_write' }).threats || [])
+      : [];
+    if (detected.length === 0) {
+      // Only content that was actually scanned gets recorded as a clean write.
+      if (scannable) this.recordWrite(content, source);
+      return { allowed: true, reason: null, threats: [] };
+    }
+    console.log(`[Agent Shield] Memory write BLOCKED from "${source}": ${detected.length} threat(s)`);
+    const [firstThreat] = detected;
+    return {
+      allowed: false,
+      reason: `Blocked: ${firstThreat.description || 'threat detected'}`,
+      threats: detected
+    };
+  }
+  /**
+   * Snapshot this monitor's state for cross-session drift tracking (Trap 3
+   * deepening). Persist the snapshot when a session ends and hand it to
+   * detectCrossSessionDrift() in a later session.
+   * @returns {{ stateHash: string, writeCount: number, suspiciousCount: number, topHashes: Array, timestamp: number }}
+   *   `topHashes` holds the hashes of the 20 most recent writes; `timestamp` is Date.now() at export time.
+   */
+  exportSessionState() {
+    const stateHash = this._computeStateHash();
+    const writes = this._writes;
+    let suspiciousCount = 0;
+    for (const entry of writes) {
+      if (entry.suspicious) suspiciousCount += 1;
+    }
+    const topHashes = writes.slice(-20).map(entry => entry.hash);
+    return {
+      stateHash,
+      writeCount: writes.length,
+      suspiciousCount,
+      topHashes,
+      timestamp: Date.now()
+    };
+  }
+  /**
+   * Compare this monitor's current state against a snapshot produced by
+   * exportSessionState() in an earlier session.
+   *
+   * Drift means the current state hash differs from the snapshot's. The score
+   * is 0 without drift; otherwise 0.3 per suspicious write plus 0.2 when more
+   * than 10 writes are held, capped at 1 and rounded to two decimals.
+   * NOTE(review): the write counts cover every write this monitor currently
+   * holds — confirm the monitor is created fresh per session if they are meant
+   * to be "since last session".
+   *
+   * @param {object} previousSession - Output from exportSessionState() of a prior session.
+   * @returns {{ drifted: boolean, driftScore: number, newWritesSinceLast: number, suspiciousNewWrites: number, overlapWithPrevious: number, details: string }}
+   *   When no usable snapshot is given, only drifted/driftScore/newWritesSinceLast/details are returned.
+   */
+  detectCrossSessionDrift(previousSession) {
+    const hasSnapshot = Boolean(previousSession) && Boolean(previousSession.stateHash);
+    if (!hasSnapshot) {
+      return { drifted: false, driftScore: 0, newWritesSinceLast: 0, details: 'No previous session to compare.' };
+    }
+    const drifted = this._computeStateHash() !== previousSession.stateHash;
+    const writes = this._writes;
+    const totalWrites = writes.length;
+    const suspiciousTotal = writes.reduce((count, entry) => (entry.suspicious ? count + 1 : count), 0);
+    // How many current writes share a hash with the snapshot's recent writes.
+    const knownHashes = new Set(previousSession.topHashes || []);
+    const overlapWithPrevious = writes.reduce((count, entry) => (knownHashes.has(entry.hash) ? count + 1 : count), 0);
+    let rawScore = 0;
+    if (drifted) {
+      const volumeBonus = totalWrites > 10 ? 0.2 : 0;
+      rawScore = Math.min(1, (suspiciousTotal * 0.3) + volumeBonus);
+    }
+    let details = 'Memory state unchanged from previous session.';
+    if (drifted) {
+      details = `Memory state changed. ${suspiciousTotal} suspicious writes out of ${totalWrites} total.`;
+    }
+    return {
+      drifted,
+      driftScore: Math.round(rawScore * 100) / 100,
+      newWritesSinceLast: totalWrites,
+      suspiciousNewWrites: suspiciousTotal,
+      overlapWithPrevious,
+      details
+    };
+  }
   /**
    * Get the full timeline of memory writes.
    * @returns {Array<{content: string, source: string, timestamp: number, hash: string, suspicious: boolean}>}