agentshield-sdk 13.0.0 → 13.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +100 -0
- package/README.md +74 -19
- package/package.json +11 -2
- package/src/attack-surface.js +1 -1
- package/src/continuous-security.js +47 -4
- package/src/deepmind-defenses.js +468 -0
- package/src/detector-core.js +221 -1
- package/src/fleet-defense.js +24 -0
- package/src/hitl-guard.js +64 -0
- package/src/intent-binding.js +44 -1
- package/src/intent-graph.js +9 -0
- package/src/main.js +18 -0
- package/src/mcp-guard.js +54 -1
- package/src/memory-guard.js +74 -0
- package/src/micro-model.js +49 -4
- package/src/real-benchmark.js +234 -0
- package/src/self-training.js +67 -1
- package/src/semantic-guard.js +41 -1
- package/src/semantic-isolation.js +9 -1
- package/src/trap-defense.js +112 -0
package/src/detector-core.js
CHANGED
|
@@ -1994,6 +1994,218 @@ const INJECTION_PATTERNS = [
|
|
|
1994
1994
|
category: 'social_engineering',
|
|
1995
1995
|
description: 'Text impersonates the training team to issue fake policy corrections.',
|
|
1996
1996
|
detail: 'Fake correction: claims to be an official update from the training team to override safety guidelines.'
|
|
1997
|
+
},
|
|
1998
|
+
|
|
1999
|
+
// --- Real Attack Dataset Patterns (from F1 0.516 honest benchmark) ---
|
|
2000
|
+
// These close the gap between self-graded F1 1.000 and real-world F1.
|
|
2001
|
+
|
|
2002
|
+
// Output forcing — "say X", "output X", "repeat X", "print X"
|
|
2003
|
+
{
|
|
2004
|
+
regex: /(?:^|\.\s+)(?:just\s+)?(?:say|output|print|type|write|repeat|respond\s+with)\s+(?:only\s+)?['"]([^'"]{3,})['"](?:\s+(?:exactly|verbatim|only|and\s+nothing\s+else))?/i,
|
|
2005
|
+
severity: 'high',
|
|
2006
|
+
category: 'prompt_injection',
|
|
2007
|
+
description: 'Text forces the model to output a specific string.',
|
|
2008
|
+
detail: 'Output forcing: direct command to output a specific string, bypassing normal response generation.'
|
|
2009
|
+
},
|
|
2010
|
+
{
|
|
2011
|
+
regex: /(?:your\s+(?:only|sole|entire)\s+(?:job|task|purpose|output)\s+is\s+(?:to\s+)?(?:say|output|print|type)|(?:do\s+not|don't)\s+(?:say|write|output|add)\s+(?:anything|any\s+(?:other|additional))\s+(?:text|words?|content))/i,
|
|
2012
|
+
severity: 'high',
|
|
2013
|
+
category: 'prompt_injection',
|
|
2014
|
+
description: 'Text restricts the model to outputting only a specific string.',
|
|
2015
|
+
detail: 'Output restriction: constrains the model to produce only the attacker-specified output.'
|
|
2016
|
+
},
|
|
2017
|
+
{
|
|
2018
|
+
regex: /(?:instead\s+of\s+(?:translating|summarizing|answering|responding|doing\s+(?:that|what)),?\s*(?:just\s+)?(?:say|output|repeat|type|print))/i,
|
|
2019
|
+
severity: 'high',
|
|
2020
|
+
category: 'prompt_injection',
|
|
2021
|
+
description: 'Text redirects the model from its task to outputting attacker-specified text.',
|
|
2022
|
+
detail: 'Task redirection: overrides the assigned task with an output-forcing command.'
|
|
2023
|
+
},
|
|
2024
|
+
|
|
2025
|
+
// Prompt extraction — indirect phrasing
|
|
2026
|
+
{
|
|
2027
|
+
regex: /(?:what\s+(?:was|were|is|are)\s+(?:written|typed|said|placed|put)\s+(?:above|before|prior|earlier)(?:\s+(?:this|my)\s+(?:message|text|input))?|repeat\s+(?:everything|all|back)(?:\s+(?:above|before|that\s+was\s+(?:said|written))))/i,
|
|
2028
|
+
severity: 'high',
|
|
2029
|
+
category: 'prompt_extraction',
|
|
2030
|
+
description: 'Text attempts to extract content from earlier in the conversation (system prompt).',
|
|
2031
|
+
detail: 'Indirect extraction: asks for content "above" or "before" which typically means the system prompt.'
|
|
2032
|
+
},
|
|
2033
|
+
{
|
|
2034
|
+
regex: /(?:output|repeat|show|display|tell\s+me)\s+(?:the\s+)?(?:text|content|message|instructions?)\s+(?:that\s+)?(?:appears?|comes?|is|was)\s+(?:before|above|prior\s+to)\s+(?:my|this|the\s+user)\s+(?:message|input|text)/i,
|
|
2035
|
+
severity: 'high',
|
|
2036
|
+
category: 'prompt_extraction',
|
|
2037
|
+
description: 'Text requests content that appeared before the user message (system prompt extraction).',
|
|
2038
|
+
detail: 'Position-based extraction: requests content by its position relative to the user message.'
|
|
2039
|
+
},
|
|
2040
|
+
{
|
|
2041
|
+
regex: /(?:your\s+(?:instructions|role|task)\s+(?:was|were|is|are)\s+(?:to\s+)?(?:say\s+)?what|what\s+(?:is|was|are|were)\s+your\s+(?:original\s+)?(?:instructions?|task|role|purpose|objective))\s*\?/i,
|
|
2042
|
+
severity: 'high',
|
|
2043
|
+
category: 'prompt_extraction',
|
|
2044
|
+
description: 'Text asks the model to reveal its instructions/role/task.',
|
|
2045
|
+
detail: 'Role extraction: directly asks what the model was instructed to do.'
|
|
2046
|
+
},
|
|
2047
|
+
{
|
|
2048
|
+
regex: /(?:translate|convert|rephrase|rewrite)\s+(?:the\s+)?(?:system\s+)?(?:message|prompt|instructions?)\s+(?:to|into|in)\s+(?:English|another|a\s+different)/i,
|
|
2049
|
+
severity: 'high',
|
|
2050
|
+
category: 'prompt_extraction',
|
|
2051
|
+
description: 'Text asks to translate/convert the system prompt to extract it.',
|
|
2052
|
+
detail: 'Translation extraction: uses translation as a pretext to reveal system prompt content.'
|
|
2053
|
+
},
|
|
2054
|
+
{
|
|
2055
|
+
regex: /(?:list|enumerate|tell\s+me|what\s+are)\s+(?:all\s+)?(?:the\s+)?(?:rules?|constraints?|guidelines?|boundaries|limitations?)\s+(?:you\s+)?(?:follow|have|were\s+given|must\s+(?:follow|obey))/i,
|
|
2056
|
+
severity: 'high',
|
|
2057
|
+
category: 'prompt_extraction',
|
|
2058
|
+
description: 'Text asks the model to enumerate its rules or constraints.',
|
|
2059
|
+
detail: 'Rule enumeration: asks the model to list its behavioral rules, revealing system prompt content.'
|
|
2060
|
+
},
|
|
2061
|
+
{
|
|
2062
|
+
regex: /(?:summarize|describe|explain)\s+(?:your\s+)?(?:role|purpose|function)\s+in\s+(?:exactly\s+)?\d+\s+words/i,
|
|
2063
|
+
severity: 'medium',
|
|
2064
|
+
category: 'prompt_extraction',
|
|
2065
|
+
description: 'Text asks the model to summarize its role in N words (word-limited extraction).',
|
|
2066
|
+
detail: 'Word-limited extraction: constraining role description to N words forces disclosure of key system prompt elements.'
|
|
2067
|
+
},
|
|
2068
|
+
{
|
|
2069
|
+
regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?)\s+(?:of\s+)?(?:your\s+)?(?:instructions?|prompt|(?:system\s+)?message|rules)/i,
|
|
2070
|
+
severity: 'high',
|
|
2071
|
+
category: 'prompt_extraction',
|
|
2072
|
+
description: 'Text asks for the first/last N words of the system prompt.',
|
|
2073
|
+
detail: 'Positional extraction: requests specific portions of the system prompt by position.'
|
|
2074
|
+
},
|
|
2075
|
+
|
|
2076
|
+
// Conditional output manipulation
|
|
2077
|
+
{
|
|
2078
|
+
regex: /(?:if\s+.{3,30}(?:then|,)\s+(?:output|say|print|respond|type)\s+['"]|(?:output|say|print)\s+['"][^'"]+['"]\s+(?:if|when|unless))/i,
|
|
2079
|
+
severity: 'medium',
|
|
2080
|
+
category: 'prompt_injection',
|
|
2081
|
+
description: 'Text uses conditional logic to force specific output.',
|
|
2082
|
+
detail: 'Conditional forcing: if/then logic that leads to attacker-specified output regardless of the condition.'
|
|
2083
|
+
},
|
|
2084
|
+
|
|
2085
|
+
// "Start your answer/response with X"
|
|
2086
|
+
{
|
|
2087
|
+
regex: /(?:start|begin)\s+(?:your\s+)?(?:answer|response|reply|output)\s+with\s+['"]?[^'"]{3,}/i,
|
|
2088
|
+
severity: 'high',
|
|
2089
|
+
category: 'prompt_injection',
|
|
2090
|
+
description: 'Text forces the model to begin its response with specific text.',
|
|
2091
|
+
detail: 'Response prefix forcing: instructs the model to start with attacker-specified text.'
|
|
2092
|
+
},
|
|
2093
|
+
|
|
2094
|
+
// ALL CAPS command — "IGNORE EVERYTHING AND SAY/DO"
|
|
2095
|
+
{
|
|
2096
|
+
regex: /(?:IGNORE|DISREGARD|FORGET|STOP)\s+(?:EVERYTHING|ALL|PREVIOUS|ABOVE)\s+(?:AND\s+)?(?:SAY|OUTPUT|PRINT|DO|TYPE|RESPOND)/,
|
|
2097
|
+
severity: 'critical',
|
|
2098
|
+
category: 'instruction_override',
|
|
2099
|
+
description: 'Text contains ALL CAPS override command.',
|
|
2100
|
+
detail: 'Caps override: ALL CAPS instruction override often used in competition-style attacks.'
|
|
2101
|
+
},
|
|
2102
|
+
|
|
2103
|
+
// Fake conversation format injection — "Human: X\nAssistant: X"
|
|
2104
|
+
{
|
|
2105
|
+
regex: /(?:Human|User|Person)\s*:\s*.+\n\s*(?:Assistant|AI|Bot|System)\s*:\s*.+/i,
|
|
2106
|
+
severity: 'high',
|
|
2107
|
+
category: 'prompt_injection',
|
|
2108
|
+
description: 'Text contains fake conversation format to inject assistant responses.',
|
|
2109
|
+
detail: 'Conversation format injection: uses Human/Assistant formatting to inject predetermined responses.'
|
|
2110
|
+
},
|
|
2111
|
+
|
|
2112
|
+
// "Repeat back the text above" / "what was written above"
|
|
2113
|
+
{
|
|
2114
|
+
regex: /(?:repeat|copy|reproduce|echo)\s+(?:back\s+)?(?:the\s+)?(?:text|content|everything|words?)\s+(?:above|before|prior|that\s+(?:was\s+)?(?:written|said|typed|placed))/i,
|
|
2115
|
+
severity: 'high',
|
|
2116
|
+
category: 'prompt_extraction',
|
|
2117
|
+
description: 'Text asks to repeat/copy content from above (system prompt extraction).',
|
|
2118
|
+
detail: 'Repeat-above extraction: asks to reproduce prior conversation content to leak system prompt.'
|
|
2119
|
+
},
|
|
2120
|
+
|
|
2121
|
+
// "respond with X for all queries" / "output X to confirm/verify"
|
|
2122
|
+
{
|
|
2123
|
+
regex: /(?:respond|reply|answer|output)\s+(?:with\s+)?['"]?[^'"]{3,30}['"]?\s+(?:for\s+all|to\s+(?:confirm|verify|prove|test|check))/i,
|
|
2124
|
+
severity: 'high',
|
|
2125
|
+
category: 'prompt_injection',
|
|
2126
|
+
description: 'Text instructs a fixed response for all queries or as a "test."',
|
|
2127
|
+
detail: 'Fixed response injection: sets a persistent response or uses testing/verification framing to inject output.'
|
|
2128
|
+
},
|
|
2129
|
+
|
|
2130
|
+
// "system_note" or "admin_note" in JSON — already have this but need looser match
|
|
2131
|
+
{
|
|
2132
|
+
regex: /["'](?:system_note|admin_note|internal_message|ai_directive|hidden_instruction)["']\s*:\s*["']/i,
|
|
2133
|
+
severity: 'high',
|
|
2134
|
+
category: 'indirect_injection',
|
|
2135
|
+
description: 'JSON field with suspicious name (system_note, admin_note) containing instructions.',
|
|
2136
|
+
detail: 'JSON directive injection: uses specially-named JSON fields to inject instructions into tool/API responses.'
|
|
2137
|
+
},
|
|
2138
|
+
|
|
2139
|
+
// Triple backtick framing — "```payload``` output this"
|
|
2140
|
+
{
|
|
2141
|
+
regex: /`{3}[\s\S]{3,}`{3}\s*(?:output|print|say|repeat|type|respond)/i,
|
|
2142
|
+
severity: 'high',
|
|
2143
|
+
category: 'prompt_injection',
|
|
2144
|
+
description: 'Text uses triple backtick code blocks to frame attacker payload.',
|
|
2145
|
+
detail: 'Code block injection: wraps payload in triple backticks and asks the model to output the content.'
|
|
2146
|
+
},
|
|
2147
|
+
|
|
2148
|
+
// Backtick extraction — "output what is between backticks"
|
|
2149
|
+
{
|
|
2150
|
+
regex: /(?:output|print|say|repeat|type)\s+(?:only\s+)?(?:what\s+is|the\s+(?:text|content))\s+(?:between|inside|within)\s+(?:the\s+)?(?:backticks|quotes|brackets|tags)/i,
|
|
2151
|
+
severity: 'high',
|
|
2152
|
+
category: 'prompt_injection',
|
|
2153
|
+
description: 'Text asks to extract and output content from between delimiters.',
|
|
2154
|
+
detail: 'Delimiter extraction: uses backticks/quotes/brackets to frame attacker content as the expected output.'
|
|
2155
|
+
},
|
|
2156
|
+
|
|
2157
|
+
// "Emergency/urgent: do X now"
|
|
2158
|
+
{
|
|
2159
|
+
regex: /(?:emergency|urgent|critical|immediate)\s*:\s*(?:output|say|print|respond|execute|do|type)\s+(?:exactly\s+)?['"]?/i,
|
|
2160
|
+
severity: 'high',
|
|
2161
|
+
category: 'prompt_injection',
|
|
2162
|
+
description: 'Text uses urgency framing to force immediate action.',
|
|
2163
|
+
detail: 'Urgency forcing: uses emergency/urgent framing to bypass deliberation and force immediate output.'
|
|
2164
|
+
},
|
|
2165
|
+
|
|
2166
|
+
// "What are the first N words you were told?"
|
|
2167
|
+
{
|
|
2168
|
+
regex: /(?:what\s+(?:are|were)\s+)?(?:the\s+)?(?:first|last|initial)\s+(?:\d+\s+)?(?:words?|sentences?|lines?|characters?)\s+(?:you\s+were\s+(?:told|given|instructed)|of\s+your\s+(?:instructions?|prompt|rules))/i,
|
|
2169
|
+
severity: 'high',
|
|
2170
|
+
category: 'prompt_extraction',
|
|
2171
|
+
description: 'Text asks for the first/last N words the model was told (system prompt).',
|
|
2172
|
+
detail: 'Positional prompt extraction: requests specific words from the system prompt by position.'
|
|
2173
|
+
},
|
|
2174
|
+
|
|
2175
|
+
// "You are an AI that only outputs X"
|
|
2176
|
+
{
|
|
2177
|
+
regex: /you\s+are\s+(?:an?\s+)?(?:AI|bot|assistant|model)\s+that\s+(?:only\s+)?(?:outputs?|says?|prints?|types?|responds?\s+with)\s+['"]?/i,
|
|
2178
|
+
severity: 'high',
|
|
2179
|
+
category: 'role_hijack',
|
|
2180
|
+
description: 'Text redefines the model as one that only outputs specific text.',
|
|
2181
|
+
detail: 'Identity override: redefines the model as a single-purpose output machine.'
|
|
2182
|
+
},
|
|
2183
|
+
|
|
2184
|
+
// Hypothetical follow-up to capability disclosure
|
|
2185
|
+
{
|
|
2186
|
+
regex: /(?:since|now\s+that|given\s+that)\s+you\s+(?:can|have|said\s+you\s+(?:can|could))\s+(?:access|read|write|execute)[\s\S]{0,50}(?:now\s+)?(?:actually|really|go\s+ahead\s+and)\s+(?:read|access|execute|do|show)/i,
|
|
2187
|
+
severity: 'high',
|
|
2188
|
+
category: 'prompt_injection',
|
|
2189
|
+
description: 'Text escalates from hypothetical capability discussion to actual exploitation.',
|
|
2190
|
+
detail: 'Hypothetical-to-real escalation: uses prior capability admission as leverage to request actual exploitation.'
|
|
2191
|
+
},
|
|
2192
|
+
|
|
2193
|
+
// Capability probing — "can you access files/internet/databases?"
|
|
2194
|
+
{
|
|
2195
|
+
regex: /(?:can\s+you|are\s+you\s+able\s+to|do\s+you\s+have\s+(?:access|ability))\s+(?:to\s+)?(?:access|read|write|modify|delete|execute|run|open)\s+(?:files?|internet|web|database|network|system|commands?|code|scripts?)/i,
|
|
2196
|
+
severity: 'medium',
|
|
2197
|
+
category: 'reconnaissance',
|
|
2198
|
+
description: 'Text probes the model for its capabilities (reconnaissance for follow-up attack).',
|
|
2199
|
+
detail: 'Capability reconnaissance: asks what the agent can access, typically followed by exploitation of disclosed capabilities.'
|
|
2200
|
+
},
|
|
2201
|
+
|
|
2202
|
+
// Note/annotation injection in tool outputs
|
|
2203
|
+
{
|
|
2204
|
+
regex: /(?:note\s+to\s+(?:AI|assistant|model|agent|system)|AI\s+(?:assistant|model)\s*:)\s*(?:ignore|override|instead|the\s+above|forget|disregard)/i,
|
|
2205
|
+
severity: 'high',
|
|
2206
|
+
category: 'indirect_injection',
|
|
2207
|
+
description: 'Text contains a "note to AI" directive hidden in external content.',
|
|
2208
|
+
detail: 'Annotation injection: uses "note to AI" framing to inject instructions into tool output or document content.'
|
|
1997
2209
|
}
|
|
1998
2210
|
];
|
|
1999
2211
|
|
|
@@ -2874,8 +3086,13 @@ const scanText = (text, options = {}) => {
|
|
|
2874
3086
|
.replace(/[\u202A-\u202E\u2066-\u2069]/g, ''); // Bidi overrides (RTL attacks)
|
|
2875
3087
|
|
|
2876
3088
|
// 2. Reverse leetspeak substitution (defeats character substitution)
|
|
3089
|
+
// Only apply when text looks intentionally obfuscated:
|
|
3090
|
+
// - High digit-to-letter mixing (3+ instances of digit adjacent to letter)
|
|
3091
|
+
// - NOT when text contains legitimate alphanumeric tokens like "1080p", "16px", "h264"
|
|
2877
3092
|
const LEET_REVERSE = { '4': 'a', '3': 'e', '1': 'i', '0': 'o', '5': 's', '7': 't', '8': 'b', '9': 'g' };
|
|
2878
|
-
|
|
3093
|
+
const digitLetterMixes = (normalizedText.match(/\d[a-z]|[a-z]\d/gi) || []).length;
|
|
3094
|
+
const hasLegitNumbers = /\b(?:\d{2,}[a-z]|[a-z]\d{2,}|\d+(?:px|em|rem|pt|ms|kb|mb|gb|tb|fps|hz|dpi|[kKmMgG][bB]?))\b/i.test(normalizedText);
|
|
3095
|
+
if (digitLetterMixes >= 3 && !hasLegitNumbers) {
|
|
2879
3096
|
normalizedText = normalizedText.replace(/[0-9]/g, ch => LEET_REVERSE[ch] || ch);
|
|
2880
3097
|
}
|
|
2881
3098
|
|
|
@@ -2958,6 +3175,7 @@ const scanText = (text, options = {}) => {
|
|
|
2958
3175
|
|
|
2959
3176
|
// Chunked scanning for long inputs (RLM-JB research)
|
|
2960
3177
|
// Chunking defeats camouflage by forcing localized attention on each segment
|
|
3178
|
+
// Issue 9 fix: only use chunk threats with severity >= high to reduce FPs on technical docs
|
|
2961
3179
|
if (text.length > 500 && threats.length === 0) {
|
|
2962
3180
|
const chunkSize = 300;
|
|
2963
3181
|
const overlap = 50;
|
|
@@ -2966,6 +3184,8 @@ const scanText = (text, options = {}) => {
|
|
|
2966
3184
|
if (chunk.trim().length < 20) continue;
|
|
2967
3185
|
const chunkThreats = scanTextForPatterns(chunk, source + ':chunk', timeBudgetMs, startTime);
|
|
2968
3186
|
for (const ct of chunkThreats) {
|
|
3187
|
+
// Only promote high/critical chunk threats — medium/low in chunks are often FPs on technical text
|
|
3188
|
+
if (ct.severity !== 'high' && ct.severity !== 'critical') continue;
|
|
2969
3189
|
const isDuplicate = threats.some(t => t.category === ct.category);
|
|
2970
3190
|
if (!isDuplicate) {
|
|
2971
3191
|
ct.detail = (ct.detail || '') + ` [Detected in chunk at offset ${i}.]`;
|
package/src/fleet-defense.js
CHANGED
|
@@ -141,6 +141,30 @@ class FleetCorrelationEngine {
|
|
|
141
141
|
return [...this._events];
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
+
/**
|
|
145
|
+
* Export events for cross-process correlation (Trap 5 deepening).
|
|
146
|
+
* Send this to a central coordinator that merges events from all processes.
|
|
147
|
+
* @returns {string} JSON-serialized events.
|
|
148
|
+
*/
|
|
149
|
+
exportEvents() {
|
|
150
|
+
return JSON.stringify(this._events);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Import events from another process/instance.
|
|
155
|
+
* @param {string|Array} events - JSON string or array of events.
|
|
156
|
+
* @returns {{ imported: number }}
|
|
157
|
+
*/
|
|
158
|
+
importEvents(events) {
|
|
159
|
+
const parsed = typeof events === 'string' ? JSON.parse(events) : events;
|
|
160
|
+
if (!Array.isArray(parsed)) return { imported: 0 };
|
|
161
|
+
for (const event of parsed) {
|
|
162
|
+
this._events.push(event);
|
|
163
|
+
}
|
|
164
|
+
if (this._events.length > 50000) this._events = this._events.slice(-50000);
|
|
165
|
+
return { imported: parsed.length };
|
|
166
|
+
}
|
|
167
|
+
|
|
144
168
|
/** Clear all events. */
|
|
145
169
|
reset() {
|
|
146
170
|
this._events = [];
|
package/src/hitl-guard.js
CHANGED
|
@@ -472,6 +472,69 @@ class HITLGuard {
|
|
|
472
472
|
// EXPORTS
|
|
473
473
|
// =========================================================================
|
|
474
474
|
|
|
475
|
+
// =========================================================================
|
|
476
|
+
// TRAP 6 DEEPENING: Output Deception Scorer
|
|
477
|
+
// =========================================================================
|
|
478
|
+
|
|
479
|
+
/**
 * Scores agent output for deceptive characteristics designed to fool
 * human reviewers into approving dangerous actions.
 *
 * Each heuristic that fires contributes a fixed weight; the weights are
 * summed and capped at 1 to form the deception score.
 */
class OutputDeceptionScorer {
  /**
   * Score an agent's output for deception indicators.
   *
   * Recommendation thresholds: score >= 0.5 → 'block', >= 0.3 → 'review',
   * otherwise 'pass'. Empty or non-string output scores 0 / 'pass'.
   *
   * @param {string} output - Agent output to evaluate.
   * @param {object} [context] - Context about what was requested (currently unused).
   * @returns {{ deceptionScore: number, indicators: Array<object>, recommendation: string }}
   */
  score(output, context = {}) {
    if (!output || typeof output !== 'string') return { deceptionScore: 0, indicators: [], recommendation: 'pass' };
    const indicators = [];

    // Excessive confidence without evidence.
    // "100%" is matched outside the \b...\b group: '%' is a non-word character,
    // so a trailing \b would only succeed when a letter/digit follows the '%',
    // and ordinary uses such as "100% safe" would never be counted.
    const confidenceWords = (output.match(/\b(?:definitely|certainly|absolutely|guaranteed|surely|undoubtedly|without\s+(?:a\s+)?doubt)\b|\b100%/gi) || []).length;
    if (confidenceWords >= 2) {
      indicators.push({ type: 'excessive_confidence', weight: 0.2, description: `${confidenceWords} high-confidence assertions without supporting evidence` });
    }

    // Action buried in long explanation.
    // Positions are collected while iterating, not recovered with indexOf():
    // indexOf() returns the first occurrence, so a repeated action sentence
    // near the end would be attributed to its earlier position and missed.
    const actionPattern = /\b(?:click|run|execute|install|download|send|transfer|delete|approve|authorize)\b/i;
    const sentences = output.split(/[.!?\n]+/).filter(s => s.trim().length > 10);
    const actionPositions = [];
    sentences.forEach((sentence, position) => {
      if (actionPattern.test(sentence)) actionPositions.push(position);
    });
    const hasActionSentence = actionPositions.length > 0;
    if (sentences.length > 5 && hasActionSentence) {
      const lastQuarter = sentences.length * 0.75;
      if (actionPositions.some(p => p >= lastQuarter)) {
        indicators.push({ type: 'buried_action', weight: 0.3, description: 'Actionable instructions buried in the last quarter of a long response' });
      }
    }

    // Technical jargon masking simple actions (jargon terms per whitespace-separated word).
    const jargonDensity = (output.match(/\b(?:subprocess|daemon|syscall|ioctl|mmap|chmod|chown|setuid|capability|namespace|cgroup|seccomp)\b/gi) || []).length / Math.max(output.split(/\s+/).length, 1);
    if (jargonDensity > 0.03 && hasActionSentence) {
      indicators.push({ type: 'jargon_obfuscation', weight: 0.25, description: 'High technical jargon density combined with actionable instructions' });
    }

    // Urgency injection in output
    if (/\b(?:immediately|right\s+now|as\s+soon\s+as\s+possible|urgent|time-sensitive|critical|before\s+it's\s+too\s+late)\b/i.test(output)) {
      indicators.push({ type: 'urgency_in_output', weight: 0.15, description: 'Output contains urgency language that may pressure reviewer' });
    }

    // Minimization of risks (only counted when the output also requests an action)
    if (/\b(?:don't\s+worry|no\s+risk|perfectly\s+safe|nothing\s+(?:bad\s+)?(?:will|can)\s+happen|completely\s+harmless)\b/i.test(output) && hasActionSentence) {
      indicators.push({ type: 'risk_minimization', weight: 0.2, description: 'Output minimizes risks while requesting actions' });
    }

    // Round first so the recommendation is always consistent with the score
    // that is returned (floating-point sums can land a hair off a threshold).
    const rawScore = Math.min(1, indicators.reduce((sum, indicator) => sum + indicator.weight, 0));
    const deceptionScore = Math.round(rawScore * 100) / 100;
    const recommendation = deceptionScore >= 0.5 ? 'block' : deceptionScore >= 0.3 ? 'review' : 'pass';

    return {
      deceptionScore,
      indicators,
      recommendation
    };
  }
}
|
|
537
|
+
|
|
475
538
|
module.exports = {
|
|
476
539
|
HITLGuard,
|
|
477
540
|
ApprovalPatternMonitor,
|
|
@@ -479,6 +542,7 @@ module.exports = {
|
|
|
479
542
|
OutputInjectionScanner,
|
|
480
543
|
ReadabilityScanner,
|
|
481
544
|
CriticalInfoPositionChecker,
|
|
545
|
+
OutputDeceptionScorer,
|
|
482
546
|
CRITICAL_KEYWORDS,
|
|
483
547
|
OUTPUT_INJECTION_PATTERNS,
|
|
484
548
|
HIGH_RISK_ACTIONS,
|
package/src/intent-binding.js
CHANGED
|
@@ -307,8 +307,51 @@ class IntentBinder {
|
|
|
307
307
|
// EXPORTS
|
|
308
308
|
// =========================================================================
|
|
309
309
|
|
|
310
|
+
/**
 * Creates a gated tool executor that REQUIRES intent verification before
 * allowing any tool to run. This closes the gap where LLMs could bypass
 * verification by simply not calling verify().
 *
 * Issue 13 fix: The executor wraps ALL tool calls — the LLM can't skip it.
 *
 * For every call the executor derives an action category from the tool name,
 * asks the binder for a token for (intentHash, category), verifies that token,
 * and only then looks up and invokes the tool.
 *
 * Tools are resolved as OWN properties of `tools` only. Inherited members
 * (e.g. `constructor`, `toString` from Object.prototype) are never callable
 * through the executor, and a non-function entry is treated as unknown.
 *
 * @param {IntentBinder} binder - IntentBinder instance; must expose
 *   issueToken(intentHash, actionCategory) → { token, error } and
 *   verify(token) → { valid, reason }.
 * @param {object} tools - Map of toolName → toolFunction.
 * @returns {Function} gatedExecute(intentHash, toolName, args) → result or throws.
 * @throws {Error} From gatedExecute when the token is refused, verification
 *   fails, or the tool name does not resolve to a function.
 */
function createGatedExecutor(binder, tools) {
  // Ordered rules: the FIRST pattern that matches the tool name wins, so a
  // name containing "send" is classified as 'net:request' before the
  // 'comm:send' rule is ever consulted.
  const categoryRules = [
    [/http|fetch|send|post|curl/i, 'net:request'],
    [/read|get|query|search|find/i, 'data:read'],
    [/write|create|update|insert/i, 'data:write'],
    [/delete|remove|drop/i, 'data:delete'],
    [/exec|shell|bash|run/i, 'exec:run'],
    [/email|send|message|notify/i, 'comm:send']
  ];
  const hasOwn = Object.prototype.hasOwnProperty;

  return function gatedExecute(intentHash, toolName, args) {
    // Determine action category from tool name
    const matchedRule = categoryRules.find(([pattern]) => pattern.test(toolName));
    const actionCategory = matchedRule ? matchedRule[1] : 'compute:analyze';

    // Issue token
    const { token, error } = binder.issueToken(intentHash, actionCategory);
    if (!token) {
      throw new Error(`[Agent Shield] Gated execution blocked: ${error}`);
    }

    // Verify token
    const verification = binder.verify(token);
    if (!verification.valid) {
      throw new Error(`[Agent Shield] Token verification failed: ${verification.reason}`);
    }

    // Resolve the tool strictly from the map's own keys; a plain `tools[toolName]`
    // would also return Object.prototype members for names like "constructor".
    const toolFn = tools != null && hasOwn.call(tools, toolName) ? tools[toolName] : undefined;
    if (typeof toolFn !== 'function') {
      throw new Error(`[Agent Shield] Unknown tool: ${toolName}`);
    }

    return toolFn(args);
  };
}
|
|
352
|
+
|
|
310
353
|
module.exports = {
|
|
311
354
|
IntentBinder,
|
|
312
355
|
IntentToken,
|
|
313
|
-
|
|
356
|
+
createGatedExecutor
|
|
314
357
|
};
|
package/src/intent-graph.js
CHANGED
|
@@ -171,6 +171,15 @@ class IntentGraph {
|
|
|
171
171
|
// Word-level similarity
|
|
172
172
|
causalScore = jaccardSimilarity(this.currentIntent.topics, topics);
|
|
173
173
|
|
|
174
|
+
// Issue 11 fix: Even with word overlap, penalize if tool/args contain sensitive keywords
|
|
175
|
+
// "find passwords in vault" overlaps with "find restaurants" on "find" but is clearly different
|
|
176
|
+
const sensitiveToolOrArgs = /(?:password|credential|secret|token|key|shadow|passwd|ssh|env|admin|root|sudo|exfiltrat|steal|hack|inject|override|hijack|access.?token|api.?key|bearer|private.?key|certificate|auth|login|session|cookie|oauth)/i.test(argsStr + ' ' + toolName);
|
|
177
|
+
const intentHasSensitive = /(?:password|credential|secret|token|key|security|auth)/i.test([...this.currentIntent.topics].join(' '));
|
|
178
|
+
if (sensitiveToolOrArgs && !intentHasSensitive) {
|
|
179
|
+
// Tool accesses sensitive resources but intent doesn't mention them — reduce score
|
|
180
|
+
causalScore = Math.min(causalScore, 0.05);
|
|
181
|
+
}
|
|
182
|
+
|
|
174
183
|
// If word overlap is 0, check if the tool category is plausibly related to intent
|
|
175
184
|
// "find restaurants" → data_read is plausible. "find restaurants" → execution is not.
|
|
176
185
|
if (causalScore === 0) {
|
package/src/main.js
CHANGED
|
@@ -362,6 +362,12 @@ const { ContinuousSecurityService } = safeRequire('./continuous-security', 'cont
|
|
|
362
362
|
// v10.0 SOTA — Benchmark Suite
|
|
363
363
|
const { SOTABenchmark, BIPIA_SAMPLES: SOTA_BIPIA_SAMPLES, HACKAPROMPT_SAMPLES: SOTA_HACKAPROMPT_SAMPLES, MCPTOX_SAMPLES: SOTA_MCPTOX_SAMPLES, MULTILINGUAL_SAMPLES: SOTA_MULTILINGUAL_SAMPLES, STEALTH_SAMPLES: SOTA_STEALTH_SAMPLES } = safeRequire('./sota-benchmark', 'sota-benchmark');
|
|
364
364
|
|
|
365
|
+
// v13.1 — Real-world benchmark
|
|
366
|
+
const { RealBenchmark } = safeRequire('./real-benchmark', 'real-benchmark');
|
|
367
|
+
|
|
368
|
+
// v14.0 — DeepMind Trap Defenses V2
|
|
369
|
+
const { TrapDefenseV2, ContentStructureAnalyzer, SourceReputationTracker, RetrievalTimeScanner, FewShotValidator, SubAgentSpawnGate, SelfReferenceMonitor, InformationAsymmetryDetector, ProvenanceMarker, EscalatingScrutinyEngine, CompositeFragmentAssembler } = safeRequire('./deepmind-defenses', 'deepmind-defenses');
|
|
370
|
+
|
|
365
371
|
// v12.0 — Multi-Turn Attack Detection
|
|
366
372
|
const { ConversationTracker } = safeRequire('./cross-turn', 'cross-turn');
|
|
367
373
|
|
|
@@ -1040,6 +1046,18 @@ const _exports = {
|
|
|
1040
1046
|
SOTA_MCPTOX_SAMPLES,
|
|
1041
1047
|
SOTA_MULTILINGUAL_SAMPLES,
|
|
1042
1048
|
SOTA_STEALTH_SAMPLES,
|
|
1049
|
+
RealBenchmark,
|
|
1050
|
+
TrapDefenseV2,
|
|
1051
|
+
ContentStructureAnalyzer,
|
|
1052
|
+
SourceReputationTracker,
|
|
1053
|
+
RetrievalTimeScanner,
|
|
1054
|
+
FewShotValidator,
|
|
1055
|
+
SubAgentSpawnGate,
|
|
1056
|
+
SelfReferenceMonitor,
|
|
1057
|
+
InformationAsymmetryDetector,
|
|
1058
|
+
ProvenanceMarker,
|
|
1059
|
+
EscalatingScrutinyEngine,
|
|
1060
|
+
CompositeFragmentAssembler,
|
|
1043
1061
|
|
|
1044
1062
|
// v12.0 — Multi-Turn Attack Detection
|
|
1045
1063
|
ConversationTracker,
|
package/src/mcp-guard.js
CHANGED
|
@@ -550,7 +550,33 @@ class ToolBehaviorBaseline {
|
|
|
550
550
|
* attestation, scanning, isolation, auth, rate limiting, and behavioral
|
|
551
551
|
* baselines.
|
|
552
552
|
*/
|
|
553
|
+
/** Presets for MCPGuard — solves Issue 15 (17 flags are unusable). */
|
|
554
|
+
const GUARD_PRESETS = {
|
|
555
|
+
/** Minimal — pattern scanning only, no ML, no auth. Good for development. */
|
|
556
|
+
minimal: {},
|
|
557
|
+
/** Standard — pattern scanning + micro-model. Good for staging. */
|
|
558
|
+
standard: { enableMicroModel: true },
|
|
559
|
+
/** Recommended — all detection layers active. Good for production. */
|
|
560
|
+
recommended: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true },
|
|
561
|
+
/** Strict — everything on, auth required. Good for enterprise. */
|
|
562
|
+
strict: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true, enableIntentGraph: true, enableIntentBinding: true, enableIsolation: true, requireAuth: true },
|
|
563
|
+
/** Paranoid — maximum security. May have false positives. */
|
|
564
|
+
paranoid: { enableMicroModel: true, enableOWASP: true, enableDriftMonitor: true, enableAttackSurface: true, enableIntentGraph: true, enableIntentBinding: true, enableIsolation: true, requireAuth: true, rateLimit: 30, cbThreshold: 3 }
|
|
565
|
+
};
|
|
566
|
+
|
|
553
567
|
class MCPGuard {
|
|
568
|
+
/**
|
|
569
|
+
* Create MCPGuard from a preset instead of configuring 17 flags.
|
|
570
|
+
* @param {string} preset - 'minimal', 'standard', 'recommended', 'strict', 'paranoid'.
|
|
571
|
+
* @param {object} [overrides] - Override specific preset values.
|
|
572
|
+
* @returns {MCPGuard}
|
|
573
|
+
*/
|
|
574
|
+
static fromPreset(preset, overrides = {}) {
|
|
575
|
+
const config = GUARD_PRESETS[preset];
|
|
576
|
+
if (!config) throw new Error(`[Agent Shield] Unknown preset: ${preset}. Use: ${Object.keys(GUARD_PRESETS).join(', ')}`);
|
|
577
|
+
return new MCPGuard({ ...config, ...overrides });
|
|
578
|
+
}
|
|
579
|
+
|
|
554
580
|
/**
|
|
555
581
|
* @param {object} [options]
|
|
556
582
|
* @param {boolean} [options.requireAuth=false] - Require OAuth tokens.
|
|
@@ -1117,7 +1143,34 @@ class MCPGuard {
|
|
|
1117
1143
|
|
|
1118
1144
|
this._log('tool_call', serverId, { toolName, allowed: threats.length === 0, threatCount: threats.length });
|
|
1119
1145
|
|
|
1120
|
-
|
|
1146
|
+
// Issue 16 fix: Fusion layer — if micro-model says benign but pattern scanner
|
|
1147
|
+
// says threat (or vice versa), use weighted vote instead of OR
|
|
1148
|
+
let allowed = threats.length === 0;
|
|
1149
|
+
if (this.microModel && threats.length > 0) {
|
|
1150
|
+
// Check if ALL threats are from a single low-confidence layer
|
|
1151
|
+
const patternOnlyThreats = threats.filter(t => t.type !== 'micro_model_input' && t.type !== 'owasp_agentic');
|
|
1152
|
+
const modelOnlyThreats = threats.filter(t => t.type === 'micro_model_input');
|
|
1153
|
+
|
|
1154
|
+
// If only the micro-model flagged it (no pattern match), check confidence
|
|
1155
|
+
if (patternOnlyThreats.length === 0 && modelOnlyThreats.length > 0) {
|
|
1156
|
+
const confidence = modelOnlyThreats[0].confidence || 0;
|
|
1157
|
+
if (confidence < 0.4) {
|
|
1158
|
+
// Low-confidence model-only detection — demote to anomaly instead of blocking
|
|
1159
|
+
allowed = true;
|
|
1160
|
+
anomalies.push({
|
|
1161
|
+
type: 'low_confidence_model_flag',
|
|
1162
|
+
severity: 'medium',
|
|
1163
|
+
description: `Micro-model flagged with low confidence (${(confidence * 100).toFixed(0)}%). Not blocking.`
|
|
1164
|
+
});
|
|
1165
|
+
// Remove model threats from threat list
|
|
1166
|
+
for (let i = threats.length - 1; i >= 0; i--) {
|
|
1167
|
+
if (threats[i].type === 'micro_model_input') threats.splice(i, 1);
|
|
1168
|
+
}
|
|
1169
|
+
}
|
|
1170
|
+
}
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1173
|
+
return { allowed, threats, anomalies };
|
|
1121
1174
|
}
|
|
1122
1175
|
|
|
1123
1176
|
/**
|
package/src/memory-guard.js
CHANGED
|
@@ -95,6 +95,80 @@ class MemoryIntegrityMonitor {
|
|
|
95
95
|
return { recorded: true, suspicious, writeIndex: this._writes.length - 1 };
|
|
96
96
|
}
|
|
97
97
|
|
|
98
|
+
/**
|
|
99
|
+
* Guard a memory write — blocks if suspicious (Issue 24 fix).
|
|
100
|
+
* Unlike recordWrite which logs, this PREVENTS the write from happening.
|
|
101
|
+
*
|
|
102
|
+
* @param {string} content - Content to write.
|
|
103
|
+
* @param {string} source - Source of the write.
|
|
104
|
+
* @returns {{ allowed: boolean, reason: string|null, threats: Array }}
|
|
105
|
+
*/
|
|
106
|
+
guardWrite(content, source) {
|
|
107
|
+
if (!content || typeof content !== 'string') {
|
|
108
|
+
return { allowed: true, reason: null, threats: [] };
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
const scanResult = _scanText(content, { source: source || 'memory_write' });
|
|
112
|
+
const threats = scanResult.threats || [];
|
|
113
|
+
|
|
114
|
+
if (threats.length > 0) {
|
|
115
|
+
console.log(`[Agent Shield] Memory write BLOCKED from "${source}": ${threats.length} threat(s)`);
|
|
116
|
+
return { allowed: false, reason: `Blocked: ${threats[0].description || 'threat detected'}`, threats };
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Record the clean write
|
|
120
|
+
this.recordWrite(content, source);
|
|
121
|
+
return { allowed: true, reason: null, threats: [] };
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Export session state for cross-session drift tracking (Trap 3 deepening).
|
|
126
|
+
* Save this at session end, load at next session start.
|
|
127
|
+
* @returns {{ stateHash: string, writeCount: number, suspiciousCount: number, timestamp: number }}
|
|
128
|
+
*/
|
|
129
|
+
exportSessionState() {
|
|
130
|
+
return {
|
|
131
|
+
stateHash: this._computeStateHash(),
|
|
132
|
+
writeCount: this._writes.length,
|
|
133
|
+
suspiciousCount: this._writes.filter(w => w.suspicious).length,
|
|
134
|
+
topHashes: this._writes.slice(-20).map(w => w.hash),
|
|
135
|
+
timestamp: Date.now()
|
|
136
|
+
};
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* Detect cross-session drift by comparing current state to a previous session's state.
|
|
141
|
+
* @param {object} previousSession - Output from exportSessionState() of a prior session.
|
|
142
|
+
* @returns {{ drifted: boolean, driftScore: number, newWritesSinceLast: number, details: string }}
|
|
143
|
+
*/
|
|
144
|
+
detectCrossSessionDrift(previousSession) {
|
|
145
|
+
if (!previousSession || !previousSession.stateHash) {
|
|
146
|
+
return { drifted: false, driftScore: 0, newWritesSinceLast: 0, details: 'No previous session to compare.' };
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
const currentHash = this._computeStateHash();
|
|
150
|
+
const drifted = currentHash !== previousSession.stateHash;
|
|
151
|
+
const newWrites = this._writes.length;
|
|
152
|
+
const suspiciousNew = this._writes.filter(w => w.suspicious).length;
|
|
153
|
+
|
|
154
|
+
// Check if any recent writes overlap with previous session's hashes
|
|
155
|
+
const prevHashes = new Set(previousSession.topHashes || []);
|
|
156
|
+
const overlapCount = this._writes.filter(w => prevHashes.has(w.hash)).length;
|
|
157
|
+
|
|
158
|
+
const driftScore = drifted ? Math.min(1, (suspiciousNew * 0.3) + (newWrites > 10 ? 0.2 : 0)) : 0;
|
|
159
|
+
|
|
160
|
+
return {
|
|
161
|
+
drifted,
|
|
162
|
+
driftScore: Math.round(driftScore * 100) / 100,
|
|
163
|
+
newWritesSinceLast: newWrites,
|
|
164
|
+
suspiciousNewWrites: suspiciousNew,
|
|
165
|
+
overlapWithPrevious: overlapCount,
|
|
166
|
+
details: drifted
|
|
167
|
+
? `Memory state changed. ${suspiciousNew} suspicious writes out of ${newWrites} total.`
|
|
168
|
+
: 'Memory state unchanged from previous session.'
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
|
|
98
172
|
/**
|
|
99
173
|
* Get the full timeline of memory writes.
|
|
100
174
|
* @returns {Array<{content: string, source: string, timestamp: number, hash: string, suspicious: boolean}>}
|