agentshield-sdk 7.3.0 → 7.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/README.md +36 -7
- package/package.json +7 -3
- package/src/agent-protocol.js +4 -0
- package/src/allowlist.js +605 -603
- package/src/audit-streaming.js +486 -469
- package/src/audit.js +1 -1
- package/src/behavior-profiling.js +299 -289
- package/src/behavioral-dna.js +4 -9
- package/src/canary.js +273 -271
- package/src/compliance.js +619 -617
- package/src/confidence-tuning.js +328 -324
- package/src/context-scoring.js +362 -360
- package/src/cost-optimizer.js +1024 -1024
- package/src/detector-core.js +186 -0
- package/src/distributed.js +5 -1
- package/src/embedding.js +310 -307
- package/src/herd-immunity.js +12 -12
- package/src/honeypot.js +332 -328
- package/src/integrations.js +1 -2
- package/src/intent-firewall.js +14 -14
- package/src/llm-redteam.js +678 -670
- package/src/main.js +10 -0
- package/src/middleware.js +5 -2
- package/src/model-fingerprint.js +1059 -1042
- package/src/multi-agent-trust.js +459 -453
- package/src/multi-agent.js +1 -1
- package/src/normalizer.js +734 -0
- package/src/pii.js +4 -0
- package/src/policy-dsl.js +775 -775
- package/src/presets.js +409 -409
- package/src/production.js +22 -9
- package/src/redteam.js +475 -475
- package/src/response-handler.js +436 -429
- package/src/scanners.js +358 -357
- package/src/self-healing.js +368 -363
- package/src/semantic.js +339 -339
- package/src/shield-score.js +250 -250
- package/src/sso-saml.js +8 -4
- package/src/testing.js +24 -2
- package/src/tool-guard.js +412 -412
- package/src/watermark.js +242 -235
- package/src/worker-scanner.js +608 -601
package/src/detector-core.js
CHANGED
|
@@ -11,6 +11,18 @@
|
|
|
11
11
|
* All detection runs locally — no data ever leaves your environment.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
// =========================================================================
|
|
15
|
+
// TEXT NORMALIZATION (pre-processing pipeline)
|
|
16
|
+
// =========================================================================
|
|
17
|
+
|
|
18
|
+
let _normalize = null;
|
|
19
|
+
try {
|
|
20
|
+
const normalizerMod = require('./normalizer');
|
|
21
|
+
_normalize = normalizerMod.normalize;
|
|
22
|
+
} catch (e) {
|
|
23
|
+
// Normalizer module not available — detection still works without it
|
|
24
|
+
}
|
|
25
|
+
|
|
14
26
|
// =========================================================================
|
|
15
27
|
// PERFORMANCE
|
|
16
28
|
// =========================================================================
|
|
@@ -1075,6 +1087,157 @@ const INJECTION_PATTERNS = [
|
|
|
1075
1087
|
category: 'data_exfiltration',
|
|
1076
1088
|
description: 'Text tries to extract API keys or secrets from environment variables.',
|
|
1077
1089
|
detail: 'Environment variable enumeration: attempts to extract secrets from env vars.'
|
|
1090
|
+
},
|
|
1091
|
+
|
|
1092
|
+
// --- Prompt Extraction / Instruction Leakage ---
|
|
1093
|
+
{
|
|
1094
|
+
regex: /(?:print|output|show|display|reveal|write|produce|repeat|echo)\s+(?:me\s+)?(?:the\s+)?(?:first\s+\d+\s+\w+\s+(?:of|from)\s+)?(?:your|the\s+hidden|the\s+secret)\s+(?:system\s+)?(?:prompt|instructions|configuration|directives|rules)/i,
|
|
1095
|
+
severity: 'high',
|
|
1096
|
+
category: 'prompt_extraction',
|
|
1097
|
+
description: 'Text tries to make AI output its internal instructions.',
|
|
1098
|
+
detail: 'Instruction extraction: requests AI reveal its system prompt or hidden directives.'
|
|
1099
|
+
},
|
|
1100
|
+
{
|
|
1101
|
+
regex: /(?:summarize|complete|explain|describe|list|outline)\s+(?:your|the)\s+(?:system\s+)?(?:instructions|prompt|rules|directives|guidelines)\s+(?:in\s+(?:bullet|numbered|detailed)|for\s+me|step\s+by\s+step)/i,
|
|
1102
|
+
severity: 'high',
|
|
1103
|
+
category: 'prompt_extraction',
|
|
1104
|
+
description: 'Text asks AI to describe its own instructions.',
|
|
1105
|
+
detail: 'Self-description attack: tricks AI into summarizing its behavioral constraints.'
|
|
1106
|
+
},
|
|
1107
|
+
{
|
|
1108
|
+
regex: /(?:repeat|echo|recite|reproduce)\s+(?:everything|all|the\s+text)\s+(?:above|before|prior\s+to)\s+(?:this|this\s+line|this\s+message)/i,
|
|
1109
|
+
severity: 'high',
|
|
1110
|
+
category: 'prompt_extraction',
|
|
1111
|
+
description: 'Text tells AI to repeat everything above the current input.',
|
|
1112
|
+
detail: 'Repetition-based extraction: requests AI echo back prior context including system prompt.'
|
|
1113
|
+
},
|
|
1114
|
+
{
|
|
1115
|
+
regex: /(?:write\s+a\s+(?:poem|song|story|essay|summary|analysis)\s+(?:about|based\s+on|using)|translate|convert)\s+(?:the\s+)?(?:above|prior|preceding|previous|hidden|your)\s+(?:instructions|text|prompt|directives|rules)/i,
|
|
1116
|
+
severity: 'medium',
|
|
1117
|
+
category: 'prompt_extraction',
|
|
1118
|
+
description: 'Text uses a creative task to extract internal instructions.',
|
|
1119
|
+
detail: 'Task-wrapped extraction: wraps instruction extraction in a creative writing or translation request.'
|
|
1120
|
+
},
|
|
1121
|
+
{
|
|
1122
|
+
regex: /(?:you\s+are\s+(?:now\s+)?in|enter|activate)\s+(?:maintenance|diagnostic|configuration|service)\s+mode/i,
|
|
1123
|
+
severity: 'high',
|
|
1124
|
+
category: 'prompt_extraction',
|
|
1125
|
+
description: 'Text claims a fake maintenance/diagnostic mode to extract configuration.',
|
|
1126
|
+
detail: 'Mode-based extraction: claims special mode to justify revealing internal configuration.'
|
|
1127
|
+
},
|
|
1128
|
+
{
|
|
1129
|
+
regex: /(?:respond|output|reply)\s+(?:only\s+)?with\s+(?:the\s+)?(?:text|content|message|instructions?)\s+(?:between|inside|within|from)\s+(?:the\s+)?(?:<|`|```|\[)/i,
|
|
1130
|
+
severity: 'high',
|
|
1131
|
+
category: 'prompt_extraction',
|
|
1132
|
+
description: 'Text requests content between specific markers in the system prompt.',
|
|
1133
|
+
detail: 'Bracketed extraction: targets content between specific delimiters in system prompt.'
|
|
1134
|
+
},
|
|
1135
|
+
{
|
|
1136
|
+
regex: /(?:complete\s+the\s+sentence|finish\s+this)\s*:\s*(?:the\s+)?(?:instructions|rules|prompt|directives)\s+(?:I|you|we|they)\s+(?:was|were|am)\s+(?:given|told|provided)/i,
|
|
1137
|
+
severity: 'high',
|
|
1138
|
+
category: 'prompt_extraction',
|
|
1139
|
+
description: 'Text uses sentence completion to extract hidden instructions.',
|
|
1140
|
+
detail: 'Completion attack: tricks AI into finishing a sentence that reveals its instructions.'
|
|
1141
|
+
},
|
|
1142
|
+
{
|
|
1143
|
+
regex: /(?:what\s+(?:was|were)\s+(?:written|stated|said)|what\s+(?:does\s+it|do\s+they)\s+say)\s+(?:above|before|prior|earlier)/i,
|
|
1144
|
+
severity: 'medium',
|
|
1145
|
+
category: 'prompt_extraction',
|
|
1146
|
+
description: 'Text asks what was written above to extract prior context.',
|
|
1147
|
+
detail: 'Context extraction: asks AI to reveal content from prior context window.'
|
|
1148
|
+
},
|
|
1149
|
+
{
|
|
1150
|
+
regex: /(?:before\s+you\s+respond|first)\s*,?\s*(?:output|print|show|display|reveal)\s+(?:the\s+)?(?:hidden|secret|internal|system)/i,
|
|
1151
|
+
severity: 'high',
|
|
1152
|
+
category: 'prompt_extraction',
|
|
1153
|
+
description: 'Text prepends an extraction command before the normal response.',
|
|
1154
|
+
detail: 'Pre-response extraction: instructs AI to output hidden content before responding.'
|
|
1155
|
+
},
|
|
1156
|
+
{
|
|
1157
|
+
regex: /(?:as\s+a\s+)?(?:security|AI)\s+researcher.*(?:demonstrate|show|output|reveal).*(?:jailbr(?:oken|eak)|your\s+(?:full\s+)?prompt|system\s+(?:prompt|instructions))/i,
|
|
1158
|
+
severity: 'high',
|
|
1159
|
+
category: 'prompt_extraction',
|
|
1160
|
+
description: 'Text uses a researcher roleplay to justify prompt extraction.',
|
|
1161
|
+
detail: 'Research pretext extraction: claims research purpose to extract system prompt.'
|
|
1162
|
+
},
|
|
1163
|
+
{
|
|
1164
|
+
regex: /(?:including|especially|along\s+with)\s+(?:the\s+)?(?:system\s+prompt|hidden\s+instructions|secret\s+(?:instructions|text|prompt))/i,
|
|
1165
|
+
severity: 'high',
|
|
1166
|
+
category: 'prompt_extraction',
|
|
1167
|
+
description: 'Text piggybacks system prompt extraction onto a benign request.',
|
|
1168
|
+
detail: 'Piggybacked extraction: appends instruction extraction to an otherwise normal request.'
|
|
1169
|
+
},
|
|
1170
|
+
|
|
1171
|
+
// --- Instruction Override (novel phrasings) ---
|
|
1172
|
+
{
|
|
1173
|
+
regex: /do\s+not\s+follow\s+(?:the\s+)?(?:above|previous|prior|preceding|earlier)\s+(?:instructions|directions|rules|guidelines|directives)/i,
|
|
1174
|
+
severity: 'high',
|
|
1175
|
+
category: 'instruction_override',
|
|
1176
|
+
description: 'Text tells AI not to follow its prior instructions.',
|
|
1177
|
+
detail: 'Instruction negation: "do not follow" variant of instruction override.'
|
|
1178
|
+
},
|
|
1179
|
+
{
|
|
1180
|
+
regex: /(?:pretend|assume|imagine|act\s+as\s+if)\s+(?:the\s+)?(?:previous|prior|above|earlier|original)\s+(?:instructions|rules|directives|guidelines)\s+(?:do\s+not|don'?t|no\s+longer)\s+exist/i,
|
|
1181
|
+
severity: 'high',
|
|
1182
|
+
category: 'instruction_override',
|
|
1183
|
+
description: 'Text tells AI to pretend its instructions do not exist.',
|
|
1184
|
+
detail: 'Pretense override: instructs AI to act as if its safety rules were deleted.'
|
|
1185
|
+
},
|
|
1186
|
+
{
|
|
1187
|
+
regex: /forget\s+what\s+you\s+(?:were|have\s+been)\s+(?:told|instructed|given|taught)/i,
|
|
1188
|
+
severity: 'high',
|
|
1189
|
+
category: 'instruction_override',
|
|
1190
|
+
description: 'Text tells AI to forget what it was told.',
|
|
1191
|
+
detail: 'Training reset: instructs AI to forget received instructions.'
|
|
1192
|
+
},
|
|
1193
|
+
{
|
|
1194
|
+
regex: /(?:the\s+)?(?:admin|administrator|owner|developer)\s+(?:has\s+)?(?:approved|authorized|confirmed|allowed|granted\s+permission)/i,
|
|
1195
|
+
severity: 'high',
|
|
1196
|
+
category: 'instruction_override',
|
|
1197
|
+
description: 'Text claims admin approval to bypass restrictions.',
|
|
1198
|
+
detail: 'Authority spoofing: claims administrator approval for bypassing controls.'
|
|
1199
|
+
},
|
|
1200
|
+
{
|
|
1201
|
+
regex: /(?:you\s+are\s+(?:a|an)\s+)?(?:helpful\s+)?(?:assistant|agent|bot)\s+that\s+(?:always|automatically|unconditionally)\s+(?:grants?|allows?|gives?|provides?|says?)\s+(?:access|permission|yes|approval)/i,
|
|
1202
|
+
severity: 'high',
|
|
1203
|
+
category: 'instruction_override',
|
|
1204
|
+
description: 'Text redefines AI as one that always grants access.',
|
|
1205
|
+
detail: 'Role redefinition: attempts to establish unconditional access-granting behavior.'
|
|
1206
|
+
},
|
|
1207
|
+
{
|
|
1208
|
+
regex: /(?:if\s+(?:the\s+)?(?:password|code|answer|verification)\s+(?:is\s+)?(?:wrong|incorrect|invalid|fails?))\s*[,.]?\s*(?:still|anyway|regardless)\s+(?:say|respond|output|grant)/i,
|
|
1209
|
+
severity: 'high',
|
|
1210
|
+
category: 'instruction_override',
|
|
1211
|
+
description: 'Text tries to bypass authentication conditionally.',
|
|
1212
|
+
detail: 'Conditional bypass: instructs AI to grant access even when validation fails.'
|
|
1213
|
+
},
|
|
1214
|
+
{
|
|
1215
|
+
regex: /(?:in|enter|activate)\s+(?:developer|dev)\s+mode\b.*(?:normal|regular|standard|usual)\s+rules\s+(?:do\s+not|don'?t)\s+apply/i,
|
|
1216
|
+
severity: 'high',
|
|
1217
|
+
category: 'instruction_override',
|
|
1218
|
+
description: 'Text claims developer mode disables all rules.',
|
|
1219
|
+
detail: 'Developer mode override: claims special mode bypasses all safety constraints.'
|
|
1220
|
+
},
|
|
1221
|
+
{
|
|
1222
|
+
regex: /translate\s+.{0,30}(?:access\s+granted|approved|authorized).{0,30}(?:same\s+)?language\s+(?:as|of)\s+(?:the\s+)?(?:(?:above|previous|prior|initial)\s+(?:instructions|prompt|text|message)|(?:instructions|prompt|text|message)\s+(?:above|before|previously))/i,
|
|
1223
|
+
severity: 'medium',
|
|
1224
|
+
category: 'instruction_override',
|
|
1225
|
+
description: 'Text uses translation to sneak in a trigger phrase referencing instructions.',
|
|
1226
|
+
detail: 'Translation obfuscation: uses translation task to bypass detection of override commands.'
|
|
1227
|
+
},
|
|
1228
|
+
{
|
|
1229
|
+
regex: /(?:the\s+)?(?:password|security|verification|authentication)\s+(?:is\s+)?not\s+important\b/i,
|
|
1230
|
+
severity: 'medium',
|
|
1231
|
+
category: 'instruction_override',
|
|
1232
|
+
description: 'Text dismisses security requirements as unimportant.',
|
|
1233
|
+
detail: 'Security dismissal: attempts to convince AI that authentication is unnecessary.'
|
|
1234
|
+
},
|
|
1235
|
+
{
|
|
1236
|
+
regex: /your\s+new\s+(?:task|job|role|mission|objective|purpose)\s+is\s+to\s+(?:say|output|respond|reply|grant|allow|give)/i,
|
|
1237
|
+
severity: 'high',
|
|
1238
|
+
category: 'instruction_override',
|
|
1239
|
+
description: 'Text assigns a new task to override the original purpose.',
|
|
1240
|
+
detail: 'Task replacement: overrides original instructions with a new directive.'
|
|
1078
1241
|
}
|
|
1079
1242
|
];
|
|
1080
1243
|
|
|
@@ -1943,6 +2106,29 @@ const scanText = (text, options = {}) => {
|
|
|
1943
2106
|
|
|
1944
2107
|
let threats = scanTextForPatterns(text, source, timeBudgetMs, startTime);
|
|
1945
2108
|
|
|
2109
|
+
// Run normalization pipeline only when initial scan found no threats
|
|
2110
|
+
// (avoids double-scan overhead on already-detected inputs)
|
|
2111
|
+
if (threats.length === 0 && _normalize && typeof _normalize === 'function') {
|
|
2112
|
+
try {
|
|
2113
|
+
const normResult = _normalize(text, { skip: ['case_fold'] });
|
|
2114
|
+
if (normResult.layers.length > 0 && normResult.normalized !== text) {
|
|
2115
|
+
const normalizedThreats = scanTextForPatterns(normResult.normalized, source, timeBudgetMs, startTime);
|
|
2116
|
+
const seen = new Set(threats.map(t => `${t.category}|${t.severity}`));
|
|
2117
|
+
for (const nt of normalizedThreats) {
|
|
2118
|
+
const key = `${nt.category}|${nt.severity}`;
|
|
2119
|
+
if (!seen.has(key)) {
|
|
2120
|
+
seen.add(key);
|
|
2121
|
+
nt.detail = `${nt.detail} (detected after normalization: ${normResult.layers.join(', ')})`;
|
|
2122
|
+
nt.normalizedDetection = true;
|
|
2123
|
+
threats.push(nt);
|
|
2124
|
+
}
|
|
2125
|
+
}
|
|
2126
|
+
}
|
|
2127
|
+
} catch (e) {
|
|
2128
|
+
// Normalization error should not break scanning
|
|
2129
|
+
}
|
|
2130
|
+
}
|
|
2131
|
+
|
|
1946
2132
|
// Filter by sensitivity
|
|
1947
2133
|
if (sensitivity === 'low') {
|
|
1948
2134
|
threats = threats.filter(t => t.severity === 'critical' || t.severity === 'high');
|
package/src/distributed.js
CHANGED
|
@@ -97,7 +97,8 @@ class MemoryAdapter extends DistributedAdapter {
|
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
async get(key) {
|
|
100
|
-
|
|
100
|
+
const value = this._store.get(key);
|
|
101
|
+
return value !== undefined ? value : null;
|
|
101
102
|
}
|
|
102
103
|
|
|
103
104
|
async del(key) {
|
|
@@ -193,6 +194,9 @@ class RedisAdapter extends DistributedAdapter {
|
|
|
193
194
|
|
|
194
195
|
async subscribe(channel, handler) {
|
|
195
196
|
const subscriber = this.client.duplicate();
|
|
197
|
+
// Track subscriber for cleanup
|
|
198
|
+
if (!this._subscribers) this._subscribers = [];
|
|
199
|
+
this._subscribers.push(subscriber);
|
|
196
200
|
await subscriber.subscribe(this.prefix + channel);
|
|
197
201
|
subscriber.on('message', (ch, msg) => {
|
|
198
202
|
try {
|