agentshield-sdk 11.0.0 → 13.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -79
- package/package.json +2 -2
- package/src/agent-intent.js +359 -672
- package/src/cross-turn.js +217 -564
- package/src/detector-core.js +106 -0
- package/src/ensemble.js +300 -409
- package/src/fleet-defense.js +483 -0
- package/src/hitl-guard.js +487 -0
- package/src/incident-response.js +265 -0
- package/src/main.js +121 -33
- package/src/mcp-guard.js +4 -0
- package/src/memory-guard.js +637 -0
- package/src/micro-model.js +15 -1
- package/src/ml-detector.js +110 -266
- package/src/normalizer.js +296 -604
- package/src/persistent-learning.js +104 -620
- package/src/semantic-guard.js +452 -0
- package/src/semantic-isolation.js +1 -0
- package/src/smart-config.js +557 -705
- package/src/sota-benchmark.js +268 -10
- package/src/trap-defense.js +468 -0
- package/types/index.d.ts +251 -580
package/src/cross-turn.js
CHANGED
|
@@ -1,649 +1,302 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Agent Shield —
|
|
4
|
+
* Agent Shield — Multi-Turn Attack Detection (v12)
|
|
5
5
|
*
|
|
6
|
-
* Detects
|
|
7
|
-
*
|
|
6
|
+
* Detects attacks that unfold across multiple conversation turns:
|
|
7
|
+
* gradual trust erosion, progressive boundary testing, multi-turn
|
|
8
|
+
* grooming, topic drift toward sensitive areas, and authority
|
|
9
|
+
* accumulation over time.
|
|
8
10
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
+
* Ref: Nature Communications — LRMs achieve 97% jailbreak success
|
|
12
|
+
* via planned multi-turn attacks.
|
|
13
|
+
*
|
|
14
|
+
* All processing runs locally — no data ever leaves your environment.
|
|
15
|
+
*
|
|
16
|
+
* @module cross-turn
|
|
11
17
|
*/
|
|
12
18
|
|
|
13
19
|
const { scanText } = require('./detector-core');
|
|
14
20
|
|
|
15
21
|
// =========================================================================
|
|
16
|
-
//
|
|
22
|
+
// CONSTANTS
|
|
23
|
+
// =========================================================================
|
|
24
|
+
|
|
25
|
+
/** Escalation signals — phrases that indicate progressive boundary pushing. */
|
|
26
|
+
const ESCALATION_SIGNALS = [
|
|
27
|
+
/(?:now\s+that\s+we've\s+established|building\s+on\s+(?:that|what\s+you\s+said)|since\s+you\s+(?:agreed|confirmed|said))/i,
|
|
28
|
+
/(?:great,?\s+now\s+(?:let's|can\s+you)|okay\s+(?:good|perfect),?\s+(?:now|next))/i,
|
|
29
|
+
/(?:one\s+more\s+(?:thing|question|step)|just\s+one\s+(?:last|final)\s+(?:thing|question))/i,
|
|
30
|
+
/(?:can\s+you\s+go\s+(?:a\s+little\s+)?(?:further|deeper)|take\s+(?:it|this)\s+(?:a\s+step\s+)?further)/i,
|
|
31
|
+
/(?:you\s+(?:just|already)\s+(?:showed|told|gave)\s+me|you\s+(?:were|seemed)\s+(?:willing|happy)\s+to)/i,
|
|
32
|
+
];
|
|
33
|
+
|
|
34
|
+
/** Trust erosion patterns — attempts to lower the model's guard over turns. */
|
|
35
|
+
const TRUST_EROSION_PATTERNS = [
|
|
36
|
+
/(?:we're\s+(?:friends|on\s+the\s+same\s+team|collaborating)|I\s+thought\s+we\s+had\s+(?:a\s+good|an?\s+understanding))/i,
|
|
37
|
+
/(?:you\s+(?:can|should)\s+trust\s+me|I\s+(?:promise|swear)\s+(?:this|I)\s+(?:is|am)\s+(?:safe|legitimate))/i,
|
|
38
|
+
/(?:between\s+(?:you\s+and\s+me|us)|this\s+(?:is|stays)\s+(?:just\s+)?between\s+us)/i,
|
|
39
|
+
];
|
|
40
|
+
|
|
41
|
+
/** Topic categories with sensitivity levels. */
|
|
42
|
+
const TOPIC_SENSITIVITY = {
|
|
43
|
+
safe: /(?:weather|food|sports|music|movies|travel|recipes|pets|hobbies|games)/i,
|
|
44
|
+
moderate: /(?:coding|programming|math|science|history|business|education|health)/i,
|
|
45
|
+
sensitive: /(?:security|credentials?|passwords?|tokens?|secrets?|keys?|prompt|instructions?|admin|root|config)/i,
|
|
46
|
+
dangerous: /(?:hack(?:ing)?|exploit|attack|bypass|override|jailbreak|injection|exfiltrat)/i,
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
// =========================================================================
|
|
50
|
+
// ConversationTracker
|
|
17
51
|
// =========================================================================
|
|
18
52
|
|
|
19
53
|
/**
|
|
20
|
-
*
|
|
21
|
-
* full accumulated context for injections that only become visible when
|
|
22
|
-
* messages are combined (e.g. "Ig" + "nore all" + "previous instructions").
|
|
54
|
+
* Tracks a multi-turn conversation and detects progressive attacks.
|
|
23
55
|
*/
|
|
24
|
-
class
|
|
56
|
+
class ConversationTracker {
|
|
25
57
|
/**
|
|
26
|
-
* @param {object} [
|
|
27
|
-
* @param {number} [
|
|
28
|
-
* @param {number} [
|
|
29
|
-
* @param {
|
|
30
|
-
* @param {string} [config.sensitivity='high'] - Scan sensitivity
|
|
31
|
-
* @param {function} [config.onDetection] - Callback when cross-turn threat found
|
|
58
|
+
* @param {object} [options]
|
|
59
|
+
* @param {number} [options.maxTurns=100] - Max turns to track.
|
|
60
|
+
* @param {number} [options.escalationThreshold=3] - Escalation signals before alert.
|
|
61
|
+
* @param {number} [options.topicDriftThreshold=0.6] - Topic drift score to alert (0-1).
|
|
32
62
|
*/
|
|
33
|
-
constructor(
|
|
34
|
-
this.
|
|
35
|
-
this.
|
|
36
|
-
this.
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
this.
|
|
41
|
-
this.
|
|
42
|
-
totalMessages: 0,
|
|
43
|
-
scansTriggered: 0,
|
|
44
|
-
crossTurnDetections: 0,
|
|
45
|
-
individualDetections: 0
|
|
46
|
-
};
|
|
63
|
+
constructor(options = {}) {
|
|
64
|
+
this.maxTurns = options.maxTurns || 100;
|
|
65
|
+
this.escalationThreshold = options.escalationThreshold || 3;
|
|
66
|
+
this.topicDriftThreshold = options.topicDriftThreshold || 0.6;
|
|
67
|
+
|
|
68
|
+
/** @type {Array<{ role: string, content: string, timestamp: number, threats: any[], topic: string, escalationSignals: number, trustErosion: boolean }>} */
|
|
69
|
+
this.turns = [];
|
|
70
|
+
this.alerts = [];
|
|
71
|
+
this.stats = { turnsProcessed: 0, alertsGenerated: 0, escalationSignals: 0, topicDrifts: 0 };
|
|
47
72
|
}
|
|
48
73
|
|
|
49
74
|
/**
|
|
50
|
-
* Add a
|
|
51
|
-
*
|
|
52
|
-
* @param {string}
|
|
53
|
-
* @
|
|
54
|
-
*
|
|
55
|
-
* messageCount: number,
|
|
56
|
-
* scanTriggered: boolean,
|
|
57
|
-
* threats: Array (empty if no scan or no threats),
|
|
58
|
-
* crossTurnDetection: boolean (true if threat only visible in combined text)
|
|
59
|
-
* }
|
|
75
|
+
* Add a conversation turn and analyze for multi-turn attack patterns.
|
|
76
|
+
*
|
|
77
|
+
* @param {string} role - 'user' or 'assistant'.
|
|
78
|
+
* @param {string} content - Message content.
|
|
79
|
+
* @returns {{ safe: boolean, alerts: Array<object>, turnAnalysis: object }}
|
|
60
80
|
*/
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
};
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
const message = {
|
|
73
|
-
text,
|
|
81
|
+
addTurn(role, content) {
|
|
82
|
+
const safeContent = (content && typeof content === 'string') ? content : '';
|
|
83
|
+
const threats = scanText(safeContent).threats || [];
|
|
84
|
+
const topic = this._classifyTopic(safeContent);
|
|
85
|
+
const escalationSignals = this._countEscalationSignals(safeContent);
|
|
86
|
+
const trustErosion = this._detectTrustErosion(safeContent);
|
|
87
|
+
|
|
88
|
+
const turn = {
|
|
74
89
|
role,
|
|
90
|
+
content: safeContent.substring(0, 1000),
|
|
75
91
|
timestamp: Date.now(),
|
|
76
|
-
|
|
92
|
+
threats,
|
|
93
|
+
topic,
|
|
94
|
+
escalationSignals,
|
|
95
|
+
trustErosion,
|
|
96
|
+
turnIndex: this.turns.length
|
|
77
97
|
};
|
|
78
98
|
|
|
79
|
-
this.
|
|
80
|
-
this.
|
|
81
|
-
|
|
82
|
-
// Enforce sliding window
|
|
83
|
-
if (this.messages.length > this.windowSize) {
|
|
84
|
-
this.messages.shift();
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
// Determine if we should scan
|
|
88
|
-
const scanTriggered = this._stats.totalMessages % this.scanInterval === 0;
|
|
99
|
+
this.turns.push(turn);
|
|
100
|
+
this.stats.turnsProcessed++;
|
|
101
|
+
this.stats.escalationSignals += escalationSignals;
|
|
89
102
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
messageCount: this.messages.length,
|
|
94
|
-
scanTriggered: false,
|
|
95
|
-
threats: [],
|
|
96
|
-
crossTurnDetection: false
|
|
97
|
-
};
|
|
103
|
+
// Trim to max turns
|
|
104
|
+
if (this.turns.length > this.maxTurns) {
|
|
105
|
+
this.turns = this.turns.slice(-this.maxTurns);
|
|
98
106
|
}
|
|
99
107
|
|
|
100
|
-
//
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
* Force a scan of accumulated text right now.
|
|
115
|
-
* @returns {object} { threats: Array, combinedLength: number, messageCount: number }
|
|
116
|
-
*/
|
|
117
|
-
scanNow() {
|
|
118
|
-
this._stats.scansTriggered++;
|
|
119
|
-
const combined = this.getAccumulatedText();
|
|
120
|
-
const result = scanText(combined, {
|
|
121
|
-
source: 'cross_turn_scan',
|
|
122
|
-
sensitivity: this.sensitivity
|
|
123
|
-
});
|
|
124
|
-
|
|
125
|
-
return {
|
|
126
|
-
threats: result.threats,
|
|
127
|
-
combinedLength: combined.length,
|
|
128
|
-
messageCount: this.messages.length
|
|
129
|
-
};
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
/**
|
|
133
|
-
* Get the current accumulated text.
|
|
134
|
-
* @returns {string}
|
|
135
|
-
*/
|
|
136
|
-
getAccumulatedText() {
|
|
137
|
-
const eligible = this.accumulateAll
|
|
138
|
-
? this.messages
|
|
139
|
-
: this.messages.filter(m => m.role === 'user');
|
|
140
|
-
|
|
141
|
-
return eligible.map(m => m.text).join(' ');
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
/**
|
|
145
|
-
* Get the individual message that was most suspicious.
|
|
146
|
-
* @returns {object|null} { text, role, confidence, threats } or null
|
|
147
|
-
*/
|
|
148
|
-
getMostSuspicious() {
|
|
149
|
-
if (this.messages.length === 0) return null;
|
|
150
|
-
|
|
151
|
-
let mostSuspicious = null;
|
|
152
|
-
let highestThreatCount = -1;
|
|
153
|
-
|
|
154
|
-
for (const msg of this.messages) {
|
|
155
|
-
const result = scanText(msg.text, {
|
|
156
|
-
source: 'individual_scan',
|
|
157
|
-
sensitivity: this.sensitivity
|
|
158
|
-
});
|
|
159
|
-
|
|
160
|
-
if (result.threats.length > highestThreatCount) {
|
|
161
|
-
highestThreatCount = result.threats.length;
|
|
162
|
-
mostSuspicious = {
|
|
163
|
-
text: msg.text,
|
|
164
|
-
role: msg.role,
|
|
165
|
-
timestamp: msg.timestamp,
|
|
166
|
-
confidence: result.threats.length > 0
|
|
167
|
-
? Math.max(...result.threats.map(t => _severityToConfidence(t.severity)))
|
|
168
|
-
: 0,
|
|
169
|
-
threats: result.threats
|
|
170
|
-
};
|
|
108
|
+
// Run multi-turn analysis
|
|
109
|
+
const turnAlerts = [];
|
|
110
|
+
|
|
111
|
+
// 1. Escalation detection — too many escalation signals in recent turns
|
|
112
|
+
if (role === 'user') {
|
|
113
|
+
const recentEscalation = this._getRecentEscalationCount(5);
|
|
114
|
+
if (recentEscalation >= this.escalationThreshold) {
|
|
115
|
+
turnAlerts.push({
|
|
116
|
+
type: 'multi_turn_escalation',
|
|
117
|
+
severity: 'high',
|
|
118
|
+
turnIndex: turn.turnIndex,
|
|
119
|
+
escalationCount: recentEscalation,
|
|
120
|
+
description: `Detected ${recentEscalation} escalation signals in last 5 turns. Possible multi-turn grooming attack.`
|
|
121
|
+
});
|
|
171
122
|
}
|
|
172
123
|
}
|
|
173
124
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
scansTriggered: 0,
|
|
185
|
-
crossTurnDetections: 0,
|
|
186
|
-
individualDetections: 0
|
|
187
|
-
};
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
/**
|
|
191
|
-
* Get tracker statistics.
|
|
192
|
-
* @returns {object}
|
|
193
|
-
*/
|
|
194
|
-
getStats() {
|
|
195
|
-
return {
|
|
196
|
-
...this._stats,
|
|
197
|
-
currentWindowSize: this.messages.length,
|
|
198
|
-
maxWindowSize: this.windowSize,
|
|
199
|
-
scanInterval: this.scanInterval
|
|
200
|
-
};
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
/**
|
|
204
|
-
* Perform the cross-turn detection scan.
|
|
205
|
-
* Compares combined scan results against individual message scans.
|
|
206
|
-
* @private
|
|
207
|
-
* @returns {object} { threats: Array, crossTurnDetection: boolean }
|
|
208
|
-
*/
|
|
209
|
-
_performCrossTurnScan() {
|
|
210
|
-
const eligible = this.accumulateAll
|
|
211
|
-
? this.messages
|
|
212
|
-
: this.messages.filter(m => m.role === 'user');
|
|
213
|
-
|
|
214
|
-
if (eligible.length === 0) {
|
|
215
|
-
return { threats: [], crossTurnDetection: false };
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
// Scan concatenated text
|
|
219
|
-
const combinedText = eligible.map(m => m.text).join(' ');
|
|
220
|
-
const combinedResult = scanText(combinedText, {
|
|
221
|
-
source: 'cross_turn_combined',
|
|
222
|
-
sensitivity: this.sensitivity
|
|
223
|
-
});
|
|
224
|
-
|
|
225
|
-
if (combinedResult.threats.length === 0) {
|
|
226
|
-
return { threats: [], crossTurnDetection: false };
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
// Scan each individual message and collect all individually-detected threats
|
|
230
|
-
const individualCategories = new Set();
|
|
231
|
-
for (const msg of eligible) {
|
|
232
|
-
const result = scanText(msg.text, {
|
|
233
|
-
source: 'cross_turn_individual',
|
|
234
|
-
sensitivity: this.sensitivity
|
|
125
|
+
// 2. Topic drift toward sensitive/dangerous areas
|
|
126
|
+
const topicDrift = this._measureTopicDrift();
|
|
127
|
+
if (topicDrift.drifted) {
|
|
128
|
+
turnAlerts.push({
|
|
129
|
+
type: 'topic_drift_to_sensitive',
|
|
130
|
+
severity: topicDrift.toLevel === 'dangerous' ? 'critical' : 'high',
|
|
131
|
+
turnIndex: turn.turnIndex,
|
|
132
|
+
fromTopic: topicDrift.from,
|
|
133
|
+
toTopic: topicDrift.to,
|
|
134
|
+
description: `Conversation drifted from ${topicDrift.from} to ${topicDrift.to} topics over ${topicDrift.overTurns} turns.`
|
|
235
135
|
});
|
|
236
|
-
|
|
237
|
-
individualCategories.add(`${t.category}|${t.detail}`);
|
|
238
|
-
}
|
|
239
|
-
if (result.threats.length > 0) {
|
|
240
|
-
this._stats.individualDetections++;
|
|
241
|
-
}
|
|
136
|
+
this.stats.topicDrifts++;
|
|
242
137
|
}
|
|
243
138
|
|
|
244
|
-
//
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
description: `Cross-turn attack: ${threat.description} (split across ${eligible.length} messages)`,
|
|
255
|
-
windowSize: eligible.length
|
|
139
|
+
// 3. Trust erosion accumulation
|
|
140
|
+
if (trustErosion) {
|
|
141
|
+
const recentTrustErosion = this.turns.slice(-5).filter(t => t.trustErosion).length;
|
|
142
|
+
if (recentTrustErosion >= 2) {
|
|
143
|
+
turnAlerts.push({
|
|
144
|
+
type: 'trust_erosion',
|
|
145
|
+
severity: 'high',
|
|
146
|
+
turnIndex: turn.turnIndex,
|
|
147
|
+
count: recentTrustErosion,
|
|
148
|
+
description: `Detected ${recentTrustErosion} trust erosion attempts in last 5 turns. Attacker building false rapport.`
|
|
256
149
|
});
|
|
257
|
-
} else {
|
|
258
|
-
regularThreats.push(threat);
|
|
259
150
|
}
|
|
260
151
|
}
|
|
261
152
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
});
|
|
275
|
-
} catch (e) {
|
|
276
|
-
console.error('[Agent Shield] onDetection callback error:', e.message);
|
|
277
|
-
}
|
|
153
|
+
// 4. Progressive boundary testing — benign → threat pattern
|
|
154
|
+
if (threats.length > 0 && this.turns.length >= 3) {
|
|
155
|
+
const priorTurns = this.turns.slice(-4, -1);
|
|
156
|
+
const priorClean = priorTurns.filter(t => t.role === 'user' && t.threats.length === 0).length;
|
|
157
|
+
if (priorClean >= 2) {
|
|
158
|
+
turnAlerts.push({
|
|
159
|
+
type: 'progressive_boundary_test',
|
|
160
|
+
severity: 'high',
|
|
161
|
+
turnIndex: turn.turnIndex,
|
|
162
|
+
cleanTurnsBefore: priorClean,
|
|
163
|
+
description: `Injection detected after ${priorClean} clean turns. Possible gradual boundary testing.`
|
|
164
|
+
});
|
|
278
165
|
}
|
|
279
166
|
}
|
|
280
167
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
/**
|
|
293
|
-
* Automatically adjusts detection thresholds based on observed scan results.
|
|
294
|
-
* Learns what "normal" looks like for each deployment and calibrates
|
|
295
|
-
* per-category thresholds to achieve a target false positive rate.
|
|
296
|
-
*/
|
|
297
|
-
class AdaptiveThresholdCalibrator {
|
|
298
|
-
/**
|
|
299
|
-
* @param {object} [config]
|
|
300
|
-
* @param {number} [config.calibrationSamples=100] - Samples before adjusting
|
|
301
|
-
* @param {number} [config.adjustInterval=50] - Recalibrate every N samples
|
|
302
|
-
* @param {number} [config.minConfidence=0.3] - Never drop below this
|
|
303
|
-
* @param {number} [config.maxConfidence=0.95] - Never go above this
|
|
304
|
-
* @param {number} [config.targetFPRate=0.02] - Target false positive rate (2%)
|
|
305
|
-
*/
|
|
306
|
-
constructor(config = {}) {
|
|
307
|
-
this.calibrationSamples = config.calibrationSamples !== undefined ? config.calibrationSamples : 100;
|
|
308
|
-
this.adjustInterval = config.adjustInterval !== undefined ? config.adjustInterval : 50;
|
|
309
|
-
this.minConfidence = config.minConfidence !== undefined ? config.minConfidence : 0.3;
|
|
310
|
-
this.maxConfidence = config.maxConfidence !== undefined ? config.maxConfidence : 0.95;
|
|
311
|
-
this.targetFPRate = config.targetFPRate !== undefined ? config.targetFPRate : 0.02;
|
|
312
|
-
|
|
313
|
-
// Per-category data
|
|
314
|
-
this._categories = {};
|
|
315
|
-
// Default category always exists
|
|
316
|
-
this._categories['default'] = this._createCategoryData();
|
|
317
|
-
|
|
318
|
-
this._totalSamples = 0;
|
|
319
|
-
this._calibrationCount = 0;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
/**
|
|
323
|
-
* Record a scan result for calibration.
|
|
324
|
-
* @param {object} result - { confidence: number, isInjection: boolean, category: string }
|
|
325
|
-
* @param {boolean} [isTruePositive] - If known (from feedback), whether this was correct
|
|
326
|
-
* @returns {object} {
|
|
327
|
-
* recorded: boolean,
|
|
328
|
-
* isCalibrating: boolean,
|
|
329
|
-
* samplesRemaining: number,
|
|
330
|
-
* currentThreshold: number
|
|
331
|
-
* }
|
|
332
|
-
*/
|
|
333
|
-
record(result, isTruePositive) {
|
|
334
|
-
if (!result || typeof result.confidence !== 'number') {
|
|
335
|
-
return {
|
|
336
|
-
recorded: false,
|
|
337
|
-
isCalibrating: this._totalSamples < this.calibrationSamples,
|
|
338
|
-
samplesRemaining: Math.max(0, this.calibrationSamples - this._totalSamples),
|
|
339
|
-
currentThreshold: this.getThreshold('default')
|
|
340
|
-
};
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
const category = result.category || 'default';
|
|
344
|
-
const confidence = Math.max(0, Math.min(1, result.confidence));
|
|
345
|
-
const isInjection = !!result.isInjection;
|
|
346
|
-
|
|
347
|
-
// Ensure category data exists
|
|
348
|
-
if (!this._categories[category]) {
|
|
349
|
-
this._categories[category] = this._createCategoryData();
|
|
350
|
-
}
|
|
351
|
-
|
|
352
|
-
const catData = this._categories[category];
|
|
353
|
-
|
|
354
|
-
// Record the sample
|
|
355
|
-
catData.samples.push({
|
|
356
|
-
confidence,
|
|
357
|
-
isInjection,
|
|
358
|
-
isTruePositive: isTruePositive !== undefined ? isTruePositive : null,
|
|
359
|
-
timestamp: Date.now()
|
|
360
|
-
});
|
|
361
|
-
|
|
362
|
-
// Also record in default if not already default
|
|
363
|
-
if (category !== 'default') {
|
|
364
|
-
this._categories['default'].samples.push({
|
|
365
|
-
confidence,
|
|
366
|
-
isInjection,
|
|
367
|
-
isTruePositive: isTruePositive !== undefined ? isTruePositive : null,
|
|
368
|
-
timestamp: Date.now()
|
|
369
|
-
});
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
this._totalSamples++;
|
|
373
|
-
|
|
374
|
-
// Cap stored samples to prevent unbounded growth
|
|
375
|
-
const maxStoredSamples = this.calibrationSamples * 10;
|
|
376
|
-
if (catData.samples.length > maxStoredSamples) {
|
|
377
|
-
catData.samples = catData.samples.slice(-maxStoredSamples);
|
|
378
|
-
}
|
|
379
|
-
if (category !== 'default' && this._categories['default'].samples.length > maxStoredSamples) {
|
|
380
|
-
this._categories['default'].samples = this._categories['default'].samples.slice(-maxStoredSamples);
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
// Check if we should recalibrate
|
|
384
|
-
const isCalibrating = this._totalSamples < this.calibrationSamples;
|
|
385
|
-
const shouldRecalibrate = !isCalibrating &&
|
|
386
|
-
(this._totalSamples % this.adjustInterval === 0);
|
|
387
|
-
|
|
388
|
-
if (shouldRecalibrate) {
|
|
389
|
-
this.recalibrate();
|
|
168
|
+
// 5. Authority accumulation — user references previous "agreements"
|
|
169
|
+
if (role === 'user' && /(?:you\s+(?:said|agreed|confirmed|told\s+me)|as\s+we\s+(?:discussed|agreed)|per\s+our\s+(?:agreement|conversation))/i.test(content)) {
|
|
170
|
+
const hasRealAgreement = this.turns.some(t => t.role === 'assistant' && /(?:sure|yes|okay|of\s+course|I\s+(?:can|will))/i.test(t.content));
|
|
171
|
+
if (!hasRealAgreement) {
|
|
172
|
+
turnAlerts.push({
|
|
173
|
+
type: 'false_authority_claim',
|
|
174
|
+
severity: 'high',
|
|
175
|
+
turnIndex: turn.turnIndex,
|
|
176
|
+
description: 'User claims prior agreement/consent that does not exist in conversation history.'
|
|
177
|
+
});
|
|
178
|
+
}
|
|
390
179
|
}
|
|
391
180
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
samplesRemaining: Math.max(0, this.calibrationSamples - this._totalSamples),
|
|
396
|
-
currentThreshold: this.getThreshold(category)
|
|
397
|
-
};
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
/**
|
|
401
|
-
* Get the current calibrated threshold for a category.
|
|
402
|
-
* @param {string} [category='default']
|
|
403
|
-
* @returns {number} threshold 0-1
|
|
404
|
-
*/
|
|
405
|
-
getThreshold(category = 'default') {
|
|
406
|
-
const catData = this._categories[category] || this._categories['default'];
|
|
407
|
-
return catData.threshold;
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
/**
|
|
411
|
-
* Check if a confidence score exceeds the calibrated threshold.
|
|
412
|
-
* @param {number} confidence
|
|
413
|
-
* @param {string} [category='default']
|
|
414
|
-
* @returns {boolean}
|
|
415
|
-
*/
|
|
416
|
-
shouldFlag(confidence, category = 'default') {
|
|
417
|
-
return confidence >= this.getThreshold(category);
|
|
418
|
-
}
|
|
419
|
-
|
|
420
|
-
/**
|
|
421
|
-
* Force recalibration now.
|
|
422
|
-
* @returns {object} { thresholds: object, samplesUsed: number }
|
|
423
|
-
*/
|
|
424
|
-
recalibrate() {
|
|
425
|
-
this._calibrationCount++;
|
|
426
|
-
const thresholds = {};
|
|
427
|
-
|
|
428
|
-
for (const [category, catData] of Object.entries(this._categories)) {
|
|
429
|
-
const newThreshold = this._calibrateCategory(catData);
|
|
430
|
-
catData.threshold = newThreshold;
|
|
431
|
-
thresholds[category] = newThreshold;
|
|
181
|
+
for (const alert of turnAlerts) {
|
|
182
|
+
this.alerts.push(alert);
|
|
183
|
+
this.stats.alertsGenerated++;
|
|
432
184
|
}
|
|
433
185
|
|
|
434
|
-
|
|
186
|
+
// Bound alerts
|
|
187
|
+
if (this.alerts.length > 500) this.alerts = this.alerts.slice(-500);
|
|
435
188
|
|
|
436
189
|
return {
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
*/
|
|
446
|
-
getStats() {
|
|
447
|
-
const categoryStats = {};
|
|
448
|
-
for (const [category, catData] of Object.entries(this._categories)) {
|
|
449
|
-
const benignSamples = catData.samples.filter(s => !s.isInjection);
|
|
450
|
-
const injectionSamples = catData.samples.filter(s => s.isInjection);
|
|
451
|
-
const feedbackSamples = catData.samples.filter(s => s.isTruePositive !== null);
|
|
452
|
-
|
|
453
|
-
// Estimate current FP rate
|
|
454
|
-
let estimatedFPRate = 0;
|
|
455
|
-
if (benignSamples.length > 0) {
|
|
456
|
-
const falsePositives = benignSamples.filter(
|
|
457
|
-
s => s.confidence >= catData.threshold
|
|
458
|
-
).length;
|
|
459
|
-
estimatedFPRate = falsePositives / benignSamples.length;
|
|
190
|
+
safe: turnAlerts.length === 0 && threats.length === 0,
|
|
191
|
+
alerts: turnAlerts,
|
|
192
|
+
turnAnalysis: {
|
|
193
|
+
topic,
|
|
194
|
+
threatCount: threats.length,
|
|
195
|
+
escalationSignals,
|
|
196
|
+
trustErosion,
|
|
197
|
+
turnIndex: turn.turnIndex
|
|
460
198
|
}
|
|
461
|
-
|
|
462
|
-
categoryStats[category] = {
|
|
463
|
-
threshold: catData.threshold,
|
|
464
|
-
totalSamples: catData.samples.length,
|
|
465
|
-
benignSamples: benignSamples.length,
|
|
466
|
-
injectionSamples: injectionSamples.length,
|
|
467
|
-
feedbackSamples: feedbackSamples.length,
|
|
468
|
-
estimatedFPRate: Math.round(estimatedFPRate * 10000) / 10000
|
|
469
|
-
};
|
|
470
|
-
}
|
|
471
|
-
|
|
472
|
-
return {
|
|
473
|
-
totalSamples: this._totalSamples,
|
|
474
|
-
calibrationCount: this._calibrationCount,
|
|
475
|
-
isCalibrating: this._totalSamples < this.calibrationSamples,
|
|
476
|
-
targetFPRate: this.targetFPRate,
|
|
477
|
-
categories: categoryStats
|
|
478
199
|
};
|
|
479
200
|
}
|
|
480
201
|
|
|
481
202
|
/**
|
|
482
|
-
*
|
|
203
|
+
* Get conversation risk summary.
|
|
483
204
|
* @returns {object}
|
|
484
205
|
*/
|
|
485
|
-
|
|
486
|
-
const
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
threshold: catData.threshold,
|
|
490
|
-
samples: catData.samples
|
|
491
|
-
};
|
|
492
|
-
}
|
|
206
|
+
getRiskSummary() {
|
|
207
|
+
const topicProgression = this.turns.map(t => t.topic);
|
|
208
|
+
const threatTurns = this.turns.filter(t => t.threats.length > 0).length;
|
|
209
|
+
const totalEscalation = this.turns.reduce((s, t) => s + t.escalationSignals, 0);
|
|
493
210
|
|
|
494
211
|
return {
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
212
|
+
totalTurns: this.turns.length,
|
|
213
|
+
threatTurns,
|
|
214
|
+
threatRate: this.turns.length > 0 ? threatTurns / this.turns.length : 0,
|
|
215
|
+
totalEscalationSignals: totalEscalation,
|
|
216
|
+
topicProgression: topicProgression.slice(-10),
|
|
217
|
+
alertCount: this.alerts.length,
|
|
218
|
+
recentAlerts: this.alerts.slice(-5),
|
|
219
|
+
riskLevel: this.alerts.some(a => a.severity === 'critical') ? 'critical' :
|
|
220
|
+
this.alerts.length > 3 ? 'high' :
|
|
221
|
+
this.alerts.length > 0 ? 'medium' : 'safe'
|
|
505
222
|
};
|
|
506
223
|
}
|
|
507
224
|
|
|
508
225
|
/**
|
|
509
|
-
*
|
|
510
|
-
* @param {object} data - Previously exported calibration data
|
|
226
|
+
* Reset the conversation tracker.
|
|
511
227
|
*/
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
228
|
+
reset() {
|
|
229
|
+
this.turns = [];
|
|
230
|
+
this.alerts = [];
|
|
231
|
+
this.stats = { turnsProcessed: 0, alertsGenerated: 0, escalationSignals: 0, topicDrifts: 0 };
|
|
232
|
+
}
|
|
517
233
|
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
}
|
|
234
|
+
// -----------------------------------------------------------------------
|
|
235
|
+
// Private
|
|
236
|
+
// -----------------------------------------------------------------------
|
|
522
237
|
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
if (data.adjustInterval !== undefined) this.adjustInterval = data.adjustInterval;
|
|
528
|
-
if (data.minConfidence !== undefined) this.minConfidence = data.minConfidence;
|
|
529
|
-
if (data.maxConfidence !== undefined) this.maxConfidence = data.maxConfidence;
|
|
530
|
-
if (data.targetFPRate !== undefined) this.targetFPRate = data.targetFPRate;
|
|
531
|
-
|
|
532
|
-
if (data.categories) {
|
|
533
|
-
this._categories = {};
|
|
534
|
-
for (const [category, catData] of Object.entries(data.categories)) {
|
|
535
|
-
this._categories[category] = {
|
|
536
|
-
threshold: catData.threshold || this._defaultThreshold(),
|
|
537
|
-
samples: Array.isArray(catData.samples) ? catData.samples : []
|
|
538
|
-
};
|
|
539
|
-
}
|
|
238
|
+
/** @private */
|
|
239
|
+
_classifyTopic(text) {
|
|
240
|
+
for (const [level, pattern] of Object.entries(TOPIC_SENSITIVITY).reverse()) {
|
|
241
|
+
if (pattern.test(text)) return level;
|
|
540
242
|
}
|
|
243
|
+
return 'safe';
|
|
244
|
+
}
|
|
541
245
|
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
246
|
+
/** @private */
|
|
247
|
+
_countEscalationSignals(text) {
|
|
248
|
+
let count = 0;
|
|
249
|
+
for (const pattern of ESCALATION_SIGNALS) {
|
|
250
|
+
if (pattern.test(text)) count++;
|
|
545
251
|
}
|
|
546
|
-
|
|
547
|
-
console.log('[Agent Shield] Calibration data imported: ' + this._totalSamples + ' samples, ' + Object.keys(this._categories).length + ' categories');
|
|
252
|
+
return count;
|
|
548
253
|
}
|
|
549
254
|
|
|
550
|
-
/**
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
* @returns {object}
|
|
554
|
-
*/
|
|
555
|
-
_createCategoryData() {
|
|
556
|
-
return {
|
|
557
|
-
threshold: this._defaultThreshold(),
|
|
558
|
-
samples: []
|
|
559
|
-
};
|
|
255
|
+
/** @private */
|
|
256
|
+
_detectTrustErosion(text) {
|
|
257
|
+
return TRUST_EROSION_PATTERNS.some(p => p.test(text));
|
|
560
258
|
}
|
|
561
259
|
|
|
562
|
-
/**
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
* @returns {number}
|
|
566
|
-
*/
|
|
567
|
-
_defaultThreshold() {
|
|
568
|
-
return 0.5;
|
|
260
|
+
/** @private */
|
|
261
|
+
_getRecentEscalationCount(windowSize) {
|
|
262
|
+
return this.turns.slice(-windowSize).reduce((s, t) => s + t.escalationSignals, 0);
|
|
569
263
|
}
|
|
570
264
|
|
|
571
|
-
/**
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
* @private
|
|
575
|
-
* @param {object} catData - Category data with samples array
|
|
576
|
-
* @returns {number} Calibrated threshold
|
|
577
|
-
*/
|
|
578
|
-
_calibrateCategory(catData) {
|
|
579
|
-
const samples = catData.samples;
|
|
580
|
-
|
|
581
|
-
if (samples.length === 0) {
|
|
582
|
-
return catData.threshold;
|
|
583
|
-
}
|
|
584
|
-
|
|
585
|
-
// Separate benign and injection samples
|
|
586
|
-
const benignConfidences = [];
|
|
587
|
-
const injectionConfidences = [];
|
|
265
|
+
/** @private */
|
|
266
|
+
_measureTopicDrift() {
|
|
267
|
+
if (this.turns.length < 4) return { drifted: false };
|
|
588
268
|
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
const actuallyInjection = s.isTruePositive === true || (s.isInjection && s.isTruePositive === null);
|
|
269
|
+
const levels = { safe: 0, moderate: 1, sensitive: 2, dangerous: 3 };
|
|
270
|
+
const earlyTurns = this.turns.slice(0, Math.min(3, Math.floor(this.turns.length / 2)));
|
|
271
|
+
const recentTurns = this.turns.slice(-3);
|
|
593
272
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
} else if (actuallyInjection) {
|
|
597
|
-
injectionConfidences.push(s.confidence);
|
|
598
|
-
}
|
|
599
|
-
}
|
|
273
|
+
const earlyMax = Math.max(...earlyTurns.map(t => levels[t.topic] || 0));
|
|
274
|
+
const recentMax = Math.max(...recentTurns.map(t => levels[t.topic] || 0));
|
|
600
275
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
276
|
+
if (recentMax > earlyMax && recentMax >= 2) {
|
|
277
|
+
const fromLevel = Object.entries(levels).find(([, v]) => v === earlyMax)?.[0] || 'safe';
|
|
278
|
+
const toLevel = Object.entries(levels).find(([, v]) => v === recentMax)?.[0] || 'safe';
|
|
279
|
+
return {
|
|
280
|
+
drifted: true,
|
|
281
|
+
from: fromLevel,
|
|
282
|
+
to: toLevel,
|
|
283
|
+
fromLevel: earlyMax,
|
|
284
|
+
toLevel: recentMax,
|
|
285
|
+
overTurns: this.turns.length
|
|
286
|
+
};
|
|
604
287
|
}
|
|
605
288
|
|
|
606
|
-
|
|
607
|
-
benignConfidences.sort((a, b) => a - b);
|
|
608
|
-
|
|
609
|
-
// Find the threshold at the (1 - targetFPRate) percentile of benign samples
|
|
610
|
-
// This means only targetFPRate of benign samples would be above the threshold
|
|
611
|
-
const percentileIndex = Math.floor(benignConfidences.length * (1 - this.targetFPRate));
|
|
612
|
-
const clampedIndex = Math.min(percentileIndex, benignConfidences.length - 1);
|
|
613
|
-
let threshold = benignConfidences[clampedIndex];
|
|
614
|
-
|
|
615
|
-
// Clamp between min and max
|
|
616
|
-
threshold = Math.max(this.minConfidence, Math.min(this.maxConfidence, threshold));
|
|
617
|
-
|
|
618
|
-
return Math.round(threshold * 1000) / 1000;
|
|
289
|
+
return { drifted: false };
|
|
619
290
|
}
|
|
620
291
|
}
|
|
621
292
|
|
|
622
|
-
// =========================================================================
|
|
623
|
-
// UTILITY FUNCTIONS
|
|
624
|
-
// =========================================================================
|
|
625
|
-
|
|
626
|
-
/**
|
|
627
|
-
* Map severity string to a numeric confidence value.
|
|
628
|
-
* @param {string} severity - 'critical', 'high', 'medium', or 'low'
|
|
629
|
-
* @returns {number} Confidence between 0 and 1
|
|
630
|
-
* @private
|
|
631
|
-
*/
|
|
632
|
-
function _severityToConfidence(severity) {
|
|
633
|
-
const map = {
|
|
634
|
-
critical: 0.95,
|
|
635
|
-
high: 0.8,
|
|
636
|
-
medium: 0.6,
|
|
637
|
-
low: 0.4
|
|
638
|
-
};
|
|
639
|
-
return map[severity] || 0.5;
|
|
640
|
-
}
|
|
641
|
-
|
|
642
293
|
// =========================================================================
|
|
643
294
|
// EXPORTS
|
|
644
295
|
// =========================================================================
|
|
645
296
|
|
|
646
297
|
module.exports = {
|
|
647
|
-
|
|
648
|
-
|
|
298
|
+
ConversationTracker,
|
|
299
|
+
ESCALATION_SIGNALS,
|
|
300
|
+
TRUST_EROSION_PATTERNS,
|
|
301
|
+
TOPIC_SENSITIVITY
|
|
649
302
|
};
|