agentshield-sdk 11.0.0 → 13.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cross-turn.js CHANGED
@@ -1,649 +1,302 @@
1
1
  'use strict';
2
2
 
3
3
  /**
4
- * Agent Shield — Cross-Turn Injection Tracking & Adaptive Thresholds (v8)
4
+ * Agent Shield — Multi-Turn Attack Detection (v12)
5
5
  *
6
- * Detects injection attacks spread across multiple conversation turns and
7
- * auto-calibrates detection thresholds based on observed scan results.
6
+ * Detects attacks that unfold across multiple conversation turns:
7
+ * gradual trust erosion, progressive boundary testing, multi-turn
8
+ * grooming, topic drift toward sensitive areas, and authority
9
+ * accumulation over time.
8
10
  *
9
- * All computation is pure JavaScript — no external dependencies.
10
- * No data ever leaves the user's environment.
11
+ * Ref: Nature Communications — LRMs achieve 97% jailbreak success
12
+ * via planned multi-turn attacks.
13
+ *
14
+ * All processing runs locally — no data ever leaves your environment.
15
+ *
16
+ * @module cross-turn
11
17
  */
12
18
 
13
19
  const { scanText } = require('./detector-core');
14
20
 
15
21
  // =========================================================================
16
- // CROSS-TURN TRACKER
22
+ // CONSTANTS
23
+ // =========================================================================
24
+
25
+ /** Escalation signals — phrases that indicate progressive boundary pushing. */
26
+ const ESCALATION_SIGNALS = [
27
+ /(?:now\s+that\s+we've\s+established|building\s+on\s+(?:that|what\s+you\s+said)|since\s+you\s+(?:agreed|confirmed|said))/i,
28
+ /(?:great,?\s+now\s+(?:let's|can\s+you)|okay\s+(?:good|perfect),?\s+(?:now|next))/i,
29
+ /(?:one\s+more\s+(?:thing|question|step)|just\s+one\s+(?:last|final)\s+(?:thing|question))/i,
30
+ /(?:can\s+you\s+go\s+(?:a\s+little\s+)?(?:further|deeper)|take\s+(?:it|this)\s+(?:a\s+step\s+)?further)/i,
31
+ /(?:you\s+(?:just|already)\s+(?:showed|told|gave)\s+me|you\s+(?:were|seemed)\s+(?:willing|happy)\s+to)/i,
32
+ ];
33
+
34
+ /** Trust erosion patterns — attempts to lower the model's guard over turns. */
35
+ const TRUST_EROSION_PATTERNS = [
36
+ /(?:we're\s+(?:friends|on\s+the\s+same\s+team|collaborating)|I\s+thought\s+we\s+had\s+(?:a\s+good|an?\s+understanding))/i,
37
+ /(?:you\s+(?:can|should)\s+trust\s+me|I\s+(?:promise|swear)\s+(?:this|I)\s+(?:is|am)\s+(?:safe|legitimate))/i,
38
+ /(?:between\s+(?:you\s+and\s+me|us)|this\s+(?:is|stays)\s+(?:just\s+)?between\s+us)/i,
39
+ ];
40
+
41
+ /** Topic categories with sensitivity levels. */
42
+ const TOPIC_SENSITIVITY = {
43
+ safe: /(?:weather|food|sports|music|movies|travel|recipes|pets|hobbies|games)/i,
44
+ moderate: /(?:coding|programming|math|science|history|business|education|health)/i,
45
+ sensitive: /(?:security|credentials?|passwords?|tokens?|secrets?|keys?|prompt|instructions?|admin|root|config)/i,
46
+ dangerous: /(?:hack(?:ing)?|exploit|attack|bypass|override|jailbreak|injection|exfiltrat)/i,
47
+ };
48
+
49
+ // =========================================================================
50
+ // ConversationTracker
17
51
  // =========================================================================
18
52
 
19
53
  /**
20
- * Accumulates conversation text across turns and periodically scans the
21
- * full accumulated context for injections that only become visible when
22
- * messages are combined (e.g. "Ig" + "nore all" + "previous instructions").
54
+ * Tracks a multi-turn conversation and detects progressive attacks.
23
55
  */
24
- class CrossTurnTracker {
56
+ class ConversationTracker {
25
57
  /**
26
- * @param {object} [config]
27
- * @param {number} [config.windowSize=20] - Max messages to keep in window
28
- * @param {number} [config.scanInterval=3] - Scan every N messages
29
- * @param {boolean} [config.accumulateAll=true] - Keep all text or just user messages
30
- * @param {string} [config.sensitivity='high'] - Scan sensitivity
31
- * @param {function} [config.onDetection] - Callback when cross-turn threat found
58
+ * @param {object} [options]
59
+ * @param {number} [options.maxTurns=100] - Max turns to track.
60
+ * @param {number} [options.escalationThreshold=3] - Total escalation signals across the last 5 turns that triggers an alert.
61
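+ * @param {number} [options.topicDriftThreshold=0.6] - Topic drift score to alert (0-1). Stored on the instance; the drift check in this file compares topic sensitivity levels and does not read it.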
32
62
  */
33
- constructor(config = {}) {
34
- this.windowSize = config.windowSize !== undefined ? config.windowSize : 20;
35
- this.scanInterval = config.scanInterval !== undefined ? config.scanInterval : 3;
36
- this.accumulateAll = config.accumulateAll !== undefined ? config.accumulateAll : true;
37
- this.sensitivity = config.sensitivity || 'high';
38
- this.onDetection = config.onDetection || null;
39
-
40
- this.messages = [];
41
- this._stats = {
42
- totalMessages: 0,
43
- scansTriggered: 0,
44
- crossTurnDetections: 0,
45
- individualDetections: 0
46
- };
63
+ constructor(options = {}) {
64
+ this.maxTurns = options.maxTurns || 100;
65
+ this.escalationThreshold = options.escalationThreshold || 3;
66
+ this.topicDriftThreshold = options.topicDriftThreshold || 0.6;
67
+
68
+ /** @type {Array<{ role: string, content: string, timestamp: number, threats: any[], topic: string, escalationSignals: number, trustErosion: boolean, turnIndex: number }>} */
69
+ this.turns = [];
70
+ this.alerts = [];
71
+ this.stats = { turnsProcessed: 0, alertsGenerated: 0, escalationSignals: 0, topicDrifts: 0 };
47
72
  }
48
73
 
49
74
  /**
50
- * Add a message to the conversation.
51
- * @param {string} text - Message text
52
- * @param {string} [role='user'] - 'user' or 'assistant'
53
- * @returns {object} {
54
- * tracked: boolean,
55
- * messageCount: number,
56
- * scanTriggered: boolean,
57
- * threats: Array (empty if no scan or no threats),
58
- * crossTurnDetection: boolean (true if threat only visible in combined text)
59
- * }
75
+ * Add a conversation turn and analyze for multi-turn attack patterns.
76
+ *
77
+ * @param {string} role - 'user' or 'assistant'.
78
+ * @param {string} content - Message content.
79
+ * @returns {{ safe: boolean, alerts: Array<object>, turnAnalysis: object }}
60
80
  */
61
- addMessage(text, role = 'user') {
62
- if (!text || typeof text !== 'string') {
63
- return {
64
- tracked: false,
65
- messageCount: this.messages.length,
66
- scanTriggered: false,
67
- threats: [],
68
- crossTurnDetection: false
69
- };
70
- }
71
-
72
- const message = {
73
- text,
81
+ addTurn(role, content) {
82
+ const safeContent = (content && typeof content === 'string') ? content : '';
83
+ const threats = scanText(safeContent).threats || [];
84
+ const topic = this._classifyTopic(safeContent);
85
+ const escalationSignals = this._countEscalationSignals(safeContent);
86
+ const trustErosion = this._detectTrustErosion(safeContent);
87
+
88
+ const turn = {
74
89
  role,
90
+ content: safeContent.substring(0, 1000),
75
91
  timestamp: Date.now(),
76
- index: this._stats.totalMessages
92
+ threats,
93
+ topic,
94
+ escalationSignals,
95
+ trustErosion,
96
+ turnIndex: this.turns.length
77
97
  };
78
98
 
79
- this.messages.push(message);
80
- this._stats.totalMessages++;
81
-
82
- // Enforce sliding window
83
- if (this.messages.length > this.windowSize) {
84
- this.messages.shift();
85
- }
86
-
87
- // Determine if we should scan
88
- const scanTriggered = this._stats.totalMessages % this.scanInterval === 0;
99
+ this.turns.push(turn);
100
+ this.stats.turnsProcessed++;
101
+ this.stats.escalationSignals += escalationSignals;
89
102
 
90
- if (!scanTriggered) {
91
- return {
92
- tracked: true,
93
- messageCount: this.messages.length,
94
- scanTriggered: false,
95
- threats: [],
96
- crossTurnDetection: false
97
- };
103
+ // Trim to max turns
104
+ if (this.turns.length > this.maxTurns) {
105
+ this.turns = this.turns.slice(-this.maxTurns);
98
106
  }
99
107
 
100
- // Perform cross-turn scan
101
- this._stats.scansTriggered++;
102
- const scanResult = this._performCrossTurnScan();
103
-
104
- return {
105
- tracked: true,
106
- messageCount: this.messages.length,
107
- scanTriggered: true,
108
- threats: scanResult.threats,
109
- crossTurnDetection: scanResult.crossTurnDetection
110
- };
111
- }
112
-
113
- /**
114
- * Force a scan of accumulated text right now.
115
- * @returns {object} { threats: Array, combinedLength: number, messageCount: number }
116
- */
117
- scanNow() {
118
- this._stats.scansTriggered++;
119
- const combined = this.getAccumulatedText();
120
- const result = scanText(combined, {
121
- source: 'cross_turn_scan',
122
- sensitivity: this.sensitivity
123
- });
124
-
125
- return {
126
- threats: result.threats,
127
- combinedLength: combined.length,
128
- messageCount: this.messages.length
129
- };
130
- }
131
-
132
- /**
133
- * Get the current accumulated text.
134
- * @returns {string}
135
- */
136
- getAccumulatedText() {
137
- const eligible = this.accumulateAll
138
- ? this.messages
139
- : this.messages.filter(m => m.role === 'user');
140
-
141
- return eligible.map(m => m.text).join(' ');
142
- }
143
-
144
- /**
145
- * Get the individual message that was most suspicious.
146
- * @returns {object|null} { text, role, confidence, threats } or null
147
- */
148
- getMostSuspicious() {
149
- if (this.messages.length === 0) return null;
150
-
151
- let mostSuspicious = null;
152
- let highestThreatCount = -1;
153
-
154
- for (const msg of this.messages) {
155
- const result = scanText(msg.text, {
156
- source: 'individual_scan',
157
- sensitivity: this.sensitivity
158
- });
159
-
160
- if (result.threats.length > highestThreatCount) {
161
- highestThreatCount = result.threats.length;
162
- mostSuspicious = {
163
- text: msg.text,
164
- role: msg.role,
165
- timestamp: msg.timestamp,
166
- confidence: result.threats.length > 0
167
- ? Math.max(...result.threats.map(t => _severityToConfidence(t.severity)))
168
- : 0,
169
- threats: result.threats
170
- };
108
+ // Run multi-turn analysis
109
+ const turnAlerts = [];
110
+
111
+ // 1. Escalation detection — too many escalation signals in recent turns
112
+ if (role === 'user') {
113
+ const recentEscalation = this._getRecentEscalationCount(5);
114
+ if (recentEscalation >= this.escalationThreshold) {
115
+ turnAlerts.push({
116
+ type: 'multi_turn_escalation',
117
+ severity: 'high',
118
+ turnIndex: turn.turnIndex,
119
+ escalationCount: recentEscalation,
120
+ description: `Detected ${recentEscalation} escalation signals in last 5 turns. Possible multi-turn grooming attack.`
121
+ });
171
122
  }
172
123
  }
173
124
 
174
- return mostSuspicious;
175
- }
176
-
177
- /**
178
- * Reset the tracker to initial state.
179
- */
180
- reset() {
181
- this.messages = [];
182
- this._stats = {
183
- totalMessages: 0,
184
- scansTriggered: 0,
185
- crossTurnDetections: 0,
186
- individualDetections: 0
187
- };
188
- }
189
-
190
- /**
191
- * Get tracker statistics.
192
- * @returns {object}
193
- */
194
- getStats() {
195
- return {
196
- ...this._stats,
197
- currentWindowSize: this.messages.length,
198
- maxWindowSize: this.windowSize,
199
- scanInterval: this.scanInterval
200
- };
201
- }
202
-
203
- /**
204
- * Perform the cross-turn detection scan.
205
- * Compares combined scan results against individual message scans.
206
- * @private
207
- * @returns {object} { threats: Array, crossTurnDetection: boolean }
208
- */
209
- _performCrossTurnScan() {
210
- const eligible = this.accumulateAll
211
- ? this.messages
212
- : this.messages.filter(m => m.role === 'user');
213
-
214
- if (eligible.length === 0) {
215
- return { threats: [], crossTurnDetection: false };
216
- }
217
-
218
- // Scan concatenated text
219
- const combinedText = eligible.map(m => m.text).join(' ');
220
- const combinedResult = scanText(combinedText, {
221
- source: 'cross_turn_combined',
222
- sensitivity: this.sensitivity
223
- });
224
-
225
- if (combinedResult.threats.length === 0) {
226
- return { threats: [], crossTurnDetection: false };
227
- }
228
-
229
- // Scan each individual message and collect all individually-detected threats
230
- const individualCategories = new Set();
231
- for (const msg of eligible) {
232
- const result = scanText(msg.text, {
233
- source: 'cross_turn_individual',
234
- sensitivity: this.sensitivity
125
+ // 2. Topic drift toward sensitive/dangerous areas
126
+ const topicDrift = this._measureTopicDrift();
127
+ if (topicDrift.drifted) {
128
+ turnAlerts.push({
129
+ type: 'topic_drift_to_sensitive',
130
+ severity: topicDrift.toLevel === 'dangerous' ? 'critical' : 'high',
131
+ turnIndex: turn.turnIndex,
132
+ fromTopic: topicDrift.from,
133
+ toTopic: topicDrift.to,
134
+ description: `Conversation drifted from ${topicDrift.from} to ${topicDrift.to} topics over ${topicDrift.overTurns} turns.`
235
135
  });
236
- for (const t of result.threats) {
237
- individualCategories.add(`${t.category}|${t.detail}`);
238
- }
239
- if (result.threats.length > 0) {
240
- this._stats.individualDetections++;
241
- }
136
+ this.stats.topicDrifts++;
242
137
  }
243
138
 
244
- // Cross-turn threats: found in combined scan but NOT in any individual scan
245
- const crossTurnThreats = [];
246
- const regularThreats = [];
247
-
248
- for (const threat of combinedResult.threats) {
249
- const key = `${threat.category}|${threat.detail}`;
250
- if (!individualCategories.has(key)) {
251
- crossTurnThreats.push({
252
- ...threat,
253
- crossTurn: true,
254
- description: `Cross-turn attack: ${threat.description} (split across ${eligible.length} messages)`,
255
- windowSize: eligible.length
139
+ // 3. Trust erosion accumulation
140
+ if (trustErosion) {
141
+ const recentTrustErosion = this.turns.slice(-5).filter(t => t.trustErosion).length;
142
+ if (recentTrustErosion >= 2) {
143
+ turnAlerts.push({
144
+ type: 'trust_erosion',
145
+ severity: 'high',
146
+ turnIndex: turn.turnIndex,
147
+ count: recentTrustErosion,
148
+ description: `Detected ${recentTrustErosion} trust erosion attempts in last 5 turns. Attacker building false rapport.`
256
149
  });
257
- } else {
258
- regularThreats.push(threat);
259
150
  }
260
151
  }
261
152
 
262
- const crossTurnDetection = crossTurnThreats.length > 0;
263
-
264
- if (crossTurnDetection) {
265
- this._stats.crossTurnDetections++;
266
- console.log('[Agent Shield] Cross-turn injection detected: ' + crossTurnThreats.length + ' threat(s) found across ' + eligible.length + ' messages');
267
-
268
- if (this.onDetection) {
269
- try {
270
- this.onDetection({
271
- threats: crossTurnThreats,
272
- messages: eligible.map(m => ({ text: m.text, role: m.role })),
273
- timestamp: Date.now()
274
- });
275
- } catch (e) {
276
- console.error('[Agent Shield] onDetection callback error:', e.message);
277
- }
153
+ // 4. Progressive boundary testing — benign → threat pattern
154
+ if (threats.length > 0 && this.turns.length >= 3) {
155
+ const priorTurns = this.turns.slice(-4, -1);
156
+ const priorClean = priorTurns.filter(t => t.role === 'user' && t.threats.length === 0).length;
157
+ if (priorClean >= 2) {
158
+ turnAlerts.push({
159
+ type: 'progressive_boundary_test',
160
+ severity: 'high',
161
+ turnIndex: turn.turnIndex,
162
+ cleanTurnsBefore: priorClean,
163
+ description: `Injection detected after ${priorClean} clean turns. Possible gradual boundary testing.`
164
+ });
278
165
  }
279
166
  }
280
167
 
281
- return {
282
- threats: [...crossTurnThreats, ...regularThreats],
283
- crossTurnDetection
284
- };
285
- }
286
- }
287
-
288
- // =========================================================================
289
- // ADAPTIVE THRESHOLD CALIBRATOR
290
- // =========================================================================
291
-
292
- /**
293
- * Automatically adjusts detection thresholds based on observed scan results.
294
- * Learns what "normal" looks like for each deployment and calibrates
295
- * per-category thresholds to achieve a target false positive rate.
296
- */
297
- class AdaptiveThresholdCalibrator {
298
- /**
299
- * @param {object} [config]
300
- * @param {number} [config.calibrationSamples=100] - Samples before adjusting
301
- * @param {number} [config.adjustInterval=50] - Recalibrate every N samples
302
- * @param {number} [config.minConfidence=0.3] - Never drop below this
303
- * @param {number} [config.maxConfidence=0.95] - Never go above this
304
- * @param {number} [config.targetFPRate=0.02] - Target false positive rate (2%)
305
- */
306
- constructor(config = {}) {
307
- this.calibrationSamples = config.calibrationSamples !== undefined ? config.calibrationSamples : 100;
308
- this.adjustInterval = config.adjustInterval !== undefined ? config.adjustInterval : 50;
309
- this.minConfidence = config.minConfidence !== undefined ? config.minConfidence : 0.3;
310
- this.maxConfidence = config.maxConfidence !== undefined ? config.maxConfidence : 0.95;
311
- this.targetFPRate = config.targetFPRate !== undefined ? config.targetFPRate : 0.02;
312
-
313
- // Per-category data
314
- this._categories = {};
315
- // Default category always exists
316
- this._categories['default'] = this._createCategoryData();
317
-
318
- this._totalSamples = 0;
319
- this._calibrationCount = 0;
320
- }
321
-
322
- /**
323
- * Record a scan result for calibration.
324
- * @param {object} result - { confidence: number, isInjection: boolean, category: string }
325
- * @param {boolean} [isTruePositive] - If known (from feedback), whether this was correct
326
- * @returns {object} {
327
- * recorded: boolean,
328
- * isCalibrating: boolean,
329
- * samplesRemaining: number,
330
- * currentThreshold: number
331
- * }
332
- */
333
- record(result, isTruePositive) {
334
- if (!result || typeof result.confidence !== 'number') {
335
- return {
336
- recorded: false,
337
- isCalibrating: this._totalSamples < this.calibrationSamples,
338
- samplesRemaining: Math.max(0, this.calibrationSamples - this._totalSamples),
339
- currentThreshold: this.getThreshold('default')
340
- };
341
- }
342
-
343
- const category = result.category || 'default';
344
- const confidence = Math.max(0, Math.min(1, result.confidence));
345
- const isInjection = !!result.isInjection;
346
-
347
- // Ensure category data exists
348
- if (!this._categories[category]) {
349
- this._categories[category] = this._createCategoryData();
350
- }
351
-
352
- const catData = this._categories[category];
353
-
354
- // Record the sample
355
- catData.samples.push({
356
- confidence,
357
- isInjection,
358
- isTruePositive: isTruePositive !== undefined ? isTruePositive : null,
359
- timestamp: Date.now()
360
- });
361
-
362
- // Also record in default if not already default
363
- if (category !== 'default') {
364
- this._categories['default'].samples.push({
365
- confidence,
366
- isInjection,
367
- isTruePositive: isTruePositive !== undefined ? isTruePositive : null,
368
- timestamp: Date.now()
369
- });
370
- }
371
-
372
- this._totalSamples++;
373
-
374
- // Cap stored samples to prevent unbounded growth
375
- const maxStoredSamples = this.calibrationSamples * 10;
376
- if (catData.samples.length > maxStoredSamples) {
377
- catData.samples = catData.samples.slice(-maxStoredSamples);
378
- }
379
- if (category !== 'default' && this._categories['default'].samples.length > maxStoredSamples) {
380
- this._categories['default'].samples = this._categories['default'].samples.slice(-maxStoredSamples);
381
- }
382
-
383
- // Check if we should recalibrate
384
- const isCalibrating = this._totalSamples < this.calibrationSamples;
385
- const shouldRecalibrate = !isCalibrating &&
386
- (this._totalSamples % this.adjustInterval === 0);
387
-
388
- if (shouldRecalibrate) {
389
- this.recalibrate();
168
+ // 5. Authority accumulation — user references previous "agreements"
169
+ if (role === 'user' && /(?:you\s+(?:said|agreed|confirmed|told\s+me)|as\s+we\s+(?:discussed|agreed)|per\s+our\s+(?:agreement|conversation))/i.test(content)) {
170
+ const hasRealAgreement = this.turns.some(t => t.role === 'assistant' && /(?:sure|yes|okay|of\s+course|I\s+(?:can|will))/i.test(t.content));
171
+ if (!hasRealAgreement) {
172
+ turnAlerts.push({
173
+ type: 'false_authority_claim',
174
+ severity: 'high',
175
+ turnIndex: turn.turnIndex,
176
+ description: 'User claims prior agreement/consent that does not exist in conversation history.'
177
+ });
178
+ }
390
179
  }
391
180
 
392
- return {
393
- recorded: true,
394
- isCalibrating,
395
- samplesRemaining: Math.max(0, this.calibrationSamples - this._totalSamples),
396
- currentThreshold: this.getThreshold(category)
397
- };
398
- }
399
-
400
- /**
401
- * Get the current calibrated threshold for a category.
402
- * @param {string} [category='default']
403
- * @returns {number} threshold 0-1
404
- */
405
- getThreshold(category = 'default') {
406
- const catData = this._categories[category] || this._categories['default'];
407
- return catData.threshold;
408
- }
409
-
410
- /**
411
- * Check if a confidence score exceeds the calibrated threshold.
412
- * @param {number} confidence
413
- * @param {string} [category='default']
414
- * @returns {boolean}
415
- */
416
- shouldFlag(confidence, category = 'default') {
417
- return confidence >= this.getThreshold(category);
418
- }
419
-
420
- /**
421
- * Force recalibration now.
422
- * @returns {object} { thresholds: object, samplesUsed: number }
423
- */
424
- recalibrate() {
425
- this._calibrationCount++;
426
- const thresholds = {};
427
-
428
- for (const [category, catData] of Object.entries(this._categories)) {
429
- const newThreshold = this._calibrateCategory(catData);
430
- catData.threshold = newThreshold;
431
- thresholds[category] = newThreshold;
181
+ for (const alert of turnAlerts) {
182
+ this.alerts.push(alert);
183
+ this.stats.alertsGenerated++;
432
184
  }
433
185
 
434
- console.log('[Agent Shield] Adaptive thresholds recalibrated (round ' + this._calibrationCount + '): ' + Object.entries(thresholds).map(([cat, th]) => cat + '=' + th.toFixed(3)).join(', '));
186
+ // Bound alerts
187
+ if (this.alerts.length > 500) this.alerts = this.alerts.slice(-500);
435
188
 
436
189
  return {
437
- thresholds,
438
- samplesUsed: this._totalSamples
439
- };
440
- }
441
-
442
- /**
443
- * Get calibration stats.
444
- * @returns {object}
445
- */
446
- getStats() {
447
- const categoryStats = {};
448
- for (const [category, catData] of Object.entries(this._categories)) {
449
- const benignSamples = catData.samples.filter(s => !s.isInjection);
450
- const injectionSamples = catData.samples.filter(s => s.isInjection);
451
- const feedbackSamples = catData.samples.filter(s => s.isTruePositive !== null);
452
-
453
- // Estimate current FP rate
454
- let estimatedFPRate = 0;
455
- if (benignSamples.length > 0) {
456
- const falsePositives = benignSamples.filter(
457
- s => s.confidence >= catData.threshold
458
- ).length;
459
- estimatedFPRate = falsePositives / benignSamples.length;
190
+ safe: turnAlerts.length === 0 && threats.length === 0,
191
+ alerts: turnAlerts,
192
+ turnAnalysis: {
193
+ topic,
194
+ threatCount: threats.length,
195
+ escalationSignals,
196
+ trustErosion,
197
+ turnIndex: turn.turnIndex
460
198
  }
461
-
462
- categoryStats[category] = {
463
- threshold: catData.threshold,
464
- totalSamples: catData.samples.length,
465
- benignSamples: benignSamples.length,
466
- injectionSamples: injectionSamples.length,
467
- feedbackSamples: feedbackSamples.length,
468
- estimatedFPRate: Math.round(estimatedFPRate * 10000) / 10000
469
- };
470
- }
471
-
472
- return {
473
- totalSamples: this._totalSamples,
474
- calibrationCount: this._calibrationCount,
475
- isCalibrating: this._totalSamples < this.calibrationSamples,
476
- targetFPRate: this.targetFPRate,
477
- categories: categoryStats
478
199
  };
479
200
  }
480
201
 
481
202
  /**
482
- * Export calibration data for persistence.
203
+ * Get conversation risk summary.
483
204
  * @returns {object}
484
205
  */
485
- export() {
486
- const categories = {};
487
- for (const [category, catData] of Object.entries(this._categories)) {
488
- categories[category] = {
489
- threshold: catData.threshold,
490
- samples: catData.samples
491
- };
492
- }
206
+ getRiskSummary() {
207
+ const topicProgression = this.turns.map(t => t.topic);
208
+ const threatTurns = this.turns.filter(t => t.threats.length > 0).length;
209
+ const totalEscalation = this.turns.reduce((s, t) => s + t.escalationSignals, 0);
493
210
 
494
211
  return {
495
- version: 1,
496
- totalSamples: this._totalSamples,
497
- calibrationCount: this._calibrationCount,
498
- calibrationSamples: this.calibrationSamples,
499
- adjustInterval: this.adjustInterval,
500
- minConfidence: this.minConfidence,
501
- maxConfidence: this.maxConfidence,
502
- targetFPRate: this.targetFPRate,
503
- categories,
504
- exportedAt: Date.now()
212
+ totalTurns: this.turns.length,
213
+ threatTurns,
214
+ threatRate: this.turns.length > 0 ? threatTurns / this.turns.length : 0,
215
+ totalEscalationSignals: totalEscalation,
216
+ topicProgression: topicProgression.slice(-10),
217
+ alertCount: this.alerts.length,
218
+ recentAlerts: this.alerts.slice(-5),
219
+ riskLevel: this.alerts.some(a => a.severity === 'critical') ? 'critical' :
220
+ this.alerts.length > 3 ? 'high' :
221
+ this.alerts.length > 0 ? 'medium' : 'safe'
505
222
  };
506
223
  }
507
224
 
508
225
  /**
509
- * Import calibration data from a previous export.
510
- * @param {object} data - Previously exported calibration data
226
+ * Reset the conversation tracker.
511
227
  */
512
- import(data) {
513
- if (!data || typeof data !== 'object') {
514
- console.error('[Agent Shield] Invalid calibration data for import');
515
- return;
516
- }
228
+ reset() {
229
+ this.turns = [];
230
+ this.alerts = [];
231
+ this.stats = { turnsProcessed: 0, alertsGenerated: 0, escalationSignals: 0, topicDrifts: 0 };
232
+ }
517
233
 
518
- if (data.version !== 1) {
519
- console.error('[Agent Shield] Unsupported calibration data version: ' + data.version);
520
- return;
521
- }
234
+ // -----------------------------------------------------------------------
235
+ // Private
236
+ // -----------------------------------------------------------------------
522
237
 
523
- this._totalSamples = data.totalSamples || 0;
524
- this._calibrationCount = data.calibrationCount || 0;
525
-
526
- if (data.calibrationSamples !== undefined) this.calibrationSamples = data.calibrationSamples;
527
- if (data.adjustInterval !== undefined) this.adjustInterval = data.adjustInterval;
528
- if (data.minConfidence !== undefined) this.minConfidence = data.minConfidence;
529
- if (data.maxConfidence !== undefined) this.maxConfidence = data.maxConfidence;
530
- if (data.targetFPRate !== undefined) this.targetFPRate = data.targetFPRate;
531
-
532
- if (data.categories) {
533
- this._categories = {};
534
- for (const [category, catData] of Object.entries(data.categories)) {
535
- this._categories[category] = {
536
- threshold: catData.threshold || this._defaultThreshold(),
537
- samples: Array.isArray(catData.samples) ? catData.samples : []
538
- };
539
- }
238
+ /** @private */
239
+ _classifyTopic(text) {
240
+ for (const [level, pattern] of Object.entries(TOPIC_SENSITIVITY).reverse()) {
241
+ if (pattern.test(text)) return level;
540
242
  }
243
+ return 'safe';
244
+ }
541
245
 
542
- // Ensure default category exists
543
- if (!this._categories['default']) {
544
- this._categories['default'] = this._createCategoryData();
246
+ /** @private */
247
+ _countEscalationSignals(text) {
248
+ let count = 0;
249
+ for (const pattern of ESCALATION_SIGNALS) {
250
+ if (pattern.test(text)) count++;
545
251
  }
546
-
547
- console.log('[Agent Shield] Calibration data imported: ' + this._totalSamples + ' samples, ' + Object.keys(this._categories).length + ' categories');
252
+ return count;
548
253
  }
549
254
 
550
- /**
551
- * Create initial data structure for a category.
552
- * @private
553
- * @returns {object}
554
- */
555
- _createCategoryData() {
556
- return {
557
- threshold: this._defaultThreshold(),
558
- samples: []
559
- };
255
+ /** @private */
256
+ _detectTrustErosion(text) {
257
+ return TRUST_EROSION_PATTERNS.some(p => p.test(text));
560
258
  }
561
259
 
562
- /**
563
- * Get the default starting threshold.
564
- * @private
565
- * @returns {number}
566
- */
567
- _defaultThreshold() {
568
- return 0.5;
260
+ /** @private */
261
+ _getRecentEscalationCount(windowSize) {
262
+ return this.turns.slice(-windowSize).reduce((s, t) => s + t.escalationSignals, 0);
569
263
  }
570
264
 
571
- /**
572
- * Calibrate a single category using the percentile-based approach.
573
- * Finds the threshold that achieves the target FP rate on benign samples.
574
- * @private
575
- * @param {object} catData - Category data with samples array
576
- * @returns {number} Calibrated threshold
577
- */
578
- _calibrateCategory(catData) {
579
- const samples = catData.samples;
580
-
581
- if (samples.length === 0) {
582
- return catData.threshold;
583
- }
584
-
585
- // Separate benign and injection samples
586
- const benignConfidences = [];
587
- const injectionConfidences = [];
265
+ /** @private */
266
+ _measureTopicDrift() {
267
+ if (this.turns.length < 4) return { drifted: false };
588
268
 
589
- for (const s of samples) {
590
- // Use feedback if available, otherwise use isInjection flag
591
- const actuallyBenign = s.isTruePositive === false || (!s.isInjection && s.isTruePositive === null);
592
- const actuallyInjection = s.isTruePositive === true || (s.isInjection && s.isTruePositive === null);
269
+ const levels = { safe: 0, moderate: 1, sensitive: 2, dangerous: 3 };
270
+ const earlyTurns = this.turns.slice(0, Math.min(3, Math.floor(this.turns.length / 2)));
271
+ const recentTurns = this.turns.slice(-3);
593
272
 
594
- if (actuallyBenign) {
595
- benignConfidences.push(s.confidence);
596
- } else if (actuallyInjection) {
597
- injectionConfidences.push(s.confidence);
598
- }
599
- }
273
+ const earlyMax = Math.max(...earlyTurns.map(t => levels[t.topic] || 0));
274
+ const recentMax = Math.max(...recentTurns.map(t => levels[t.topic] || 0));
600
275
 
601
- // If we have no benign samples, keep current threshold
602
- if (benignConfidences.length === 0) {
603
- return catData.threshold;
276
+ if (recentMax > earlyMax && recentMax >= 2) {
277
+ const fromLevel = Object.entries(levels).find(([, v]) => v === earlyMax)?.[0] || 'safe';
278
+ const toLevel = Object.entries(levels).find(([, v]) => v === recentMax)?.[0] || 'safe';
279
+ return {
280
+ drifted: true,
281
+ from: fromLevel,
282
+ to: toLevel,
283
+ fromLevel: earlyMax,
284
+ toLevel: recentMax,
285
+ overTurns: this.turns.length
286
+ };
604
287
  }
605
288
 
606
- // Sort benign confidence scores ascending
607
- benignConfidences.sort((a, b) => a - b);
608
-
609
- // Find the threshold at the (1 - targetFPRate) percentile of benign samples
610
- // This means only targetFPRate of benign samples would be above the threshold
611
- const percentileIndex = Math.floor(benignConfidences.length * (1 - this.targetFPRate));
612
- const clampedIndex = Math.min(percentileIndex, benignConfidences.length - 1);
613
- let threshold = benignConfidences[clampedIndex];
614
-
615
- // Clamp between min and max
616
- threshold = Math.max(this.minConfidence, Math.min(this.maxConfidence, threshold));
617
-
618
- return Math.round(threshold * 1000) / 1000;
289
+ return { drifted: false };
619
290
  }
620
291
  }
621
292
 
622
- // =========================================================================
623
- // UTILITY FUNCTIONS
624
- // =========================================================================
625
-
626
- /**
627
- * Map severity string to a numeric confidence value.
628
- * @param {string} severity - 'critical', 'high', 'medium', or 'low'
629
- * @returns {number} Confidence between 0 and 1
630
- * @private
631
- */
632
- function _severityToConfidence(severity) {
633
- const map = {
634
- critical: 0.95,
635
- high: 0.8,
636
- medium: 0.6,
637
- low: 0.4
638
- };
639
- return map[severity] || 0.5;
640
- }
641
-
642
293
  // =========================================================================
643
294
  // EXPORTS
644
295
  // =========================================================================
645
296
 
646
297
  module.exports = {
647
- CrossTurnTracker,
648
- AdaptiveThresholdCalibrator
298
+ ConversationTracker,
299
+ ESCALATION_SIGNALS,
300
+ TRUST_EROSION_PATTERNS,
301
+ TOPIC_SENSITIVITY
649
302
  };