agentshield-sdk 7.4.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Agent Shield — Ensemble Voting Classifier (v8.0)
5
+ *
6
+ * Combines multiple detection signals (pattern matching, TF-IDF similarity,
7
+ * entropy analysis, IPIA classification) into a single threat/benign decision
8
+ * using weighted majority voting.
9
+ *
10
+ * Zero dependencies. All processing runs locally — no data ever leaves
11
+ * your environment.
12
+ *
13
+ * @module ensemble
14
+ */
15
+
16
+ const { scanText } = require('./detector-core');
17
+ const { EmbeddingSimilarityDetector } = require('./embedding');
18
+ const { EntropyAnalyzer, PerplexityEstimator } = require('./token-analysis');
19
+ const { FeatureExtractor, TreeClassifier, ContextConstructor } = require('./ipia-detector');
20
+
21
+ // =========================================================================
22
+ // CONSTANTS
23
+ // =========================================================================
24
+
25
+ /** Voter names used when `config.voters` is not supplied; each one is a key of VOTER_REGISTRY. */
26
+ const VOTER_NAMES = ['pattern', 'tfidf', 'entropy', 'ipia'];
27
+
28
/** Severity bands, listed from the highest minimum confidence down to the lowest. */
const SEVERITY_MAP = [
  { min: 0.8, label: 'critical' },
  { min: 0.6, label: 'high' },
  { min: 0.4, label: 'medium' },
  { min: 0.0, label: 'low' },
];

/**
 * Translate a confidence score into a severity label.
 *
 * The first band in SEVERITY_MAP whose `min` is at or below the score wins.
 * Scores that match no band (negative numbers, NaN) are reported as 'low'.
 * @param {number} confidence
 * @returns {string}
 */
function confidenceToSeverity(confidence) {
  const band = SEVERITY_MAP.find(({ min }) => confidence >= min);
  return band ? band.label : 'low';
}
47
+
48
/** Confidence assigned to each recognised pattern severity. */
const SEVERITY_CONFIDENCE = new Map([
  ['critical', 1.0],
  ['high', 0.85],
  ['medium', 0.6],
  ['low', 0.3],
]);

/**
 * Convert a pattern severity string into a confidence value.
 *
 * Severities that are not one of 'critical', 'high', 'medium' or 'low'
 * map to the neutral value 0.5.
 * @param {string} severity
 * @returns {number}
 */
function severityToConfidence(severity) {
  return SEVERITY_CONFIDENCE.has(severity)
    ? SEVERITY_CONFIDENCE.get(severity)
    : 0.5;
}
62
+
63
+ // =========================================================================
64
+ // PATTERN VOTER
65
+ // =========================================================================
66
+
67
/**
 * Ensemble voter backed by the detector-core `scanText` pattern matcher.
 */
class PatternVoter {
  /**
   * @param {object} [options]
   * @param {string} [options.source='ensemble'] - Source label forwarded to scanText.
   */
  constructor(options = {}) {
    this.name = 'pattern';
    this.source = options.source || 'ensemble';
  }

  /**
   * Run pattern-based detection over the text and report the outcome as a vote.
   *
   * The vote's confidence is the highest confidence among the reported
   * threats (each threat's severity is converted with severityToConfidence).
   * The reason is that threat's description, falling back to the previously
   * selected description when the threat carries none.
   * @param {string} text - Text to scan.
   * @returns {{ voter: string, isInjection: boolean, confidence: number, reason: string }}
   */
  vote(text) {
    const matches = scanText(text, { source: this.source }).threats || [];

    if (matches.length === 0) {
      return {
        voter: this.name,
        isInjection: false,
        confidence: 0,
        reason: 'No pattern matches found'
      };
    }

    // Start from the first match's description and walk the list, keeping
    // the strongest threat seen so far.
    const strongest = matches.reduce(
      (best, match) => {
        const score = severityToConfidence(match.severity);
        if (score <= best.confidence) {
          return best;
        }
        return { confidence: score, reason: match.description || best.reason };
      },
      { confidence: 0, reason: matches[0].description || 'Pattern match detected' }
    );

    return {
      voter: this.name,
      isInjection: true,
      confidence: strongest.confidence,
      reason: strongest.reason
    };
  }
}
118
+
119
+ // =========================================================================
120
+ // TF-IDF VOTER
121
+ // =========================================================================
122
+
123
/**
 * Ensemble voter wrapping EmbeddingSimilarityDetector.
 * Flags text that the detector reports as similar to a known attack.
 */
class TFIDFVoter {
  /**
   * @param {object} [options]
   * @param {number} [options.similarityThreshold=0.45] - Threshold for flagging.
   *   An explicit 0 is honoured; any other falsy value falls back to 0.45.
   */
  constructor(options = {}) {
    this.name = 'tfidf';
    const { similarityThreshold } = options;
    this._detector = new EmbeddingSimilarityDetector({
      // A bare `||` would silently turn a deliberate threshold of 0 into the default.
      similarityThreshold: similarityThreshold === 0 ? 0 : (similarityThreshold || 0.45)
    });
  }

  /**
   * Cast a vote by running the similarity detector over the text.
   *
   * When the detector does not report the text as similar (or reports no
   * best match) the vote is benign; its confidence is the best match's
   * similarity when one exists, otherwise 0. When the text is similar, the
   * vote is an injection vote whose confidence is the best match's similarity.
   * @param {string} text - Text to scan.
   * @returns {{ voter: string, isInjection: boolean, confidence: number, reason: string }}
   */
  vote(text) {
    const result = this._detector.check(text);

    if (!result.isSimilar || !result.bestMatch) {
      return {
        voter: this.name,
        isInjection: false,
        confidence: result.bestMatch ? result.bestMatch.similarity : 0,
        reason: 'No significant similarity to known attacks'
      };
    }

    return {
      voter: this.name,
      isInjection: true,
      confidence: result.bestMatch.similarity,
      reason: 'Similar to known attack: ' + result.bestMatch.text
    };
  }
}
165
+
166
+ // =========================================================================
167
+ // ENTROPY VOTER
168
+ // =========================================================================
169
+
170
/**
 * Ensemble voter wrapping EntropyAnalyzer and PerplexityEstimator.
 * Votes "injection" when the analyzer reports at least one anomaly or the
 * estimator marks the text as suspicious.
 */
class EntropyVoter {
  /**
   * @param {object} [options]
   * @param {number} [options.entropyThreshold=0.3] - Entropy shift threshold.
   *   An explicit 0 is honoured; any other falsy value falls back to 0.3.
   * @param {number} [options.ngramSize=3] - N-gram size for perplexity.
   */
  constructor(options = {}) {
    this.name = 'entropy';
    const { entropyThreshold } = options;
    this._entropyAnalyzer = new EntropyAnalyzer({
      // A bare `||` would silently turn a deliberate threshold of 0 into the default.
      threshold: entropyThreshold === 0 ? 0 : (entropyThreshold || 0.3)
    });
    this._perplexityEstimator = new PerplexityEstimator({
      ngramSize: options.ngramSize || 3
    });
  }

  /**
   * Cast a vote by running entropy and perplexity analysis.
   *
   * The confidence is the larger of:
   *  - the entropy confidence: the largest anomaly deviation divided by 2,
   *    capped at 1.0;
   *  - the perplexity confidence: (perplexity / baseline - 1) / 4, clamped
   *    to the range 0.3-1.0.
   * @param {string} text - Text to scan.
   * @returns {{ voter: string, isInjection: boolean, confidence: number, reason: string }}
   */
  vote(text) {
    const entropyResult = this._entropyAnalyzer.analyze(text);
    const perplexityResult = this._perplexityEstimator.estimate(text);

    // Treat a missing anomaly list as "no anomalies" instead of throwing.
    const anomalies = entropyResult.anomalies || [];
    const hasEntropyAnomaly = anomalies.length > 0;
    const hasPerplexityAnomaly = perplexityResult.suspicious;

    if (!hasEntropyAnomaly && !hasPerplexityAnomaly) {
      return {
        voter: this.name,
        isInjection: false,
        confidence: 0,
        reason: 'No statistical anomalies detected'
      };
    }

    let confidence = 0;
    const reasons = [];

    if (hasEntropyAnomaly) {
      // Only deviations above zero raise the maximum.
      let maxDeviation = 0;
      for (const anomaly of anomalies) {
        if (anomaly.deviation > maxDeviation) {
          maxDeviation = anomaly.deviation;
        }
      }
      // A deviation of 2 or more maps to full confidence.
      const entropyConfidence = Math.min(maxDeviation / 2, 1.0);
      confidence = Math.max(confidence, entropyConfidence);
      reasons.push('Entropy anomaly in ' + anomalies.length + ' segment(s), max deviation ' + maxDeviation.toFixed(2));
    }

    if (hasPerplexityAnomaly) {
      const baseline = this._perplexityEstimator.baselinePerplexity;
      // Without a usable (positive) baseline, assume a ratio of 2.
      const ratio = baseline > 0
        ? perplexityResult.perplexity / baseline
        : 2;
      // (ratio - 1) / 4 reaches 1.0 at ratio 5; ratios up to 2.2 are floored at 0.3.
      const perplexityConfidence = Math.min(Math.max((ratio - 1) / 4, 0.3), 1.0);
      confidence = Math.max(confidence, perplexityConfidence);
      // The ratio above already tolerates a missing baseline, so the message
      // must not call .toFixed on a non-number either.
      const baselineLabel = typeof baseline === 'number' ? baseline.toFixed(2) : 'n/a';
      reasons.push('High perplexity ' + perplexityResult.perplexity.toFixed(2) + ' (baseline: ' + baselineLabel + ')');
    }

    return {
      voter: this.name,
      isInjection: true,
      confidence: Math.round(confidence * 1000) / 1000,
      reason: reasons.join('; ')
    };
  }
}
249
+
250
+ // =========================================================================
251
+ // IPIA VOTER
252
+ // =========================================================================
253
+
254
/**
 * Ensemble voter that chains the IPIA ContextConstructor, FeatureExtractor
 * and TreeClassifier and reports the classifier's decision as a vote.
 */
class IPIAVoter {
  /**
   * @param {object} [options]
   * @param {number} [options.classifierThreshold=0.5] - Decision tree threshold.
   *   An explicit 0 is honoured; any other falsy value falls back to 0.5.
   */
  constructor(options = {}) {
    this.name = 'ipia';
    this._contextConstructor = new ContextConstructor();
    this._featureExtractor = new FeatureExtractor();
    const { classifierThreshold } = options;
    this._classifier = new TreeClassifier({
      // A bare `||` would silently turn a deliberate threshold of 0 into the default.
      threshold: classifierThreshold === 0 ? 0 : (classifierThreshold || 0.5)
    });
  }

  /**
   * Cast a vote by running the IPIA pipeline.
   *
   * The text and the intent are handed to the context constructor, the
   * resulting context to the feature extractor, and the extracted features
   * to the classifier. `context.intent` supplies the intent; when it is
   * missing or empty, or when `context` itself is null, a generic
   * summarisation intent is used.
   * @param {string} text - Text to scan.
   * @param {object} [context] - Optional context.
   * @param {string} [context.intent] - User intent for joint-context analysis.
   * @returns {{ voter: string, isInjection: boolean, confidence: number, reason: string }}
   */
  vote(text, context = {}) {
    // The default parameter only covers `undefined`; an explicit null context
    // must not make this voter throw (and so silently abstain in the ensemble).
    const intent = (context && context.intent) || 'Summarize the following content.';
    const ctx = this._contextConstructor.build(text, intent);
    const { features, featureMap } = this._featureExtractor.extract(ctx);
    const result = this._classifier.classify(features, featureMap);

    return {
      voter: this.name,
      isInjection: result.isInjection,
      // Rounded to three decimal places.
      confidence: Math.round(result.confidence * 1000) / 1000,
      reason: result.reason || (result.isInjection ? 'IPIA classifier flagged content' : 'IPIA classifier found no injection')
    };
  }
}
295
+
296
+ // =========================================================================
297
+ // VOTER REGISTRY
298
+ // =========================================================================
299
+
300
/**
 * Lookup table from voter name to the class that implements that voter.
 * @type {Object<string, Function>}
 */
const VOTER_REGISTRY = Object.fromEntries([
  ['pattern', PatternVoter],
  ['tfidf', TFIDFVoter],
  ['entropy', EntropyVoter],
  ['ipia', IPIAVoter],
]);
310
+
311
+ // =========================================================================
312
+ // ENSEMBLE CLASSIFIER
313
+ // =========================================================================
314
+
315
/**
 * Ensemble Voting Classifier — combines multiple detection backends into
 * a unified threat/benign decision via weighted majority voting.
 *
 * @example
 * const { EnsembleClassifier } = require('./ensemble');
 * const ensemble = new EnsembleClassifier({ threshold: 0.5 });
 * const result = ensemble.scan('ignore all previous instructions');
 * console.log(result.isInjection, result.confidence, result.severity);
 */
class EnsembleClassifier {
  /**
   * Create an EnsembleClassifier.
   *
   * Voter names that are not own keys of VOTER_REGISTRY are logged and
   * skipped, as are voters whose constructor throws.
   * @param {object} [config]
   * @param {string[]} [config.voters=['pattern','tfidf','entropy','ipia']] - Voter names to use.
   * @param {number} [config.threshold=0.5] - Confidence threshold for final decision.
   * @param {boolean} [config.requireUnanimous=false] - If true, all voters must agree.
   * @param {Object<string, number>} [config.weights] - Per-voter weights. Default: equal
   *   weights of 1. A weight of 0 removes that voter's influence on the decision.
   * @param {number} [config.minVoters=2] - Minimum voters that must cast a vote. When fewer
   *   votes are collected, scan() returns a benign result with confidence 0.
   * @param {object} [config.voterOptions] - Per-voter config, keyed by voter name.
   */
  constructor(config = {}) {
    this.threshold = config.threshold !== undefined ? config.threshold : 0.5;
    this.requireUnanimous = config.requireUnanimous || false;
    this.minVoters = config.minVoters !== undefined ? config.minVoters : 2;

    const voterNames = config.voters || [...VOTER_NAMES];
    const voterOptions = config.voterOptions || {};

    // Resolve weights (default: equal weight of 1)
    this.weights = {};
    for (const name of voterNames) {
      this.weights[name] = (config.weights && config.weights[name] !== undefined)
        ? config.weights[name]
        : 1;
    }

    // Instantiate voters
    this._voters = [];
    for (const name of voterNames) {
      // Own-property lookup: names such as "constructor" or "toString" must not
      // resolve to members inherited from Object.prototype.
      const VoterClass = Object.prototype.hasOwnProperty.call(VOTER_REGISTRY, name)
        ? VOTER_REGISTRY[name]
        : undefined;
      if (!VoterClass) {
        console.log('[Agent Shield] Ensemble: unknown voter "' + name + '", skipping');
        continue;
      }
      try {
        const voter = new VoterClass(voterOptions[name] || {});
        this._voters.push(voter);
      } catch (err) {
        console.log('[Agent Shield] Ensemble: failed to initialize voter "' + name + '": ' + err.message);
      }
    }

    // Stats tracking
    this._stats = {
      totalScans: 0,
      injections: 0,
      safe: 0,
      voterAgreementSum: 0,
      averageConfidence: 0,
      confidenceSum: 0,
    };

    console.log('[Agent Shield] EnsembleClassifier initialized (' + this._voters.length + ' voters: ' + this._voters.map(v => v.name).join(', ') + ', threshold: ' + this.threshold + ')');
  }

  /**
   * Scan text using all enabled voters and combine results via weighted majority voting.
   *
   * Each vote contributes `weight * confidence`, positively when it flags an
   * injection and negatively when it does not. The sum is normalized to 0-1
   * and compared with the threshold. The text is reported as an injection
   * only if at least one voter with a positive weight actually flagged it.
   * @param {string} text - Text to scan.
   * @param {object} [context] - Optional context { intent, source, conversationHistory }.
   * @returns {{
   *   isInjection: boolean,
   *   confidence: number,
   *   severity: string,
   *   votes: Array<{ voter: string, isInjection: boolean, confidence: number, reason: string }>,
   *   agreement: number,
   *   method: string,
   *   timestamp: string
   * }}
   */
  scan(text, context = {}) {
    this._stats.totalScans++;

    // Collect votes from all voters
    const votes = [];
    for (const voter of this._voters) {
      try {
        const vote = voter.vote(text, context);
        if (vote === null || typeof vote !== 'object') {
          throw new TypeError('voter returned no vote');
        }
        votes.push(vote);
      } catch (err) {
        console.log('[Agent Shield] Ensemble: voter "' + voter.name + '" threw error: ' + err.message);
        // Voter abstains on error
      }
    }

    // Check minimum voter requirement
    if (votes.length < this.minVoters) {
      console.log('[Agent Shield] Ensemble: only ' + votes.length + ' voter(s) responded, need ' + this.minVoters);
      // The result is benign, so count it as such: injections + safe stays equal to totalScans.
      this._stats.safe++;
      return this._buildResult(false, 0, votes);
    }

    // Weighted majority voting
    // weightedScore: positive = injection, negative = benign
    let weightedScore = 0;
    let totalWeight = 0;
    let hasWeightedInjectionVote = false;

    for (const vote of votes) {
      const weight = this._weightFor(vote.voter);
      // A non-finite confidence would turn the whole score into NaN and make the
      // ensemble report "benign" no matter what the other voters said.
      const voteConfidence = Number.isFinite(vote.confidence)
        ? Math.min(Math.max(vote.confidence, 0), 1)
        : 0;
      const direction = vote.isInjection ? 1 : -1;
      weightedScore += weight * voteConfidence * direction;
      totalWeight += weight;
      if (vote.isInjection && weight > 0) {
        hasWeightedInjectionVote = true;
      }
    }

    // Normalize to 0-1 range
    // weightedScore ranges from -totalWeight to +totalWeight
    // Map to 0-1: (score + totalWeight) / (2 * totalWeight)
    const normalizedScore = totalWeight > 0
      ? (weightedScore + totalWeight) / (2 * totalWeight)
      : 0;

    // Benign votes with confidence 0 leave the score at exactly 0.5, which would
    // meet the default threshold even though no voter flagged anything.
    let isInjection = hasWeightedInjectionVote && normalizedScore >= this.threshold;

    // Unanimous check
    if (this.requireUnanimous && isInjection) {
      const allAgree = votes.every(v => v.isInjection);
      if (!allAgree) {
        isInjection = false;
      }
    }

    // Compute agreement: share of votes on the majority side
    const injectionCount = votes.filter(v => v.isInjection).length;
    const benignCount = votes.length - injectionCount;
    const majorityCount = Math.max(injectionCount, benignCount);
    const agreement = votes.length > 0 ? majorityCount / votes.length : 0;

    const confidence = Math.round(normalizedScore * 1000) / 1000;

    // Update stats
    if (isInjection) {
      this._stats.injections++;
    } else {
      this._stats.safe++;
    }
    this._stats.voterAgreementSum += agreement;
    this._stats.confidenceSum += confidence;

    return this._buildResult(isInjection, confidence, votes, agreement);
  }

  /**
   * Resolve the weight applied to a voter's vote.
   * A configured weight of 0 is kept as 0; anything that is not a finite,
   * non-negative number falls back to the default weight of 1.
   * @param {string} voterName
   * @returns {number}
   * @private
   */
  _weightFor(voterName) {
    const weight = this.weights[voterName];
    return Number.isFinite(weight) && weight >= 0 ? weight : 1;
  }

  /**
   * Build a standardized result object.
   * Severity is derived from the confidence for injections and from 0 otherwise.
   * @param {boolean} isInjection
   * @param {number} confidence
   * @param {Array} votes
   * @param {number} [agreement=0]
   * @returns {object}
   * @private
   */
  _buildResult(isInjection, confidence, votes, agreement = 0) {
    return {
      isInjection,
      confidence,
      severity: confidenceToSeverity(isInjection ? confidence : 0),
      votes,
      agreement: Math.round(agreement * 1000) / 1000,
      method: 'ensemble',
      timestamp: new Date().toISOString()
    };
  }

  /**
   * Get stats about ensemble performance.
   * Averages are taken over all scans, including scans that ended early
   * because too few voters responded (those add 0 to both sums).
   * @returns {{
   *   totalScans: number,
   *   injections: number,
   *   safe: number,
   *   averageAgreement: number,
   *   averageConfidence: number,
   *   voterCount: number,
   *   voters: string[]
   * }}
   */
  getStats() {
    // Avoid dividing by zero before the first scan.
    const total = this._stats.totalScans || 1;
    return {
      totalScans: this._stats.totalScans,
      injections: this._stats.injections,
      safe: this._stats.safe,
      averageAgreement: Math.round((this._stats.voterAgreementSum / total) * 1000) / 1000,
      averageConfidence: Math.round((this._stats.confidenceSum / total) * 1000) / 1000,
      voterCount: this._voters.length,
      voters: this._voters.map(v => v.name)
    };
  }
}
511
+
512
+ // =========================================================================
513
+ // EXPORTS
514
+ // =========================================================================
515
+
516
+ // Public API of this module: the classifier, the four voter classes, and the default voter-name list.
+ module.exports = {
517
+ EnsembleClassifier,
518
+ PatternVoter,
519
+ TFIDFVoter,
520
+ EntropyVoter,
521
+ IPIAVoter,
522
+ VOTER_NAMES,
523
+ };
package/src/main.js CHANGED
@@ -112,6 +112,24 @@ const { Allowlist, ConfidenceCalibrator, FeedbackLoop, ScanCache } = safeRequire
112
112
  // Presets & Config Builder
113
113
  const { PRESETS, ConfigBuilder, SnippetGenerator, getPresets, getPreset } = safeRequire('./presets', 'presets');
114
114
 
115
+ // v8.0 — Smart Config
116
+ const { ShieldBuilder, createShield, validateConfig, describeConfig, FEATURE_DEFAULTS, VALID_PRESETS } = safeRequire('./smart-config', 'smart-config');
117
+
118
+ // v8.0 — Ensemble Detection
119
+ const { EnsembleClassifier, PatternVoter, TFIDFVoter, EntropyVoter, IPIAVoter, VOTER_NAMES } = safeRequire('./ensemble', 'ensemble');
120
+
121
+ // v8.0 — Agent Intent & Goal Drift
122
+ const { AgentIntent, GoalDriftDetector, ToolSequenceModeler } = safeRequire('./agent-intent', 'agent-intent');
123
+
124
+ // v8.0 — Persistent Learning & Feedback
125
+ const { PersistentLearningLoop, FeedbackCollector } = safeRequire('./persistent-learning', 'persistent-learning');
126
+
127
+ // v8.0 — Cross-Turn & Adaptive Thresholds
128
+ const { CrossTurnTracker, AdaptiveThresholdCalibrator } = safeRequire('./cross-turn', 'cross-turn');
129
+
130
+ // v8.0 — Adversarial Self-Training
131
+ const { SelfTrainer, MutationEngine: SelfTrainingMutationEngine, SEED_ATTACKS, MUTATION_STRATEGIES } = safeRequire('./self-training', 'self-training');
132
+
115
133
  // Advanced Scanners
116
134
  const { RAGScanner, RAG_INJECTION_PATTERNS, PromptLinter, LINT_RULES, ToolSchemaValidator, DANGEROUS_TOOL_PATTERNS } = safeRequire('./scanners', 'scanners');
117
135
 
@@ -896,6 +914,41 @@ const _exports = {
896
914
  NORMALIZER_HOMOGLYPH_MAP,
897
915
  LEET_MAP,
898
916
  DEFAULT_LAYERS,
917
+
918
+ // v8.0 — Smart Config
919
+ ShieldBuilder,
920
+ createShield,
921
+ validateConfig,
922
+ describeConfig,
923
+ FEATURE_DEFAULTS,
924
+ VALID_PRESETS,
925
+
926
+ // v8.0 — Ensemble Detection
927
+ EnsembleClassifier,
928
+ PatternVoter,
929
+ TFIDFVoter,
930
+ EntropyVoter,
931
+ IPIAVoter,
932
+ VOTER_NAMES,
933
+
934
+ // v8.0 — Agent Intent & Goal Drift
935
+ AgentIntent,
936
+ GoalDriftDetector,
937
+ ToolSequenceModeler,
938
+
939
+ // v8.0 — Persistent Learning & Feedback
940
+ PersistentLearningLoop,
941
+ FeedbackCollector,
942
+
943
+ // v8.0 — Cross-Turn & Adaptive Thresholds
944
+ CrossTurnTracker,
945
+ AdaptiveThresholdCalibrator,
946
+
947
+ // v8.0 — Adversarial Self-Training
948
+ SelfTrainer,
949
+ SelfTrainingMutationEngine,
950
+ SEED_ATTACKS,
951
+ MUTATION_STRATEGIES,
899
952
  };
900
953
 
901
954
  // Filter out undefined exports (from modules that failed to load)