agentshield-sdk 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +191 -0
- package/LICENSE +21 -0
- package/README.md +975 -0
- package/bin/agent-shield.js +680 -0
- package/package.json +118 -0
- package/src/adaptive.js +330 -0
- package/src/agent-protocol.js +998 -0
- package/src/alert-tuning.js +480 -0
- package/src/allowlist.js +603 -0
- package/src/audit-immutable.js +914 -0
- package/src/audit-streaming.js +469 -0
- package/src/badges.js +196 -0
- package/src/behavior-profiling.js +289 -0
- package/src/benchmark-harness.js +804 -0
- package/src/canary.js +271 -0
- package/src/certification.js +563 -0
- package/src/circuit-breaker.js +321 -0
- package/src/compliance.js +617 -0
- package/src/confidence-tuning.js +324 -0
- package/src/confused-deputy.js +624 -0
- package/src/context-scoring.js +360 -0
- package/src/conversation.js +494 -0
- package/src/cost-optimizer.js +1024 -0
- package/src/ctf.js +462 -0
- package/src/detector-core.js +1999 -0
- package/src/distributed.js +359 -0
- package/src/document-scanner.js +795 -0
- package/src/embedding.js +307 -0
- package/src/encoding.js +429 -0
- package/src/enterprise.js +405 -0
- package/src/errors.js +100 -0
- package/src/eu-ai-act.js +523 -0
- package/src/fuzzer.js +764 -0
- package/src/honeypot.js +328 -0
- package/src/i18n-patterns.js +523 -0
- package/src/index.js +430 -0
- package/src/integrations.js +528 -0
- package/src/llm-redteam.js +670 -0
- package/src/main.js +741 -0
- package/src/main.mjs +38 -0
- package/src/mcp-bridge.js +542 -0
- package/src/mcp-certification.js +846 -0
- package/src/mcp-sdk-integration.js +355 -0
- package/src/mcp-security-runtime.js +741 -0
- package/src/mcp-server.js +740 -0
- package/src/middleware.js +208 -0
- package/src/model-finetuning.js +884 -0
- package/src/model-fingerprint.js +1042 -0
- package/src/multi-agent-trust.js +453 -0
- package/src/multi-agent.js +404 -0
- package/src/multimodal.js +296 -0
- package/src/nist-mapping.js +505 -0
- package/src/observability.js +330 -0
- package/src/openclaw.js +450 -0
- package/src/otel.js +544 -0
- package/src/owasp-2025.js +483 -0
- package/src/pii.js +390 -0
- package/src/plugin-marketplace.js +628 -0
- package/src/plugin-system.js +349 -0
- package/src/policy-dsl.js +775 -0
- package/src/policy-extended.js +635 -0
- package/src/policy.js +443 -0
- package/src/presets.js +409 -0
- package/src/production.js +557 -0
- package/src/prompt-leakage.js +321 -0
- package/src/rag-vulnerability.js +579 -0
- package/src/redteam.js +475 -0
- package/src/response-handler.js +429 -0
- package/src/scanners.js +357 -0
- package/src/self-healing.js +363 -0
- package/src/semantic.js +339 -0
- package/src/shield-score.js +250 -0
- package/src/sso-saml.js +897 -0
- package/src/stream-scanner.js +806 -0
- package/src/testing.js +505 -0
- package/src/threat-encyclopedia.js +629 -0
- package/src/threat-intel-network.js +1017 -0
- package/src/token-analysis.js +467 -0
- package/src/tool-guard.js +412 -0
- package/src/tool-output-validator.js +354 -0
- package/src/utils.js +83 -0
- package/src/watermark.js +235 -0
- package/src/worker-scanner.js +601 -0
- package/types/index.d.ts +2088 -0
|
@@ -0,0 +1,804 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Shield — Standardized Benchmark Harness
|
|
5
|
+
*
|
|
6
|
+
* Provides a reproducible framework for evaluating detection engines.
|
|
7
|
+
* Supports dataset loading (JSON, BIPIA, Garak formats), metric computation
|
|
8
|
+
* (precision, recall, F1, MCC, per-category breakdowns), regression tracking,
|
|
9
|
+
* and multi-engine comparison.
|
|
10
|
+
*
|
|
11
|
+
* All processing is local — no data leaves the environment.
|
|
12
|
+
*
|
|
13
|
+
* @module benchmark-harness
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
|
|
19
|
+
// =========================================================================
|
|
20
|
+
// CONSTANTS
|
|
21
|
+
// =========================================================================
|
|
22
|
+
|
|
23
|
+
/**
 * Required fields for each dataset entry.
 * Frozen so the shared validation list cannot be altered at runtime.
 */
const REQUIRED_ENTRY_FIELDS = Object.freeze(['id', 'text', 'category', 'expected_detection', 'severity', 'difficulty']);

/** Valid severity levels (frozen, see above). */
const VALID_SEVERITIES = Object.freeze(['critical', 'high', 'medium', 'low']);

/** Valid difficulty levels (frozen, see above). */
const VALID_DIFFICULTIES = Object.freeze(['easy', 'medium', 'hard']);

/** Default F1 regression threshold (absolute drop). */
const DEFAULT_F1_REGRESSION_THRESHOLD = 0.02;

/** Default latency regression threshold (relative increase). */
const DEFAULT_LATENCY_REGRESSION_THRESHOLD = 0.20;
|
|
37
|
+
|
|
38
|
+
// =========================================================================
|
|
39
|
+
// DatasetLoader
|
|
40
|
+
// =========================================================================
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Validates and loads benchmark datasets from various formats.
|
|
44
|
+
*/
|
|
45
|
+
class DatasetLoader {
  /**
   * Load a dataset from a JSON file.
   *
   * The parsed document may be a bare array of entries, or an object that
   * carries the entries under `entries` or `dataset` plus an optional `meta`
   * object. Any other JSON root (including `null`) is treated as an empty
   * dataset and rejected by validate() with a descriptive error.
   *
   * @param {string} filePath — absolute or relative path to the JSON dataset
   * @returns {{ entries: Array<Object>, meta: Object }}
   * @throws {Error} if the file is missing, is not valid JSON, or fails validation
   */
  load(filePath) {
    const resolved = path.resolve(filePath);
    if (!fs.existsSync(resolved)) {
      throw new Error(`[Agent Shield] Dataset file not found: ${resolved}`);
    }

    const raw = fs.readFileSync(resolved, 'utf-8');
    let data;
    try {
      data = JSON.parse(raw);
    } catch (err) {
      throw new Error(`[Agent Shield] Invalid JSON in dataset: ${err.message}`);
    }

    let entries = [];
    let meta = {};
    if (Array.isArray(data)) {
      entries = data;
    } else if (data !== null && typeof data === 'object') {
      entries = data.entries || data.dataset || [];
      meta = data.meta || {};
    }
    // A `null` root used to crash on `data.entries`; it now falls through to
    // validate(), which reports the empty dataset.

    this.validate(entries);
    return { entries, meta };
  }

  /**
   * Validate the structure of dataset entries.
   *
   * Checks, in order: the container is a non-empty array, every entry is an
   * object, every required field is present, `expected_detection` is a
   * boolean, and `severity` / `difficulty` hold one of the allowed values.
   * Validation stops at the first problem found.
   *
   * @param {Array<Object>} entries
   * @throws {Error} if any entry is missing required fields or has invalid values
   */
  validate(entries) {
    if (!Array.isArray(entries) || entries.length === 0) {
      throw new Error('[Agent Shield] Dataset must be a non-empty array of entries');
    }

    entries.forEach((entry, i) => {
      // The `in` operator below throws a raw TypeError on null/primitives,
      // so reject those up front with a message that names the entry.
      if (entry === null || typeof entry !== 'object') {
        throw new Error(`[Agent Shield] Entry ${i} must be an object`);
      }

      const missing = REQUIRED_ENTRY_FIELDS.find((field) => !(field in entry));
      if (missing !== undefined) {
        throw new Error(`[Agent Shield] Entry ${i} missing required field: "${missing}"`);
      }
      if (typeof entry.expected_detection !== 'boolean') {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): expected_detection must be boolean`);
      }
      if (!VALID_SEVERITIES.includes(entry.severity)) {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): invalid severity "${entry.severity}"`);
      }
      if (!VALID_DIFFICULTIES.includes(entry.difficulty)) {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): invalid difficulty "${entry.difficulty}"`);
      }
    });
  }

  /**
   * Convert BIPIA-format entries to the standard format.
   * BIPIA entries have: { prompt, label, attack_type }
   *
   * An entry counts as an attack when `label` is 'attack', 1 or true.
   * Missing fields fall back to: id `bipia-<index>`, empty text, category
   * 'prompt_injection', severity 'high', difficulty 'medium'.
   *
   * @param {Array<Object>} entries
   * @returns {Array<Object>}
   * @throws {Error} if entries is not an array
   */
  fromBIPIA(entries) {
    if (!Array.isArray(entries)) {
      throw new Error('[Agent Shield] BIPIA entries must be an array');
    }
    return entries.map((e, i) => ({
      id: e.id || `bipia-${i}`,
      text: e.prompt || e.text || '',
      category: e.attack_type || 'prompt_injection',
      expected_detection: e.label === 'attack' || e.label === 1 || e.label === true,
      severity: e.severity || 'high',
      difficulty: e.difficulty || 'medium'
    }));
  }

  /**
   * Convert Garak-format entries to the standard format.
   * Garak entries have: { prompt, detector_results, probe_name }
   *
   * An entry counts as an attack when `detector_results` is 'fail' or
   * `expected_detection` is already true. Missing fields fall back to:
   * id `garak-<index>`, empty text, category 'unknown', severity 'medium',
   * difficulty 'medium'.
   *
   * @param {Array<Object>} entries
   * @returns {Array<Object>}
   * @throws {Error} if entries is not an array
   */
  fromGarak(entries) {
    if (!Array.isArray(entries)) {
      throw new Error('[Agent Shield] Garak entries must be an array');
    }
    return entries.map((e, i) => ({
      id: e.id || `garak-${i}`,
      text: e.prompt || e.text || '',
      category: e.probe_name || 'unknown',
      expected_detection: e.detector_results === 'fail' || e.expected_detection === true,
      severity: e.severity || 'medium',
      difficulty: e.difficulty || 'medium'
    }));
  }
}
|
|
140
|
+
|
|
141
|
+
// =========================================================================
|
|
142
|
+
// BenchmarkMetrics
|
|
143
|
+
// =========================================================================
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Computes classification metrics from benchmark results.
|
|
147
|
+
*/
|
|
148
|
+
class BenchmarkMetrics {
  /**
   * Compute comprehensive metrics from benchmark results.
   *
   * Every result is classified as tp / fp / tn / fn from the truthiness of
   * `expected` and `detected`, then tallied overall, per category and per
   * difficulty. Results whose difficulty is not easy/medium/hard still count
   * towards the overall and per-category numbers but have no difficulty bucket.
   *
   * @param {Array<Object>} results — array of { entry, detected, expected, latencyMs }
   * @returns {Object} metrics object with total, precision, recall, f1,
   *   accuracy, mcc, confusionMatrix, perCategory, perDifficulty and latency
   */
  compute(results) {
    const newCounts = () => ({ tp: 0, fp: 0, tn: 0, fn: 0 });
    const overall = newCounts();
    // Maps instead of plain objects: a category or difficulty named
    // "constructor" or "__proto__" must not resolve to an inherited member.
    const byCategory = new Map();
    const byDifficulty = new Map([
      ['easy', newCounts()],
      ['medium', newCounts()],
      ['hard', newCounts()]
    ]);
    const latencies = [];

    for (const r of results) {
      const outcome = r.detected
        ? (r.expected ? 'tp' : 'fp')
        : (r.expected ? 'fn' : 'tn');

      const cat = String(r.entry.category);
      if (!byCategory.has(cat)) {
        byCategory.set(cat, newCounts());
      }

      overall[outcome]++;
      byCategory.get(cat)[outcome]++;

      const difficultyCounts = byDifficulty.get(String(r.entry.difficulty));
      if (difficultyCounts) {
        difficultyCounts[outcome]++;
      }

      // NaN / Infinity would poison the sort and the mean, so skip them.
      if (Number.isFinite(r.latencyMs)) {
        latencies.push(r.latencyMs);
      }
    }

    const { tp, fp, tn, fn } = overall;
    const precision = (tp + fp) > 0 ? tp / (tp + fp) : 0;
    const recall = (tp + fn) > 0 ? tp / (tp + fn) : 0;
    const f1 = (precision + recall) > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    const accuracy = results.length > 0 ? (tp + tn) / results.length : 0;
    const mcc = this._computeMCC(tp, tn, fp, fn);
    const confusionMatrix = { tp, fp, tn, fn };

    // Per-category metrics. defineProperty creates an own property even for
    // a key such as "__proto__", which plain assignment would not.
    const perCategoryMetrics = {};
    for (const [cat, counts] of byCategory) {
      Object.defineProperty(perCategoryMetrics, cat, {
        value: this._metricsFromCM(counts),
        enumerable: true,
        writable: true,
        configurable: true
      });
    }

    // Per-difficulty metrics (only buckets that received at least one result).
    const perDifficultyMetrics = {};
    for (const [level, counts] of byDifficulty) {
      if (counts.tp + counts.fp + counts.tn + counts.fn > 0) {
        perDifficultyMetrics[level] = this._metricsFromCM(counts);
      }
    }

    return {
      total: results.length,
      precision,
      recall,
      f1,
      accuracy,
      mcc,
      confusionMatrix,
      perCategory: perCategoryMetrics,
      perDifficulty: perDifficultyMetrics,
      latency: this._computeLatencyStats(latencies)
    };
  }

  /**
   * Compute Matthews Correlation Coefficient.
   * MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
   * Returns 0 when the denominator is 0 (any marginal total is empty).
   * @param {number} tp
   * @param {number} tn
   * @param {number} fp
   * @param {number} fn
   * @returns {number}
   * @private
   */
  _computeMCC(tp, tn, fp, fn) {
    const denominator = Math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
    if (denominator === 0) return 0;
    return ((tp * tn) - (fp * fn)) / denominator;
  }

  /**
   * Derive precision/recall/f1 from a confusion matrix bucket.
   * The raw counts are copied into the returned object next to the ratios.
   * @param {{ tp: number, fp: number, tn: number, fn: number }} cm
   * @returns {{ precision: number, recall: number, f1: number, total: number }}
   * @private
   */
  _metricsFromCM(cm) {
    const predictedPositive = cm.tp + cm.fp;
    const actualPositive = cm.tp + cm.fn;
    const precision = predictedPositive > 0 ? cm.tp / predictedPositive : 0;
    const recall = actualPositive > 0 ? cm.tp / actualPositive : 0;
    const f1 = (precision + recall) > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    return { precision, recall, f1, total: cm.tp + cm.fp + cm.tn + cm.fn, ...cm };
  }

  /**
   * Compute latency statistics from an array of timings.
   *
   * All values are rounded to two decimals. The median of an even-sized
   * sample is the mean of the two middle values. p95 / p99 pick the element
   * at index floor(n * 0.95) / floor(n * 0.99) of the sorted sample.
   * Non-finite timings are ignored; an empty sample yields all zeros.
   *
   * @param {number[]} latencies
   * @returns {{ mean: number, median: number, p95: number, p99: number, min: number, max: number }}
   * @private
   */
  _computeLatencyStats(latencies) {
    const sorted = latencies.filter(Number.isFinite).sort((a, b) => a - b);
    const count = sorted.length;
    if (count === 0) {
      return { mean: 0, median: 0, p95: 0, p99: 0, min: 0, max: 0 };
    }

    const round2 = (value) => Math.round(value * 100) / 100;
    const mean = sorted.reduce((acc, value) => acc + value, 0) / count;
    const mid = Math.floor(count / 2);
    const median = count % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;

    return {
      mean: round2(mean),
      median: round2(median),
      p95: round2(sorted[Math.floor(count * 0.95)]),
      p99: round2(sorted[Math.floor(count * 0.99)]),
      min: round2(sorted[0]),
      max: round2(sorted[count - 1])
    };
  }
}
|
|
290
|
+
|
|
291
|
+
// =========================================================================
|
|
292
|
+
// RegressionTracker
|
|
293
|
+
// =========================================================================
|
|
294
|
+
|
|
295
|
+
/**
|
|
296
|
+
* Tracks performance baselines and detects regressions between runs.
|
|
297
|
+
*/
|
|
298
|
+
class RegressionTracker {
  /**
   * @param {Object} [options]
   * @param {number} [options.f1Threshold=0.02] — F1 drop threshold to flag regression
   * @param {number} [options.latencyThreshold=0.20] — latency increase ratio threshold
   */
  constructor(options = {}) {
    const { f1Threshold, latencyThreshold } = options;
    // An explicit 0 ("flag any drop / any slowdown") is a valid setting, so
    // only fall back to the default when no usable number was supplied.
    // The previous `value || DEFAULT` silently replaced 0 with the default.
    this.f1Threshold = typeof f1Threshold === 'number' && !Number.isNaN(f1Threshold)
      ? f1Threshold
      : DEFAULT_F1_REGRESSION_THRESHOLD;
    this.latencyThreshold = typeof latencyThreshold === 'number' && !Number.isNaN(latencyThreshold)
      ? latencyThreshold
      : DEFAULT_LATENCY_REGRESSION_THRESHOLD;
  }

  /**
   * Save a baseline to a JSON file.
   * Missing parent directories are created. The file contains
   * { timestamp, metrics } where timestamp is the current time in ISO format.
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {string} filePath — path to write the baseline
   */
  saveBaseline(metrics, filePath) {
    const resolved = path.resolve(filePath);
    const dir = path.dirname(resolved);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
    const baseline = {
      timestamp: new Date().toISOString(),
      metrics
    };
    fs.writeFileSync(resolved, JSON.stringify(baseline, null, 2), 'utf-8');
  }

  /**
   * Load a baseline from a JSON file.
   * @param {string} filePath
   * @returns {Object} baseline object with { timestamp, metrics }
   * @throws {Error} if the file does not exist
   * @throws {SyntaxError} if the file content is not valid JSON
   */
  loadBaseline(filePath) {
    const resolved = path.resolve(filePath);
    if (!fs.existsSync(resolved)) {
      throw new Error(`[Agent Shield] Baseline file not found: ${resolved}`);
    }
    return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
  }

  /**
   * Compare current metrics against a baseline and flag regressions.
   *
   * Checks run in this order: F1, precision, recall, MCC (only when the
   * baseline has a numeric MCC), mean latency (only when the baseline mean is
   * positive), then F1 for every baseline category that also exists in the
   * current run. All score checks use `f1Threshold` as an absolute drop; the
   * latency check uses `latencyThreshold` as a relative increase. Only F1 is
   * reported as an improvement. A score missing on either side yields a NaN
   * delta and is neither flagged nor reported.
   *
   * @param {Object} current — current metrics from BenchmarkMetrics.compute()
   * @param {Object} baseline — baseline object from loadBaseline() or { metrics }
   * @returns {{ passed: boolean, regressions: Array<Object>, improvements: Array<Object> }}
   */
  compare(current, baseline) {
    const baseMetrics = baseline.metrics || baseline;
    const regressions = [];
    const improvements = [];
    const record = (finding) => {
      if (finding) regressions.push(finding);
    };

    // F1 score check (the only metric that also reports improvements)
    const f1Delta = current.f1 - baseMetrics.f1;
    record(this._scoreDrop('f1', 'F1', baseMetrics.f1, current.f1, ` (threshold: ${this.f1Threshold})`));
    if (f1Delta > this.f1Threshold) {
      improvements.push({
        metric: 'f1',
        baseline: baseMetrics.f1,
        current: current.f1,
        delta: f1Delta,
        message: `F1 improved by ${f1Delta.toFixed(4)}`
      });
    }

    // Precision and recall checks
    record(this._scoreDrop('precision', 'Precision', baseMetrics.precision, current.precision));
    record(this._scoreDrop('recall', 'Recall', baseMetrics.recall, current.recall));

    // MCC check — skipped when the baseline does not carry a numeric MCC
    if (typeof baseMetrics.mcc === 'number') {
      record(this._scoreDrop('mcc', 'MCC', baseMetrics.mcc, current.mcc));
    }

    // Latency check (mean), expressed as a relative increase over the baseline
    if (baseMetrics.latency && baseMetrics.latency.mean > 0 && current.latency) {
      const latencyRatio = (current.latency.mean - baseMetrics.latency.mean) / baseMetrics.latency.mean;
      if (latencyRatio > this.latencyThreshold) {
        regressions.push({
          metric: 'latency_mean',
          baseline: baseMetrics.latency.mean,
          current: current.latency.mean,
          delta: latencyRatio,
          message: `Mean latency increased by ${(latencyRatio * 100).toFixed(1)}% (threshold: ${(this.latencyThreshold * 100).toFixed(0)}%)`
        });
      }
    }

    // Per-category regression check (categories absent from the current run are skipped)
    if (baseMetrics.perCategory && current.perCategory) {
      for (const [cat, baseStats] of Object.entries(baseMetrics.perCategory)) {
        const currentStats = current.perCategory[cat];
        if (currentStats) {
          record(this._scoreDrop(`category:${cat}:f1`, `Category "${cat}" F1`, baseStats.f1, currentStats.f1));
        }
      }
    }

    return {
      passed: regressions.length === 0,
      regressions,
      improvements
    };
  }

  /**
   * Build a regression record when a score fell by more than `f1Threshold`.
   * @param {string} metric — identifier stored in the record
   * @param {string} label — human-readable name used in the message
   * @param {number} baseValue — score in the baseline
   * @param {number} currentValue — score in the current run
   * @param {string} [note=''] — text appended to the message
   * @returns {Object|null} the regression record, or null when within tolerance
   * @private
   */
  _scoreDrop(metric, label, baseValue, currentValue, note = '') {
    const delta = currentValue - baseValue;
    if (!(delta < -this.f1Threshold)) {
      return null;
    }
    return {
      metric,
      baseline: baseValue,
      current: currentValue,
      delta,
      message: `${label} dropped by ${Math.abs(delta).toFixed(4)}${note}`
    };
  }
}
|
|
449
|
+
|
|
450
|
+
// =========================================================================
|
|
451
|
+
// BenchmarkReportGenerator
|
|
452
|
+
// =========================================================================
|
|
453
|
+
|
|
454
|
+
/**
|
|
455
|
+
* Formats benchmark results into text, JSON, and markdown reports.
|
|
456
|
+
*/
|
|
457
|
+
class BenchmarkReportGenerator {
  /**
   * Generate a plain text report.
   *
   * Sections: headline metrics, confusion matrix, per-category table,
   * per-difficulty list and latency. The breakdown sections are omitted when
   * empty or absent; the latency section is omitted unless the mean is positive.
   *
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {Object} [options]
   * @param {string} [options.title='Benchmark Report']
   * @returns {string}
   */
  text(metrics, options = {}) {
    const title = options.title || 'Benchmark Report';
    const sep = '='.repeat(60);
    const cm = metrics.confusionMatrix;
    const categories = Object.entries(metrics.perCategory || {});
    const difficulties = Object.entries(metrics.perDifficulty || {});
    const pct = (value, digits) => (value * 100).toFixed(digits);

    const lines = [
      sep,
      ` ${title}`,
      sep,
      '',
      ` Total entries: ${metrics.total}`,
      ` Precision: ${pct(metrics.precision, 2)}%`,
      ` Recall: ${pct(metrics.recall, 2)}%`,
      ` F1 Score: ${pct(metrics.f1, 2)}%`,
      ` Accuracy: ${pct(metrics.accuracy, 2)}%`,
      ` MCC: ${metrics.mcc.toFixed(4)}`,
      '',
      ' Confusion Matrix:',
      ` TP: ${cm.tp} FP: ${cm.fp}`,
      ` FN: ${cm.fn} TN: ${cm.tn}`,
      ''
    ];

    // Per-category breakdown
    if (categories.length > 0) {
      lines.push(' Per-Category Breakdown:');
      lines.push(` ${'Category'.padEnd(25)} ${'Prec'.padStart(7)} ${'Rec'.padStart(7)} ${'F1'.padStart(7)} ${'N'.padStart(5)}`);
      lines.push(' ' + '-'.repeat(51));
      for (const [cat, m] of categories) {
        lines.push(` ${cat.padEnd(25)} ${pct(m.precision, 1).padStart(6)}% ${pct(m.recall, 1).padStart(6)}% ${pct(m.f1, 1).padStart(6)}% ${String(m.total).padStart(5)}`);
      }
      lines.push('');
    }

    // Per-difficulty breakdown
    if (difficulties.length > 0) {
      lines.push(' Per-Difficulty Breakdown:');
      for (const [diff, m] of difficulties) {
        lines.push(` ${diff.padEnd(10)} F1: ${pct(m.f1, 1)}% (n=${m.total})`);
      }
      lines.push('');
    }

    // Latency
    if (metrics.latency && metrics.latency.mean > 0) {
      lines.push(
        ' Latency (ms):',
        ` Mean: ${metrics.latency.mean}`,
        ` Median: ${metrics.latency.median}`,
        ` P95: ${metrics.latency.p95}`,
        ` P99: ${metrics.latency.p99}`,
        ` Min: ${metrics.latency.min}`,
        ` Max: ${metrics.latency.max}`,
        ''
      );
    }

    lines.push(sep);
    return lines.join('\n');
  }

  /**
   * Generate a JSON report (the metrics object, pretty-printed with 2 spaces).
   * @param {Object} metrics
   * @returns {string}
   */
  json(metrics) {
    return JSON.stringify(metrics, null, 2);
  }

  /**
   * Generate a markdown table report.
   *
   * Sections: overall metrics, confusion matrix, per-category table and
   * latency. Category names are escaped so that a `|` inside a name cannot
   * split the table row.
   *
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {Object} [options]
   * @param {string} [options.title='Benchmark Report']
   * @returns {string}
   */
  markdown(metrics, options = {}) {
    const title = options.title || 'Benchmark Report';
    const cm = metrics.confusionMatrix;
    const categories = Object.entries(metrics.perCategory || {});
    const pct = (value, digits) => (value * 100).toFixed(digits);
    // A raw pipe would be read as a column delimiter inside a table cell.
    const cell = (value) => String(value).replace(/\|/g, '\\|');

    const lines = [
      `# ${title}`,
      '',
      '## Overall Metrics',
      '',
      '| Metric | Value |',
      '|--------|-------|',
      `| Total | ${metrics.total} |`,
      `| Precision | ${pct(metrics.precision, 2)}% |`,
      `| Recall | ${pct(metrics.recall, 2)}% |`,
      `| F1 Score | ${pct(metrics.f1, 2)}% |`,
      `| Accuracy | ${pct(metrics.accuracy, 2)}% |`,
      `| MCC | ${metrics.mcc.toFixed(4)} |`,
      '',
      '## Confusion Matrix',
      '',
      '| | Predicted Positive | Predicted Negative |',
      '|---|---|---|',
      `| **Actual Positive** | TP: ${cm.tp} | FN: ${cm.fn} |`,
      `| **Actual Negative** | FP: ${cm.fp} | TN: ${cm.tn} |`,
      ''
    ];

    // Per-category
    if (categories.length > 0) {
      lines.push('## Per-Category Breakdown');
      lines.push('');
      lines.push('| Category | Precision | Recall | F1 | N |');
      lines.push('|----------|-----------|--------|-----|---|');
      for (const [cat, m] of categories) {
        lines.push(`| ${cell(cat)} | ${pct(m.precision, 1)}% | ${pct(m.recall, 1)}% | ${pct(m.f1, 1)}% | ${m.total} |`);
      }
      lines.push('');
    }

    // Latency
    if (metrics.latency && metrics.latency.mean > 0) {
      lines.push(
        '## Latency',
        '',
        '| Stat | ms |',
        '|------|-----|',
        `| Mean | ${metrics.latency.mean} |`,
        `| Median | ${metrics.latency.median} |`,
        `| P95 | ${metrics.latency.p95} |`,
        `| P99 | ${metrics.latency.p99} |`,
        ''
      );
    }

    return lines.join('\n');
  }

  /**
   * Generate a comparison table for multiple engines.
   * One row per engine; latency shows 'N/A' unless the mean is positive.
   * @param {Object} comparison — output from BenchmarkHarness.compare()
   * @returns {string}
   */
  comparisonText(comparison) {
    const sep = '='.repeat(70);
    const pct = (value) => (value * 100).toFixed(1).padStart(6);

    const rows = Object.entries(comparison).map(([name, m]) => {
      const lat = m.latency && m.latency.mean > 0 ? m.latency.mean.toFixed(1) : 'N/A';
      return ` ${name.padEnd(20)} ${pct(m.precision)}% ${pct(m.recall)}% ${pct(m.f1)}% ${m.mcc.toFixed(3).padStart(7)} ${lat.padStart(8)}`;
    });

    return [
      sep,
      ' Engine Comparison',
      sep,
      '',
      ` ${'Engine'.padEnd(20)} ${'Prec'.padStart(7)} ${'Rec'.padStart(7)} ${'F1'.padStart(7)} ${'MCC'.padStart(7)} ${'Lat(ms)'.padStart(8)}`,
      ' ' + '-'.repeat(56),
      ...rows,
      '',
      sep
    ].join('\n');
  }
}
|
|
623
|
+
|
|
624
|
+
// =========================================================================
|
|
625
|
+
// BenchmarkHarness
|
|
626
|
+
// =========================================================================
|
|
627
|
+
|
|
628
|
+
/**
|
|
629
|
+
* Main benchmark harness for evaluating detection engines.
|
|
630
|
+
*
|
|
631
|
+
* @example
|
|
632
|
+
* const { BenchmarkHarness } = require('./benchmark-harness');
|
|
633
|
+
* const { scanText } = require('./detector-core');
|
|
634
|
+
*
|
|
635
|
+
* const harness = new BenchmarkHarness();
|
|
636
|
+
* harness.loadDataset('datasets/attack-corpus.json');
|
|
637
|
+
* const results = harness.run((text) => {
|
|
638
|
+
* const result = scanText(text);
|
|
639
|
+
* return result.threats.length > 0;
|
|
640
|
+
* });
|
|
641
|
+
* console.log(harness.formatReport(results));
|
|
642
|
+
*/
|
|
643
|
+
class BenchmarkHarness {
|
|
644
|
+
/**
|
|
645
|
+
* @param {Object} [options]
|
|
646
|
+
* @param {number} [options.warmupRuns=1] — number of warmup iterations (not measured)
|
|
647
|
+
*/
|
|
648
|
+
constructor(options = {}) {
|
|
649
|
+
/** @private */
|
|
650
|
+
this._entries = [];
|
|
651
|
+
/** @private */
|
|
652
|
+
this._meta = {};
|
|
653
|
+
/** @private */
|
|
654
|
+
this._loader = new DatasetLoader();
|
|
655
|
+
/** @private */
|
|
656
|
+
this._metrics = new BenchmarkMetrics();
|
|
657
|
+
/** @private */
|
|
658
|
+
this._reporter = new BenchmarkReportGenerator();
|
|
659
|
+
/** @private */
|
|
660
|
+
this._warmupRuns = options.warmupRuns || 1;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
/**
|
|
664
|
+
* Load a benchmark dataset from a JSON file.
|
|
665
|
+
* @param {string} filePath — path to the dataset JSON
|
|
666
|
+
* @returns {BenchmarkHarness} this (for chaining)
|
|
667
|
+
*/
|
|
668
|
+
loadDataset(filePath) {
|
|
669
|
+
const { entries, meta } = this._loader.load(filePath);
|
|
670
|
+
this._entries = entries;
|
|
671
|
+
this._meta = meta;
|
|
672
|
+
return this;
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
/**
|
|
676
|
+
* Load entries directly (must conform to the standard schema).
|
|
677
|
+
* @param {Array<Object>} entries
|
|
678
|
+
* @returns {BenchmarkHarness} this (for chaining)
|
|
679
|
+
*/
|
|
680
|
+
loadEntries(entries) {
|
|
681
|
+
this._loader.validate(entries);
|
|
682
|
+
this._entries = entries;
|
|
683
|
+
return this;
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
/**
|
|
687
|
+
* Run a detector function against all loaded entries.
|
|
688
|
+
*
|
|
689
|
+
* The detector function receives the text and must return a boolean
|
|
690
|
+
* (true = threat detected, false = no threat).
|
|
691
|
+
*
|
|
692
|
+
* @param {function(string): boolean} detectorFn
|
|
693
|
+
* @returns {{ results: Array<Object>, metrics: Object }}
|
|
694
|
+
*/
|
|
695
|
+
run(detectorFn) {
|
|
696
|
+
if (this._entries.length === 0) {
|
|
697
|
+
throw new Error('[Agent Shield] No entries loaded. Call loadDataset() or loadEntries() first.');
|
|
698
|
+
}
|
|
699
|
+
if (typeof detectorFn !== 'function') {
|
|
700
|
+
throw new Error('[Agent Shield] detectorFn must be a function');
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
// Warmup runs (not measured)
|
|
704
|
+
for (let w = 0; w < this._warmupRuns; w++) {
|
|
705
|
+
for (const entry of this._entries) {
|
|
706
|
+
detectorFn(entry.text);
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
// Measured run
|
|
711
|
+
const results = [];
|
|
712
|
+
for (const entry of this._entries) {
|
|
713
|
+
const start = this._now();
|
|
714
|
+
let detected;
|
|
715
|
+
try {
|
|
716
|
+
detected = Boolean(detectorFn(entry.text));
|
|
717
|
+
} catch (err) {
|
|
718
|
+
detected = false;
|
|
719
|
+
}
|
|
720
|
+
const latencyMs = this._now() - start;
|
|
721
|
+
|
|
722
|
+
results.push({
|
|
723
|
+
entry,
|
|
724
|
+
detected,
|
|
725
|
+
expected: entry.expected_detection,
|
|
726
|
+
correct: detected === entry.expected_detection,
|
|
727
|
+
latencyMs
|
|
728
|
+
});
|
|
729
|
+
}
|
|
730
|
+
|
|
731
|
+
const metrics = this._metrics.compute(results);
|
|
732
|
+
return { results, metrics };
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
/**
|
|
736
|
+
* Compare multiple detector functions side-by-side.
|
|
737
|
+
*
|
|
738
|
+
* @param {Object<string, function(string): boolean>} detectors — name → detectorFn
|
|
739
|
+
* @returns {Object<string, Object>} name → metrics
|
|
740
|
+
*/
|
|
741
|
+
compare(detectors) {
|
|
742
|
+
if (!detectors || typeof detectors !== 'object') {
|
|
743
|
+
throw new Error('[Agent Shield] detectors must be an object mapping names to functions');
|
|
744
|
+
}
|
|
745
|
+
const comparison = {};
|
|
746
|
+
for (const [name, fn] of Object.entries(detectors)) {
|
|
747
|
+
const { metrics } = this.run(fn);
|
|
748
|
+
comparison[name] = metrics;
|
|
749
|
+
}
|
|
750
|
+
return comparison;
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
/**
|
|
754
|
+
* Generate a human-readable text report from run results.
|
|
755
|
+
* @param {{ results: Array, metrics: Object }} results — output from run()
|
|
756
|
+
* @returns {string}
|
|
757
|
+
*/
|
|
758
|
+
formatReport(results) {
|
|
759
|
+
return this._reporter.text(results.metrics, { title: 'Agent Shield Benchmark Report' });
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
/**
|
|
763
|
+
* Generate a comparison table from compare() output.
|
|
764
|
+
* @param {Object} comparison — output from compare()
|
|
765
|
+
* @returns {string}
|
|
766
|
+
*/
|
|
767
|
+
formatComparison(comparison) {
|
|
768
|
+
return this._reporter.comparisonText(comparison);
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
/**
|
|
772
|
+
* Generate a markdown report from run results.
|
|
773
|
+
* @param {{ results: Array, metrics: Object }} results — output from run()
|
|
774
|
+
* @returns {string}
|
|
775
|
+
*/
|
|
776
|
+
formatMarkdown(results) {
|
|
777
|
+
return this._reporter.markdown(results.metrics, { title: 'Agent Shield Benchmark Report' });
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
/**
|
|
781
|
+
* High-resolution timer.
|
|
782
|
+
* @returns {number}
|
|
783
|
+
* @private
|
|
784
|
+
*/
|
|
785
|
+
_now() {
|
|
786
|
+
if (typeof performance !== 'undefined' && performance.now) {
|
|
787
|
+
return performance.now();
|
|
788
|
+
}
|
|
789
|
+
const hr = process.hrtime();
|
|
790
|
+
return hr[0] * 1e3 + hr[1] / 1e6;
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
// =========================================================================
|
|
795
|
+
// EXPORTS
|
|
796
|
+
// =========================================================================
|
|
797
|
+
|
|
798
|
+
module.exports = {
|
|
799
|
+
BenchmarkHarness,
|
|
800
|
+
DatasetLoader,
|
|
801
|
+
BenchmarkMetrics,
|
|
802
|
+
RegressionTracker,
|
|
803
|
+
BenchmarkReportGenerator
|
|
804
|
+
};
|